From 5c8704c5bbc9f14005d27dd3e55010700ed3fba1 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Tue, 22 Jan 2013 15:54:42 +0530 Subject: [PATCH] Improve Deduplication throughput by 90%. Use SSE4 register as sliding window for default 16-byte window size. Use local variable for sliding window position to avoid spurious memory access in non-SIMD case. Avoid computing breakpoint check value if processed length < minimum block length. --- INSTALL | 4 +++ rabin/rabin_dedup.c | 76 ++++++++++++++++++++++++++++++++++++++------- rabin/rabin_dedup.h | 1 - utils/utils.c | 5 +++ 4 files changed, 74 insertions(+), 12 deletions(-) diff --git a/INSTALL b/INSTALL index d0465f0..b2dbc4c 100644 --- a/INSTALL +++ b/INSTALL @@ -8,6 +8,10 @@ The simplest process to build and install this utility is: make make install +The current makefiles and config scripts assume Gcc compiler is +in the PATH. Please update PATH before running config if that is +not the case. + In order to remove all binaries: make clean diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index b8dbe9b..16a0e51 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -71,6 +71,10 @@ #include #include "rabin_dedup.h" +#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__) && RAB_POLYNOMIAL_WIN_SIZE == 16 +# include +# define SSE_MODE 1 +#endif #define DELTA_EXTRA2_PCT(x) ((x) >> 1) #define DELTA_EXTRA_PCT(x) (((x) >> 1) + ((x) >> 3)) @@ -221,7 +225,11 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s destroy_dedupe_context(ctx); return (NULL); } +#ifndef SSE_MODE ctx->current_window_data = (uchar_t *)slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE); +#else + ctx->current_window_data = (uchar_t *)1; +#endif ctx->blocks = NULL; if (real_chunksize > 0) { ctx->blocks = (rabin_blockentry_t **)slab_calloc(NULL, @@ -258,8 +266,9 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s void reset_dedupe_context(dedupe_context_t *ctx) { +#ifndef SSE_MODE
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); - ctx->window_pos = 0; +#endif ctx->valid = 0; } @@ -268,7 +277,9 @@ destroy_dedupe_context(dedupe_context_t *ctx) { if (ctx) { uint32_t i; +#ifndef SSE_MODE if (ctx->current_window_data) slab_free(NULL, ctx->current_window_data); +#endif if (ctx->blocks) { for (i=0; iblknum && ctx->blocks[i] != NULL; i++) { slab_free(NULL, ctx->blocks[i]); @@ -290,7 +301,7 @@ uint32_t dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t offset, uint64_t *rabin_pos) { uint64_t i, last_offset, j, ary_sz; - uint32_t blknum; + uint32_t blknum, window_pos; uchar_t *buf1 = (uchar_t *)buf; uint32_t length; uint64_t cur_roll_checksum, cur_pos_checksum; @@ -304,6 +315,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of length = offset; last_offset = 0; blknum = 0; + window_pos = 0; ctx->valid = 0; cur_roll_checksum = 0; if (*size < ctx->rabin_poly_avg_block_size) return (0); @@ -346,7 +358,13 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of ary_sz = ctx->rabin_poly_max_block_size; ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz); } +#ifndef SSE_MODE memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); +#else + __m128i cur_sse_byte = _mm_setzero_si128(); + __m128i window = _mm_setzero_si128(); +#endif + j = *size - RAB_POLYNOMIAL_WIN_SIZE; /* * If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary @@ -358,16 +376,29 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of if (rabin_pos) { offset = *size - ctx->rabin_poly_max_block_size; length = 0; - for (i=offset; i<*size; i++) { + for (i=offset; icurrent_window_data[ctx->window_pos]; - ctx->current_window_data[ctx->window_pos] = cur_byte; +#ifdef SSE_MODE + uint32_t pushed_out = _mm_extract_epi32(window, 3); + pushed_out >>= 24; + asm ("movd %[cur_byte], %[cur_sse_byte]" + : [cur_sse_byte] "=x" 
(cur_sse_byte) + : [cur_byte] "r" (cur_byte) + ); + window = _mm_slli_si128(window, 1); + window = _mm_or_si128(window, cur_sse_byte); +#else + uint32_t pushed_out = ctx->current_window_data[window_pos]; + ctx->current_window_data[window_pos] = cur_byte; +#endif cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK; cur_roll_checksum += cur_byte; cur_roll_checksum -= out[pushed_out]; - ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1); +#ifndef SSE_MODE + window_pos = (window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1); +#endif ++length; if (length < ctx->rabin_poly_min_block_size) continue; @@ -385,22 +416,44 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of return (0); } - for (i=offset; i<*size; i++) { + for (i=offset; icurrent_window_data[ctx->window_pos]; - ctx->current_window_data[ctx->window_pos] = cur_byte; + uint32_t cur_byte = buf1[i]; + +#ifdef SSE_MODE + /* + * A 16-byte XMM register is used as a sliding window if our window size is 16 bytes + * and at least SSE 4.1 is enabled. Avoids memory access for the sliding window. + */ + uint32_t pushed_out = _mm_extract_epi32(window, 3); + pushed_out >>= 24; + + /* + * No intrinsic available for this. + */ + asm ("movd %[cur_byte], %[cur_sse_byte]" + : [cur_sse_byte] "=x" (cur_sse_byte) + : [cur_byte] "r" (cur_byte) + ); + window = _mm_slli_si128(window, 1); + window = _mm_or_si128(window, cur_sse_byte); +#else + uint32_t pushed_out = ctx->current_window_data[window_pos]; + ctx->current_window_data[window_pos] = cur_byte; +#endif cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK; cur_roll_checksum += cur_byte; cur_roll_checksum -= out[pushed_out]; +#ifndef SSE_MODE /* * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1 * We avoid a branch here by masking. 
This requires RAB_POLYNOMIAL_WIN_SIZE * to be power of 2 */ - ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1); + window_pos = (window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1); +#endif ++length; if (length < ctx->rabin_poly_min_block_size) continue; @@ -444,6 +497,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of ++blknum; last_offset = i+1; length = 0; + if (*size - last_offset <= ctx->rabin_poly_min_block_size) break; } } diff --git a/rabin/rabin_dedup.h b/rabin/rabin_dedup.h index dd5b565..2aeb4a3 100644 --- a/rabin/rabin_dedup.h +++ b/rabin/rabin_dedup.h @@ -150,7 +150,6 @@ typedef struct { rabin_blockentry_t **blocks; uint32_t blknum; unsigned char *cbuf; - int window_pos; uint32_t rabin_poly_max_block_size; uint32_t rabin_poly_min_block_size; uint32_t rabin_poly_avg_block_size; diff --git a/utils/utils.c b/utils/utils.c index c729865..d1e4c81 100644 --- a/utils/utils.c +++ b/utils/utils.c @@ -223,6 +223,11 @@ Read_Adjusted(int fd, uchar_t *buf, uint64_t count, int64_t *rabin_count, void * uint64_t rc, rbc; rc = rcount; rbc = *rabin_count; + + /* + * This call does not actually dedupe but finds the last rabin boundary + * in the buf. + */ dedupe_compress(rctx, buf, &rc, 0, &rbc); rcount = rc; *rabin_count = rbc;