Improve Deduplication throughput by 90%.

Use SSE4 register as sliding window for default 16-byte window size.
Use local variable for sliding window position to avoid spurious memory access in non-SIMD case.
Avoid computing breakpoint check value if processed length < minimum block length.
This commit is contained in:
Moinak Ghosh 2013-01-22 15:54:42 +05:30
parent e9e3e1e632
commit 5c8704c5bb
4 changed files with 74 additions and 12 deletions

View file

@ -8,6 +8,10 @@ The simplest process to build and install this utility is:
make make
make install make install
The current makefiles and config scripts assume the GCC compiler is
in the PATH. Please update PATH before running config if that is
not the case.
In order to remove all binaries: In order to remove all binaries:
make clean make clean

View file

@ -71,6 +71,10 @@
#include <xxhash.h> #include <xxhash.h>
#include "rabin_dedup.h" #include "rabin_dedup.h"
#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__) && RAB_POLYNOMIAL_WIN_SIZE == 16
# include <smmintrin.h>
# define SSE_MODE 1
#endif
#define DELTA_EXTRA2_PCT(x) ((x) >> 1) #define DELTA_EXTRA2_PCT(x) ((x) >> 1)
#define DELTA_EXTRA_PCT(x) (((x) >> 1) + ((x) >> 3)) #define DELTA_EXTRA_PCT(x) (((x) >> 1) + ((x) >> 3))
@ -221,7 +225,11 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
destroy_dedupe_context(ctx); destroy_dedupe_context(ctx);
return (NULL); return (NULL);
} }
#ifndef SSE_MODE
ctx->current_window_data = (uchar_t *)slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE); ctx->current_window_data = (uchar_t *)slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE);
#else
ctx->current_window_data = (uchar_t *)1;
#endif
ctx->blocks = NULL; ctx->blocks = NULL;
if (real_chunksize > 0) { if (real_chunksize > 0) {
ctx->blocks = (rabin_blockentry_t **)slab_calloc(NULL, ctx->blocks = (rabin_blockentry_t **)slab_calloc(NULL,
@ -258,8 +266,9 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
void void
reset_dedupe_context(dedupe_context_t *ctx) reset_dedupe_context(dedupe_context_t *ctx)
{ {
#ifndef SSE_MODE
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
ctx->window_pos = 0; #endif
ctx->valid = 0; ctx->valid = 0;
} }
@ -268,7 +277,9 @@ destroy_dedupe_context(dedupe_context_t *ctx)
{ {
if (ctx) { if (ctx) {
uint32_t i; uint32_t i;
#ifndef SSE_MODE
if (ctx->current_window_data) slab_free(NULL, ctx->current_window_data); if (ctx->current_window_data) slab_free(NULL, ctx->current_window_data);
#endif
if (ctx->blocks) { if (ctx->blocks) {
for (i=0; i<ctx->blknum && ctx->blocks[i] != NULL; i++) { for (i=0; i<ctx->blknum && ctx->blocks[i] != NULL; i++) {
slab_free(NULL, ctx->blocks[i]); slab_free(NULL, ctx->blocks[i]);
@ -290,7 +301,7 @@ uint32_t
dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t offset, uint64_t *rabin_pos) dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t offset, uint64_t *rabin_pos)
{ {
uint64_t i, last_offset, j, ary_sz; uint64_t i, last_offset, j, ary_sz;
uint32_t blknum; uint32_t blknum, window_pos;
uchar_t *buf1 = (uchar_t *)buf; uchar_t *buf1 = (uchar_t *)buf;
uint32_t length; uint32_t length;
uint64_t cur_roll_checksum, cur_pos_checksum; uint64_t cur_roll_checksum, cur_pos_checksum;
@ -304,6 +315,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
length = offset; length = offset;
last_offset = 0; last_offset = 0;
blknum = 0; blknum = 0;
window_pos = 0;
ctx->valid = 0; ctx->valid = 0;
cur_roll_checksum = 0; cur_roll_checksum = 0;
if (*size < ctx->rabin_poly_avg_block_size) return (0); if (*size < ctx->rabin_poly_avg_block_size) return (0);
@ -346,7 +358,13 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
ary_sz = ctx->rabin_poly_max_block_size; ary_sz = ctx->rabin_poly_max_block_size;
ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz); ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
} }
#ifndef SSE_MODE
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
#else
__m128i cur_sse_byte = _mm_setzero_si128();
__m128i window = _mm_setzero_si128();
#endif
j = *size - RAB_POLYNOMIAL_WIN_SIZE;
/* /*
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary * If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
@ -358,16 +376,29 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
if (rabin_pos) { if (rabin_pos) {
offset = *size - ctx->rabin_poly_max_block_size; offset = *size - ctx->rabin_poly_max_block_size;
length = 0; length = 0;
for (i=offset; i<*size; i++) { for (i=offset; i<j; i++) {
int cur_byte = buf1[i]; int cur_byte = buf1[i];
int pushed_out = ctx->current_window_data[ctx->window_pos]; #ifdef SSE_MODE
ctx->current_window_data[ctx->window_pos] = cur_byte; uint32_t pushed_out = _mm_extract_epi32(window, 3);
pushed_out >>= 24;
asm ("movd %[cur_byte], %[cur_sse_byte]"
: [cur_sse_byte] "=x" (cur_sse_byte)
: [cur_byte] "r" (cur_byte)
);
window = _mm_slli_si128(window, 1);
window = _mm_or_si128(window, cur_sse_byte);
#else
uint32_t pushed_out = ctx->current_window_data[window_pos];
ctx->current_window_data[window_pos] = cur_byte;
#endif
cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK; cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
cur_roll_checksum += cur_byte; cur_roll_checksum += cur_byte;
cur_roll_checksum -= out[pushed_out]; cur_roll_checksum -= out[pushed_out];
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1); #ifndef SSE_MODE
window_pos = (window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
#endif
++length; ++length;
if (length < ctx->rabin_poly_min_block_size) continue; if (length < ctx->rabin_poly_min_block_size) continue;
@ -385,22 +416,44 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
return (0); return (0);
} }
for (i=offset; i<*size; i++) { for (i=offset; i<j; i++) {
uint64_t pc[4]; uint64_t pc[4];
int cur_byte = buf1[i]; uint32_t cur_byte = buf1[i];
int pushed_out = ctx->current_window_data[ctx->window_pos];
ctx->current_window_data[ctx->window_pos] = cur_byte; #ifdef SSE_MODE
/*
* A 16-byte XMM register is used as a sliding window if our window size is 16 bytes
* and at least SSE 4.1 is enabled. Avoids memory access for the sliding window.
*/
uint32_t pushed_out = _mm_extract_epi32(window, 3);
pushed_out >>= 24;
/*
* No intrinsic available for this.
*/
asm ("movd %[cur_byte], %[cur_sse_byte]"
: [cur_sse_byte] "=x" (cur_sse_byte)
: [cur_byte] "r" (cur_byte)
);
window = _mm_slli_si128(window, 1);
window = _mm_or_si128(window, cur_sse_byte);
#else
uint32_t pushed_out = ctx->current_window_data[window_pos];
ctx->current_window_data[window_pos] = cur_byte;
#endif
cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK; cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
cur_roll_checksum += cur_byte; cur_roll_checksum += cur_byte;
cur_roll_checksum -= out[pushed_out]; cur_roll_checksum -= out[pushed_out];
#ifndef SSE_MODE
/* /*
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1 * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
* We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE * We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE
* to be power of 2 * to be power of 2
*/ */
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1); window_pos = (window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
#endif
++length; ++length;
if (length < ctx->rabin_poly_min_block_size) continue; if (length < ctx->rabin_poly_min_block_size) continue;
@ -444,6 +497,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
++blknum; ++blknum;
last_offset = i+1; last_offset = i+1;
length = 0; length = 0;
if (*size - last_offset <= ctx->rabin_poly_min_block_size) break;
} }
} }

View file

@ -150,7 +150,6 @@ typedef struct {
rabin_blockentry_t **blocks; rabin_blockentry_t **blocks;
uint32_t blknum; uint32_t blknum;
unsigned char *cbuf; unsigned char *cbuf;
int window_pos;
uint32_t rabin_poly_max_block_size; uint32_t rabin_poly_max_block_size;
uint32_t rabin_poly_min_block_size; uint32_t rabin_poly_min_block_size;
uint32_t rabin_poly_avg_block_size; uint32_t rabin_poly_avg_block_size;

View file

@ -223,6 +223,11 @@ Read_Adjusted(int fd, uchar_t *buf, uint64_t count, int64_t *rabin_count, void *
uint64_t rc, rbc; uint64_t rc, rbc;
rc = rcount; rc = rcount;
rbc = *rabin_count; rbc = *rabin_count;
/*
* This call does not actually dedupe but finds the last rabin boundary
* in the buf.
*/
dedupe_compress(rctx, buf, &rc, 0, &rbc); dedupe_compress(rctx, buf, &rc, 0, &rbc);
rcount = rc; rcount = rc;
*rabin_count = rbc; *rabin_count = rbc;