Many optimization tweaks

Optimize Rabin Deduplication and Bsdiff
Vectorize XXHash using SSE4
This commit is contained in:
Moinak Ghosh 2013-01-20 22:02:26 +05:30
parent 455c8107d5
commit 3888c8d316
8 changed files with 115 additions and 35 deletions

View file

@ -85,7 +85,7 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
};
if(V[I[k+i]+h]==x) {
tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
j++;
++j;
};
};
for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
@ -97,30 +97,30 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
x=V[I[start+len/2]+h];
jj=0;kk=0;
for(i=start;i<start+len;i++) {
if(V[I[i]+h]<x) jj++;
if(V[I[i]+h]==x) kk++;
if(V[I[i]+h]<x) ++jj;
if(V[I[i]+h]==x) ++kk;
};
jj+=start;kk+=jj;
i=start;j=0;k=0;
while(i<jj) {
if(V[I[i]+h]<x) {
i++;
++i;
} else if(V[I[i]+h]==x) {
tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
j++;
++j;
} else {
tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
k++;
++k;
};
};
while(jj+j<kk) {
if(V[I[jj+j]+h]==x) {
j++;
++j;
} else {
tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
k++;
++k;
};
};
@ -336,7 +336,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
s=0;Sf=0;lenf=0;
for(i=0;(lastscan+i<scan)&&(lastpos+i<oldsize);) {
s += (oldbuf[lastpos+i]==newbuf[lastscan+i]);
i++;
++i;
if(s*2-i>Sf*2-lenf) { Sf=s; lenf=i; };
};

View file

@ -53,7 +53,7 @@ zero_rle_encode(const void *ibuf, const unsigned int ilen,
if (val) break;
pos1 += sizeof (val); count += sizeof (val);
}
for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) count++;
for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) ++count;
count |= ZERO_MASK;
*((unsigned short *)(ob + pos2)) = htons(count);
pos2 += 2;
@ -75,7 +75,7 @@ zero_rle_encode(const void *ibuf, const unsigned int ilen,
}
}
ob[pos2++] = ib[pos1++];
count++;
++count;
}
*((unsigned short *)(ob + pos3)) = htons(count);
}

View file

@ -138,7 +138,7 @@ PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
p[k] ^= digtmp[k];
}
tkeylen-= cplen;
i++;
++i;
p+= cplen;
}
HMAC_CTX_cleanup(&hctx);
@ -263,7 +263,7 @@ serialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes)
j = 0;
for (i=cksum_bytes; i>0; i--) {
buf[j] = checksum[i-1];
j++;
++j;
}
}
@ -275,7 +275,7 @@ deserialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes)
j = 0;
for (i=cksum_bytes; i>0; i--) {
checksum[i-1] = buf[j];
j++;
++j;
}
}

View file

@ -418,7 +418,7 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
pos2 += sizeof (uint64_t);
for (cnt = 0; cnt < val; cnt++) {
*pos2 = *pos;
pos2++; pos++;
++pos2; ++pos;
}
DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
}

8
main.c
View file

@ -277,8 +277,8 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void
DEBUG_STAT_EN(double strt, en);
type = *sorc;
sorc++;
srclen--;
++sorc;
--srclen;
if (type & PREPROC_COMPRESSED) {
*dstlen = ntohll(*((uint64_t *)(sorc)));
sorc += 8;
@ -1093,7 +1093,7 @@ start_decompress(const char *filename, const char *to_filename)
}
}
sem_post(&tdat->start_sem);
chunk_num++;
++chunk_num;
}
}
@ -1903,7 +1903,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
}
/* Signal the compression thread to start */
sem_post(&tdat->start_sem);
chunk_num++;
++chunk_num;
if (single_chunk) {
rbytes = 0;

View file

@ -359,20 +359,20 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
offset = *size - ctx->rabin_poly_max_block_size;
length = 0;
for (i=offset; i<*size; i++) {
uchar_t cur_byte = buf1[i];
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
int cur_byte = buf1[i];
int pushed_out = ctx->current_window_data[ctx->window_pos];
ctx->current_window_data[ctx->window_pos] = cur_byte;
cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
cur_roll_checksum += cur_byte;
cur_roll_checksum -= out[pushed_out];
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
++length;
if (length < ctx->rabin_poly_min_block_size) continue;
// If we hit our special value update block offset
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
if ((cur_pos_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt) {
last_offset = i;
length = 0;
@ -385,18 +385,15 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
return (0);
}
j = 0;
for (i=offset; i<*size; i++) {
uint64_t pc[4];
uchar_t cur_byte = buf1[i];
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
int cur_byte = buf1[i];
int pushed_out = ctx->current_window_data[ctx->window_pos];
ctx->current_window_data[ctx->window_pos] = cur_byte;
cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
cur_roll_checksum += cur_byte;
cur_roll_checksum -= out[pushed_out];
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
/*
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
@ -408,6 +405,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
if (length < ctx->rabin_poly_min_block_size) continue;
// If we hit our special value or reached the max block size update block offset
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
if ((cur_pos_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
length >= ctx->rabin_poly_max_block_size) {
if (ctx->blocks[blknum] == 0)
@ -446,7 +444,6 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
++blknum;
last_offset = i+1;
length = 0;
j = 0;
}
}
@ -475,12 +472,11 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
ksmallest((int64_t *)ctx_heap, length, &heap);
cur_sketch =
XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0);
ctx->blocks[blknum]->similarity_hash = cur_sketch;
} else {
cur_sketch =
XXH32((const uchar_t *)(buf1+last_offset), length, 0);
ctx->blocks[blknum]->similarity_hash = cur_sketch;
}
ctx->blocks[blknum]->similarity_hash = cur_sketch;
}
++blknum;
last_offset = *size;
@ -556,7 +552,7 @@ process_blocks:
length = 0;
/*
* Look for exact duplicates. Same cksum, length and memcmp()\
* Look for exact duplicates. Same cksum, length and memcmp()
*/
while (1) {
if (be->hash == ctx->blocks[i]->hash &&

View file

@ -301,7 +301,7 @@ set_threadcounts(algo_props_t *props, int *nthreads, int nprocs, algo_threads_ty
props->nthreads++;
} else if (props->nthreads * (nthreads1+1) <= nprocs && nthreads1 < *nthreads) {
nthreads1++;
++nthreads1;
} else {
break;
}

View file

@ -109,7 +109,15 @@ static inline unsigned int XXH_swap32 (unsigned int x) {
}
#endif
#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__)
#include <smmintrin.h>
/*
 * Rotate each of the four 32-bit lanes of `a` left by `bits` positions.
 *
 * SSE has no packed 32-bit rotate, so the rotation is assembled from a
 * left shift by `bits` OR-ed with a logical right shift by `32 - bits`;
 * the right shift brings the bits that fell off the top back in at the
 * bottom of each lane.
 */
static inline __m128i _x_mm_rotl_epi32(const __m128i a, int bits)
{
	return (_mm_or_si128(_mm_slli_epi32(a, bits),
	    _mm_srli_epi32(a, 32 - bits)));
}
#endif
//**************************************
// Constants
@ -146,7 +154,84 @@ unsigned int XXH32(const void* input, int len, unsigned int seed)
const unsigned char* const bEnd = p + len;
unsigned int h32;
if (len>=16)
if (len>=256)
{
const unsigned char* const limit = bEnd - 32;
unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
unsigned int v2 = seed + PRIME32_2;
unsigned int v3 = seed + 0;
unsigned int v4 = seed - PRIME32_1;
#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__)
unsigned int vx[4], vx1[4];
__m128i accum = _mm_set_epi32(v4, v3, v2, v1);
__m128i accum1 = _mm_set_epi32(v4, v3, v2, v1);
__m128i prime1 = _mm_set1_epi32(PRIME32_1);
__m128i prime2 = _mm_set1_epi32(PRIME32_2);
/*
* 4-way SIMD calculations with 4 ints in two blocks for 2 accumulators will
* interleave to some extent on a hyperthreaded processor providing 10% - 14%
* speedup over original xxhash depending on processor. We could have used
* aligned loads but we actually want the unaligned penalty. It helps to
* interleave better for a slight benefit over aligned loads here!
*/
do {
__m128i mem = _mm_loadu_si128((__m128i *)p);
p += 16;
mem = _mm_mullo_epi32(mem, prime2);
accum = _mm_add_epi32(accum, mem);
accum = _x_mm_rotl_epi32(accum, 13);
accum = _mm_mullo_epi32(accum, prime1);
mem = _mm_loadu_si128((__m128i *)p);
p += 16;
mem = _mm_mullo_epi32(mem, prime2);
accum1 = _mm_add_epi32(accum1, mem);
accum1 = _x_mm_rotl_epi32(accum1, 13);
accum1 = _mm_mullo_epi32(accum1, prime1);
} while (p<=limit);
_mm_storeu_si128((__m128i *)vx, accum);
_mm_storeu_si128((__m128i *)vx1, accum1);
/*
* Combine the two accumulators into a single hash value.
*/
v1 = vx[0];
v2 = vx[1];
v3 = vx[2];
v4 = vx[3];
v1 += vx1[0] * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1;
v2 += vx1[1] * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1;
v3 += vx1[2] * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1;
v4 += vx1[3] * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1;
h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
#else
unsigned int vx1 = seed + PRIME32_1 + PRIME32_2;
unsigned int vx2 = seed + PRIME32_2;
unsigned int vx3 = seed + 0;
unsigned int vx4 = seed - PRIME32_1;
do
{
v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
vx1 += XXH_LE32(p) * PRIME32_2; vx1 = XXH_rotl32(vx1, 13); vx1 *= PRIME32_1; p+=4;
vx2 += XXH_LE32(p) * PRIME32_2; vx2 = XXH_rotl32(vx2, 13); vx2 *= PRIME32_1; p+=4;
vx3 += XXH_LE32(p) * PRIME32_2; vx3 = XXH_rotl32(vx3, 13); vx3 *= PRIME32_1; p+=4;
vx4 += XXH_LE32(p) * PRIME32_2; vx4 = XXH_rotl32(vx4, 13); vx4 *= PRIME32_1; p+=4;
} while (p<=limit) ;
v1 += vx1 * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1;
v2 += vx2 * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1;
v3 += vx3 * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1;
v4 += vx4 * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1;
h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
#endif
} else if (len>=16)
{
const unsigned char* const limit = bEnd - 16;
unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
@ -161,7 +246,6 @@ unsigned int XXH32(const void* input, int len, unsigned int seed)
v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
} while (p<=limit) ;
h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
}
else