Many optimization tweaks
Optimize Rabin Deduplication and Bsdiff. Vectorize XXHash using SSE4.
This commit is contained in:
parent
455c8107d5
commit
3888c8d316
8 changed files with 115 additions and 35 deletions
|
@ -85,7 +85,7 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
|
|||
};
|
||||
if(V[I[k+i]+h]==x) {
|
||||
tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
|
||||
j++;
|
||||
++j;
|
||||
};
|
||||
};
|
||||
for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
|
||||
|
@ -97,30 +97,30 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
|
|||
x=V[I[start+len/2]+h];
|
||||
jj=0;kk=0;
|
||||
for(i=start;i<start+len;i++) {
|
||||
if(V[I[i]+h]<x) jj++;
|
||||
if(V[I[i]+h]==x) kk++;
|
||||
if(V[I[i]+h]<x) ++jj;
|
||||
if(V[I[i]+h]==x) ++kk;
|
||||
};
|
||||
jj+=start;kk+=jj;
|
||||
|
||||
i=start;j=0;k=0;
|
||||
while(i<jj) {
|
||||
if(V[I[i]+h]<x) {
|
||||
i++;
|
||||
++i;
|
||||
} else if(V[I[i]+h]==x) {
|
||||
tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
|
||||
j++;
|
||||
++j;
|
||||
} else {
|
||||
tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
|
||||
k++;
|
||||
++k;
|
||||
};
|
||||
};
|
||||
|
||||
while(jj+j<kk) {
|
||||
if(V[I[jj+j]+h]==x) {
|
||||
j++;
|
||||
++j;
|
||||
} else {
|
||||
tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
|
||||
k++;
|
||||
++k;
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -336,7 +336,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
|
|||
s=0;Sf=0;lenf=0;
|
||||
for(i=0;(lastscan+i<scan)&&(lastpos+i<oldsize);) {
|
||||
s += (oldbuf[lastpos+i]==newbuf[lastscan+i]);
|
||||
i++;
|
||||
++i;
|
||||
if(s*2-i>Sf*2-lenf) { Sf=s; lenf=i; };
|
||||
};
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ zero_rle_encode(const void *ibuf, const unsigned int ilen,
|
|||
if (val) break;
|
||||
pos1 += sizeof (val); count += sizeof (val);
|
||||
}
|
||||
for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) count++;
|
||||
for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) ++count;
|
||||
count |= ZERO_MASK;
|
||||
*((unsigned short *)(ob + pos2)) = htons(count);
|
||||
pos2 += 2;
|
||||
|
@ -75,7 +75,7 @@ zero_rle_encode(const void *ibuf, const unsigned int ilen,
|
|||
}
|
||||
}
|
||||
ob[pos2++] = ib[pos1++];
|
||||
count++;
|
||||
++count;
|
||||
}
|
||||
*((unsigned short *)(ob + pos3)) = htons(count);
|
||||
}
|
||||
|
|
|
@ -138,7 +138,7 @@ PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
|
|||
p[k] ^= digtmp[k];
|
||||
}
|
||||
tkeylen-= cplen;
|
||||
i++;
|
||||
++i;
|
||||
p+= cplen;
|
||||
}
|
||||
HMAC_CTX_cleanup(&hctx);
|
||||
|
@ -263,7 +263,7 @@ serialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes)
|
|||
j = 0;
|
||||
for (i=cksum_bytes; i>0; i--) {
|
||||
buf[j] = checksum[i-1];
|
||||
j++;
|
||||
++j;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -275,7 +275,7 @@ deserialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes)
|
|||
j = 0;
|
||||
for (i=cksum_bytes; i>0; i--) {
|
||||
checksum[i-1] = buf[j];
|
||||
j++;
|
||||
++j;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -418,7 +418,7 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
|
|||
pos2 += sizeof (uint64_t);
|
||||
for (cnt = 0; cnt < val; cnt++) {
|
||||
*pos2 = *pos;
|
||||
pos2++; pos++;
|
||||
++pos2; ++pos;
|
||||
}
|
||||
DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
|
||||
}
|
||||
|
|
8
main.c
8
main.c
|
@ -277,8 +277,8 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void
|
|||
DEBUG_STAT_EN(double strt, en);
|
||||
|
||||
type = *sorc;
|
||||
sorc++;
|
||||
srclen--;
|
||||
++sorc;
|
||||
--srclen;
|
||||
if (type & PREPROC_COMPRESSED) {
|
||||
*dstlen = ntohll(*((uint64_t *)(sorc)));
|
||||
sorc += 8;
|
||||
|
@ -1093,7 +1093,7 @@ start_decompress(const char *filename, const char *to_filename)
|
|||
}
|
||||
}
|
||||
sem_post(&tdat->start_sem);
|
||||
chunk_num++;
|
||||
++chunk_num;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1903,7 +1903,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
|||
}
|
||||
/* Signal the compression thread to start */
|
||||
sem_post(&tdat->start_sem);
|
||||
chunk_num++;
|
||||
++chunk_num;
|
||||
|
||||
if (single_chunk) {
|
||||
rbytes = 0;
|
||||
|
|
|
@ -359,20 +359,20 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
offset = *size - ctx->rabin_poly_max_block_size;
|
||||
length = 0;
|
||||
for (i=offset; i<*size; i++) {
|
||||
uchar_t cur_byte = buf1[i];
|
||||
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||
int cur_byte = buf1[i];
|
||||
int pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
||||
|
||||
cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
|
||||
cur_roll_checksum += cur_byte;
|
||||
cur_roll_checksum -= out[pushed_out];
|
||||
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
||||
|
||||
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
|
||||
++length;
|
||||
if (length < ctx->rabin_poly_min_block_size) continue;
|
||||
|
||||
// If we hit our special value update block offset
|
||||
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
||||
if ((cur_pos_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt) {
|
||||
last_offset = i;
|
||||
length = 0;
|
||||
|
@ -385,18 +385,15 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
return (0);
|
||||
}
|
||||
|
||||
j = 0;
|
||||
|
||||
for (i=offset; i<*size; i++) {
|
||||
uint64_t pc[4];
|
||||
uchar_t cur_byte = buf1[i];
|
||||
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||
int cur_byte = buf1[i];
|
||||
int pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
||||
|
||||
cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
|
||||
cur_roll_checksum += cur_byte;
|
||||
cur_roll_checksum -= out[pushed_out];
|
||||
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
||||
|
||||
/*
|
||||
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
|
||||
|
@ -408,6 +405,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
if (length < ctx->rabin_poly_min_block_size) continue;
|
||||
|
||||
// If we hit our special value or reached the max block size update block offset
|
||||
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
||||
if ((cur_pos_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
|
||||
length >= ctx->rabin_poly_max_block_size) {
|
||||
if (ctx->blocks[blknum] == 0)
|
||||
|
@ -446,7 +444,6 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
++blknum;
|
||||
last_offset = i+1;
|
||||
length = 0;
|
||||
j = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -475,12 +472,11 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
ksmallest((int64_t *)ctx_heap, length, &heap);
|
||||
cur_sketch =
|
||||
XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0);
|
||||
ctx->blocks[blknum]->similarity_hash = cur_sketch;
|
||||
} else {
|
||||
cur_sketch =
|
||||
XXH32((const uchar_t *)(buf1+last_offset), length, 0);
|
||||
ctx->blocks[blknum]->similarity_hash = cur_sketch;
|
||||
}
|
||||
ctx->blocks[blknum]->similarity_hash = cur_sketch;
|
||||
}
|
||||
++blknum;
|
||||
last_offset = *size;
|
||||
|
@ -556,7 +552,7 @@ process_blocks:
|
|||
length = 0;
|
||||
|
||||
/*
|
||||
* Look for exact duplicates. Same cksum, length and memcmp()\
|
||||
* Look for exact duplicates. Same cksum, length and memcmp()
|
||||
*/
|
||||
while (1) {
|
||||
if (be->hash == ctx->blocks[i]->hash &&
|
||||
|
|
|
@ -301,7 +301,7 @@ set_threadcounts(algo_props_t *props, int *nthreads, int nprocs, algo_threads_ty
|
|||
props->nthreads++;
|
||||
|
||||
} else if (props->nthreads * (nthreads1+1) <= nprocs && nthreads1 < *nthreads) {
|
||||
nthreads1++;
|
||||
++nthreads1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -109,7 +109,15 @@ static inline unsigned int XXH_swap32 (unsigned int x) {
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__)
|
||||
#include <smmintrin.h>
|
||||
/*
 * Rotate each packed 32-bit element of `a` left by `bits` positions.
 * The result is the OR of the elements shifted left by `bits` and the
 * same elements shifted right (logically) by `32 - bits`.
 */
static inline __m128i _x_mm_rotl_epi32(const __m128i a, int bits)
|
||||
{
|
||||
__m128i tmp1 = _mm_slli_epi32(a, bits);
|
||||
__m128i tmp2 = _mm_srli_epi32(a, 32 - bits);
|
||||
return (_mm_or_si128(tmp1, tmp2));
|
||||
}
|
||||
#endif
|
||||
|
||||
//**************************************
|
||||
// Constants
|
||||
|
@ -146,7 +154,84 @@ unsigned int XXH32(const void* input, int len, unsigned int seed)
|
|||
const unsigned char* const bEnd = p + len;
|
||||
unsigned int h32;
|
||||
|
||||
if (len>=16)
|
||||
if (len>=256)
|
||||
{
|
||||
const unsigned char* const limit = bEnd - 32;
|
||||
unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
|
||||
unsigned int v2 = seed + PRIME32_2;
|
||||
unsigned int v3 = seed + 0;
|
||||
unsigned int v4 = seed - PRIME32_1;
|
||||
#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__)
|
||||
unsigned int vx[4], vx1[4];
|
||||
|
||||
__m128i accum = _mm_set_epi32(v4, v3, v2, v1);
|
||||
__m128i accum1 = _mm_set_epi32(v4, v3, v2, v1);
|
||||
__m128i prime1 = _mm_set1_epi32(PRIME32_1);
|
||||
__m128i prime2 = _mm_set1_epi32(PRIME32_2);
|
||||
|
||||
/*
|
||||
* 4-way SIMD calculations with 4 ints in two blocks for 2 accumulators will
|
||||
* interleave to some extent on a hyperthreaded processor providing 10% - 14%
|
||||
* speedup over original xxhash depending on processor. We could have used
|
||||
* aligned loads but we actually want the unaligned penalty. It helps to
|
||||
* interleave better for a slight benefit over aligned loads here!
|
||||
*/
|
||||
do {
|
||||
__m128i mem = _mm_loadu_si128((__m128i *)p);
|
||||
p += 16;
|
||||
mem = _mm_mullo_epi32(mem, prime2);
|
||||
accum = _mm_add_epi32(accum, mem);
|
||||
accum = _x_mm_rotl_epi32(accum, 13);
|
||||
accum = _mm_mullo_epi32(accum, prime1);
|
||||
|
||||
mem = _mm_loadu_si128((__m128i *)p);
|
||||
p += 16;
|
||||
mem = _mm_mullo_epi32(mem, prime2);
|
||||
accum1 = _mm_add_epi32(accum1, mem);
|
||||
accum1 = _x_mm_rotl_epi32(accum1, 13);
|
||||
accum1 = _mm_mullo_epi32(accum1, prime1);
|
||||
} while (p<=limit);
|
||||
|
||||
_mm_storeu_si128((__m128i *)vx, accum);
|
||||
_mm_storeu_si128((__m128i *)vx1, accum1);
|
||||
|
||||
/*
|
||||
* Combine the two accumulators into a single hash value.
|
||||
*/
|
||||
v1 = vx[0];
|
||||
v2 = vx[1];
|
||||
v3 = vx[2];
|
||||
v4 = vx[3];
|
||||
v1 += vx1[0] * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1;
|
||||
v2 += vx1[1] * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1;
|
||||
v3 += vx1[2] * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1;
|
||||
v4 += vx1[3] * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1;
|
||||
h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
|
||||
#else
|
||||
unsigned int vx1 = seed + PRIME32_1 + PRIME32_2;
|
||||
unsigned int vx2 = seed + PRIME32_2;
|
||||
unsigned int vx3 = seed + 0;
|
||||
unsigned int vx4 = seed - PRIME32_1;
|
||||
|
||||
do
|
||||
{
|
||||
v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
|
||||
v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
|
||||
v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
|
||||
v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
|
||||
|
||||
vx1 += XXH_LE32(p) * PRIME32_2; vx1 = XXH_rotl32(vx1, 13); vx1 *= PRIME32_1; p+=4;
|
||||
vx2 += XXH_LE32(p) * PRIME32_2; vx2 = XXH_rotl32(vx2, 13); vx2 *= PRIME32_1; p+=4;
|
||||
vx3 += XXH_LE32(p) * PRIME32_2; vx3 = XXH_rotl32(vx3, 13); vx3 *= PRIME32_1; p+=4;
|
||||
vx4 += XXH_LE32(p) * PRIME32_2; vx4 = XXH_rotl32(vx4, 13); vx4 *= PRIME32_1; p+=4;
|
||||
} while (p<=limit) ;
|
||||
v1 += vx1 * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1;
|
||||
v2 += vx2 * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1;
|
||||
v3 += vx3 * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1;
|
||||
v4 += vx4 * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1;
|
||||
h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
|
||||
#endif
|
||||
} else if (len>=16)
|
||||
{
|
||||
const unsigned char* const limit = bEnd - 16;
|
||||
unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
|
||||
|
@ -161,7 +246,6 @@ unsigned int XXH32(const void* input, int len, unsigned int seed)
|
|||
v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
|
||||
v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
|
||||
} while (p<=limit) ;
|
||||
|
||||
h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
|
||||
}
|
||||
else
|
||||
|
|
Loading…
Reference in a new issue