Many optimization tweaks

Optimize Rabin Deduplication and Bsdiff. Vectorize XXHash using SSE4.
2013-01-20 22:02:26 +05:30 · 2013-01-20 22:02:26 +05:30 · 3888c8d316
commit 3888c8d316
parent 455c8107d5
8 changed files with 115 additions and 35 deletions
--- a/bsdiff/bsdiff.c
+++ b/bsdiff/bsdiff.c
@ -85,7 +85,7 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
 				};
 				if(V[I[k+i]+h]==x) {
 					tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
-					j++;
+					++j;
 				};
 			};
 			for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
@ -97,30 +97,30 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
 	x=V[I[start+len/2]+h];
 	jj=0;kk=0;
 	for(i=start;i<start+len;i++) {
-		if(V[I[i]+h]<x) jj++;
-		if(V[I[i]+h]==x) kk++;
+		if(V[I[i]+h]<x) ++jj;
+		if(V[I[i]+h]==x) ++kk;
 	};
 	jj+=start;kk+=jj;

 	i=start;j=0;k=0;
 	while(i<jj) {
 		if(V[I[i]+h]<x) {
-			i++;
+			++i;
 		} else if(V[I[i]+h]==x) {
 			tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
-			j++;
+			++j;
 		} else {
 			tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
-			k++;
+			++k;
 		};
 	};

 	while(jj+j<kk) {
 		if(V[I[jj+j]+h]==x) {
-			j++;
+			++j;
 		} else {
 			tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
-			k++;
+			++k;
 		};
 	};

@ -336,7 +336,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
 			s=0;Sf=0;lenf=0;
 			for(i=0;(lastscan+i<scan)&&(lastpos+i<oldsize);) {
 				s += (oldbuf[lastpos+i]==newbuf[lastscan+i]);
-				i++;
+				++i;
 				if(s*2-i>Sf*2-lenf) { Sf=s; lenf=i; };
 			};

--- a/bsdiff/rle_encoder.c
+++ b/bsdiff/rle_encoder.c
@ -53,7 +53,7 @@ zero_rle_encode(const void *ibuf, const unsigned int ilen,
 				if (val) break;
 				pos1 += sizeof (val); count += sizeof (val);
 			}
-			for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) count++;
+			for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) ++count;
 			count |= ZERO_MASK;
 			*((unsigned short *)(ob + pos2)) = htons(count);
 			pos2 += 2;
@ -75,7 +75,7 @@ zero_rle_encode(const void *ibuf, const unsigned int ilen,
 					}
 				}
 				ob[pos2++] = ib[pos1++];
-				count++;
+				++count;
 			}
 			*((unsigned short *)(ob + pos3)) = htons(count);
 		}
--- a/crypto/crypto_utils.c
+++ b/crypto/crypto_utils.c
@ -138,7 +138,7 @@ PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
 				p[k] ^= digtmp[k];
 		}
 		tkeylen-= cplen;
-		i++;
+		++i;
 		p+= cplen;
 	}
 	HMAC_CTX_cleanup(&hctx);
@ -263,7 +263,7 @@ serialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes)
 	j = 0;
 	for (i=cksum_bytes; i>0; i--) {
 		buf[j] = checksum[i-1];
-		j++;
+		++j;
 	}
 }

@ -275,7 +275,7 @@ deserialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes)
 	j = 0;
 	for (i=cksum_bytes; i>0; i--) {
 		checksum[i-1] = buf[j];
-		j++;
+		++j;
 	}
 }

--- a/delta2/delta2.c
+++ b/delta2/delta2.c
@ -418,7 +418,7 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
 			pos2 += sizeof (uint64_t);
 			for (cnt = 0; cnt < val; cnt++) {
 				*pos2 = *pos;
-				pos2++; pos++;
+				++pos2; ++pos;
 			}
 			DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
 		}
--- a/main.c
+++ b/main.c
@ -277,8 +277,8 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void
 	DEBUG_STAT_EN(double strt, en);

 	type = *sorc;
-	sorc++;
-	srclen--;
+	++sorc;
+	--srclen;
 	if (type & PREPROC_COMPRESSED) {
 		*dstlen = ntohll(*((uint64_t *)(sorc)));
 		sorc += 8;
@ -1093,7 +1093,7 @@ start_decompress(const char *filename, const char *to_filename)
 				}
 			}
 			sem_post(&tdat->start_sem);
-			chunk_num++;
+			++chunk_num;
 		}
 	}

@ -1903,7 +1903,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
 			}
 			/* Signal the compression thread to start */
 			sem_post(&tdat->start_sem);
-			chunk_num++;
+			++chunk_num;

 			if (single_chunk) {
 				rbytes = 0;
--- a/rabin/rabin_dedup.c
+++ b/rabin/rabin_dedup.c
@ -359,20 +359,20 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 		offset = *size - ctx->rabin_poly_max_block_size;
 		length = 0;
 		for (i=offset; i<*size; i++) {
-			uchar_t cur_byte = buf1[i];
-			uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
+			int cur_byte = buf1[i];
+			int pushed_out = ctx->current_window_data[ctx->window_pos];
 			ctx->current_window_data[ctx->window_pos] = cur_byte;

 			cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
 			cur_roll_checksum += cur_byte;
 			cur_roll_checksum -= out[pushed_out];
-			cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];

 			ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
 			++length;
 			if (length < ctx->rabin_poly_min_block_size) continue;

 			// If we hit our special value update block offset
+			cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
 			if ((cur_pos_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt) {
 				last_offset = i;
 				length = 0;
@ -385,18 +385,15 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 		return (0);
 	}

-	j = 0;
-
 	for (i=offset; i<*size; i++) {
 		uint64_t pc[4];
-		uchar_t cur_byte = buf1[i];
-		uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
+		int cur_byte = buf1[i];
+		int pushed_out = ctx->current_window_data[ctx->window_pos];
 		ctx->current_window_data[ctx->window_pos] = cur_byte;

 		cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
 		cur_roll_checksum += cur_byte;
 		cur_roll_checksum -= out[pushed_out];
-		cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];

 		/*
 		 * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
@ -408,6 +405,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 		if (length < ctx->rabin_poly_min_block_size) continue;

 		// If we hit our special value or reached the max block size update block offset
+		cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
 		if ((cur_pos_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
 		    length >= ctx->rabin_poly_max_block_size) {
 			if (ctx->blocks[blknum] == 0)
@ -446,7 +444,6 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 			++blknum;
 			last_offset = i+1;
 			length = 0;
-			j = 0;
 		}
 	}

@ -475,12 +472,11 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 				ksmallest((int64_t *)ctx_heap, length, &heap);
 				cur_sketch =
 				    XXH32((const uchar_t *)ctx_heap,  pc[ctx->delta_flag]*8, 0);
-				ctx->blocks[blknum]->similarity_hash = cur_sketch;
 			} else {
 				cur_sketch =
 				    XXH32((const uchar_t *)(buf1+last_offset), length, 0);
-				ctx->blocks[blknum]->similarity_hash = cur_sketch;
 			}
+			ctx->blocks[blknum]->similarity_hash = cur_sketch;
 		}
 		++blknum;
 		last_offset = *size;
@ -556,7 +552,7 @@ process_blocks:
 				length = 0;

 				/*
-				 * Look for exact duplicates. Same cksum, length and memcmp()\
+				 * Look for exact duplicates. Same cksum, length and memcmp()
 				 */
 				while (1) {
 					if (be->hash == ctx->blocks[i]->hash &&
--- a/utils/utils.c
+++ b/utils/utils.c
@ -301,7 +301,7 @@ set_threadcounts(algo_props_t *props, int *nthreads, int nprocs, algo_threads_ty
 				props->nthreads++;

 			} else if (props->nthreads * (nthreads1+1) <= nprocs && nthreads1 < *nthreads) {
-				nthreads1++;
+				++nthreads1;
 			} else {
 				break;
 			}
--- a/utils/xxhash.c
+++ b/utils/xxhash.c
@ -109,7 +109,15 @@ static inline unsigned int XXH_swap32 (unsigned int x) {
                 }
 #endif

-
+#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__)
+#include <smmintrin.h>
+static inline __m128i _x_mm_rotl_epi32(const __m128i a, int bits)
+{
+	__m128i tmp1 = _mm_slli_epi32(a, bits);
+	__m128i tmp2 = _mm_srli_epi32(a, 32 - bits);
+	return (_mm_or_si128(tmp1, tmp2));
+}
+#endif

 //**************************************
 // Constants
@ -146,7 +154,84 @@ unsigned int XXH32(const void* input, int len, unsigned int seed)
 	const unsigned char* const bEnd = p + len;
 	unsigned int h32;

-	if (len>=16)
+	if (len>=256)
+	{
+		const unsigned char* const limit = bEnd - 32;
+		unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
+		unsigned int v2 = seed + PRIME32_2;
+		unsigned int v3 = seed + 0;
+		unsigned int v4 = seed - PRIME32_1;
+#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__)
+		unsigned int vx[4], vx1[4];
+
+		__m128i accum = _mm_set_epi32(v4, v3, v2, v1);
+		__m128i accum1 = _mm_set_epi32(v4, v3, v2, v1);
+		__m128i prime1 = _mm_set1_epi32(PRIME32_1);
+		__m128i prime2 = _mm_set1_epi32(PRIME32_2);
+
+		/*
+		 * 4-way SIMD calculations with 4 ints in two blocks for 2 accumulators will
+		 * interleave to some extent on a hyperthreaded processor providing 10% - 14%
+		 * speedup over original xxhash depending on processor. We could have used
+		 * aligned loads but we actually want the unaligned penalty. It helps to
+		 * interleave better for a slight benefit over aligned loads here!
+		 */
+		do {
+			__m128i mem = _mm_loadu_si128((__m128i *)p);
+			p += 16;
+			mem = _mm_mullo_epi32(mem, prime2);
+			accum = _mm_add_epi32(accum, mem);
+			accum = _x_mm_rotl_epi32(accum, 13);
+			accum = _mm_mullo_epi32(accum, prime1);
+
+			mem = _mm_loadu_si128((__m128i *)p);
+			p += 16;
+			mem = _mm_mullo_epi32(mem, prime2);
+			accum1 = _mm_add_epi32(accum1, mem);
+			accum1 = _x_mm_rotl_epi32(accum1, 13);
+			accum1 = _mm_mullo_epi32(accum1, prime1);
+		} while (p<=limit);
+
+		_mm_storeu_si128((__m128i *)vx, accum);
+		_mm_storeu_si128((__m128i *)vx1, accum1);
+
+		/*
+		 * Combine the two accumulators into a single hash value.
+		 */
+		v1 = vx[0];
+		v2 = vx[1];
+		v3 = vx[2];
+		v4 = vx[3];
+		v1 += vx1[0] * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1;
+		v2 += vx1[1] * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1;
+		v3 += vx1[2] * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1;
+		v4 += vx1[3] * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1;
+		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+#else
+		unsigned int vx1 = seed + PRIME32_1 + PRIME32_2;
+		unsigned int vx2 = seed + PRIME32_2;
+		unsigned int vx3 = seed + 0;
+		unsigned int vx4 = seed - PRIME32_1;
+
+		do
+		{
+			v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+			v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+
+			vx1 += XXH_LE32(p) * PRIME32_2; vx1 = XXH_rotl32(vx1, 13); vx1 *= PRIME32_1; p+=4;
+			vx2 += XXH_LE32(p) * PRIME32_2; vx2 = XXH_rotl32(vx2, 13); vx2 *= PRIME32_1; p+=4;
+			vx3 += XXH_LE32(p) * PRIME32_2; vx3 = XXH_rotl32(vx3, 13); vx3 *= PRIME32_1; p+=4;
+			vx4 += XXH_LE32(p) * PRIME32_2; vx4 = XXH_rotl32(vx4, 13); vx4 *= PRIME32_1; p+=4;
+		} while (p<=limit) ;
+		v1 += vx1 * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1;
+		v2 += vx2 * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1;
+		v3 += vx3 * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1;
+		v4 += vx4 * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1;
+		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+#endif
+	} else if (len>=16)
 	{
 		const unsigned char* const limit = bEnd - 16;
 		unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
@ -161,7 +246,6 @@ unsigned int XXH32(const void* input, int len, unsigned int seed)
 			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
 			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
 		} while (p<=limit) ;
-
 		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
 	}
 	else