diff --git a/bsdiff/bsdiff.c b/bsdiff/bsdiff.c
index af6ff8b..7c4fc0a 100644
--- a/bsdiff/bsdiff.c
+++ b/bsdiff/bsdiff.c
@@ -85,7 +85,7 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
             };
             if(V[I[k+i]+h]==x) {
                 tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
-                j++;
+                ++j;
             };
         };
         for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
             if(s*2-i>Sf*2-lenf) { Sf=s; lenf=i; };
         };
diff --git a/bsdiff/rle_encoder.c b/bsdiff/rle_encoder.c
index 1665b51..57933f6 100644
--- a/bsdiff/rle_encoder.c
+++ b/bsdiff/rle_encoder.c
@@ -53,7 +53,7 @@ zero_rle_encode(const void *ibuf, const unsigned int ilen,
             if (val) break;
             pos1 += sizeof (val);
             count += sizeof (val);
         }
-        for (;pos1
     for (i=cksum_bytes; i>0; i--) {
         buf[j] = checksum[i-1];
-        j++;
+        ++j;
     }
 }
 
@@ -275,7 +275,7 @@ deserialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes)
     j = 0;
     for (i=cksum_bytes; i>0; i--) {
         checksum[i-1] = buf[j];
-        j++;
+        ++j;
     }
 }
 
diff --git a/delta2/delta2.c b/delta2/delta2.c
index c7274ad..e3c9c84 100644
--- a/delta2/delta2.c
+++ b/delta2/delta2.c
@@ -418,7 +418,7 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
             pos2 += sizeof (uint64_t);
             for (cnt = 0; cnt < val; cnt++) {
                 *pos2 = *pos;
-                pos2++; pos++;
+                ++pos2; ++pos;
             }
             DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
         }
diff --git a/main.c b/main.c
index 3e68790..d06c73b 100644
--- a/main.c
+++ b/main.c
@@ -277,8 +277,8 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void
     DEBUG_STAT_EN(double strt, en);
 
     type = *sorc;
-    sorc++;
-    srclen--;
+    ++sorc;
+    --srclen;
     if (type & PREPROC_COMPRESSED) {
         *dstlen = ntohll(*((uint64_t *)(sorc)));
         sorc += 8;
@@ -1093,7 +1093,7 @@ start_decompress(const char *filename, const char *to_filename)
             }
         }
         sem_post(&tdat->start_sem);
-        chunk_num++;
+        ++chunk_num;
     }
 
 }
@@ -1903,7 +1903,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
         }
         /* Signal the compression thread to start */
         sem_post(&tdat->start_sem);
-        chunk_num++;
+        ++chunk_num;
 
         if (single_chunk) {
             rbytes = 0;
diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c
index 7877d8c..b8dbe9b 100755
--- a/rabin/rabin_dedup.c
+++ b/rabin/rabin_dedup.c
@@ -359,20 +359,20 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
         offset = *size - ctx->rabin_poly_max_block_size;
         length = 0;
         for (i=offset; i<*size; i++) {
-            uchar_t cur_byte = buf1[i];
-            uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
+            int cur_byte = buf1[i];
+            int pushed_out = ctx->current_window_data[ctx->window_pos];
             ctx->current_window_data[ctx->window_pos] = cur_byte;
             cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
             cur_roll_checksum += cur_byte;
             cur_roll_checksum -= out[pushed_out];
-            cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
             ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
             ++length;
             if (length < ctx->rabin_poly_min_block_size) continue;
 
             // If we hit our special value update block offset
+            cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
             if ((cur_pos_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt) {
                 last_offset = i;
                 length = 0;
@@ -385,18 +385,15 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
         return (0);
     }
 
-    j = 0;
-
     for (i=offset; i<*size; i++) {
         uint64_t pc[4];
-        uchar_t cur_byte = buf1[i];
-        uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
+        int cur_byte = buf1[i];
+        int pushed_out = ctx->current_window_data[ctx->window_pos];
         ctx->current_window_data[ctx->window_pos] = cur_byte;
         cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
         cur_roll_checksum += cur_byte;
         cur_roll_checksum -= out[pushed_out];
-        cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
 
         /*
         * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
@@ -408,6 +405,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
         if (length < ctx->rabin_poly_min_block_size) continue;
 
         // If we hit our special value or reached the max block size update block offset
+        cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
         if ((cur_pos_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
             length >= ctx->rabin_poly_max_block_size) {
             if (ctx->blocks[blknum] == 0)
@@ -446,7 +444,6 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
             ++blknum;
             last_offset = i+1;
             length = 0;
-            j = 0;
         }
     }
 
@@ -475,12 +472,11 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
                 ksmallest((int64_t *)ctx_heap, length, &heap);
                 cur_sketch = XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0);
-                ctx->blocks[blknum]->similarity_hash = cur_sketch;
             } else {
                 cur_sketch = XXH32((const uchar_t *)(buf1+last_offset), length, 0);
-                ctx->blocks[blknum]->similarity_hash = cur_sketch;
             }
+            ctx->blocks[blknum]->similarity_hash = cur_sketch;
         }
         ++blknum;
         last_offset = *size;
@@ -556,7 +552,7 @@ process_blocks:
             length = 0;
 
             /*
-             * Look for exact duplicates. Same cksum, length and memcmp()\
+             * Look for exact duplicates. Same cksum, length and memcmp()
             */
             while (1) {
                 if (be->hash == ctx->blocks[i]->hash &&
diff --git a/utils/utils.c b/utils/utils.c
index 5f6e4cb..c729865 100644
--- a/utils/utils.c
+++ b/utils/utils.c
@@ -301,7 +301,7 @@ set_threadcounts(algo_props_t *props, int *nthreads, int nprocs, algo_threads_ty
             props->nthreads++;
         } else if (props->nthreads * (nthreads1+1) <= nprocs &&
                nthreads1 < *nthreads) {
-            nthreads1++;
+            ++nthreads1;
         } else {
             break;
         }
diff --git a/utils/xxhash.c b/utils/xxhash.c
index 23669d0..7042c0d 100644
--- a/utils/xxhash.c
+++ b/utils/xxhash.c
@@ -109,7 +109,15 @@ static inline unsigned int XXH_swap32 (unsigned int x) {
 }
 #endif
 
-
+#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__)
+#include <smmintrin.h>
+static inline __m128i _x_mm_rotl_epi32(const __m128i a, int bits)
+{
+    __m128i tmp1 = _mm_slli_epi32(a, bits);
+    __m128i tmp2 = _mm_srli_epi32(a, 32 - bits);
+    return (_mm_or_si128(tmp1, tmp2));
+}
+#endif
 
 //**************************************
 // Constants
@@ -146,7 +154,84 @@ unsigned int XXH32(const void* input, int len, unsigned int seed)
     const unsigned char* const bEnd = p + len;
     unsigned int h32;
 
-    if (len>=16)
+    if (len>=256)
+    {
+        const unsigned char* const limit = bEnd - 32;
+        unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
+        unsigned int v2 = seed + PRIME32_2;
+        unsigned int v3 = seed + 0;
+        unsigned int v4 = seed - PRIME32_1;
+#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__)
+        unsigned int vx[4], vx1[4];
+
+        __m128i accum = _mm_set_epi32(v4, v3, v2, v1);
+        __m128i accum1 = _mm_set_epi32(v4, v3, v2, v1);
+        __m128i prime1 = _mm_set1_epi32(PRIME32_1);
+        __m128i prime2 = _mm_set1_epi32(PRIME32_2);
+
+        /*
+         * 4-way SIMD calculations with 4 ints in two blocks for 2 accumulators will
+         * interleave to some extent on a hyperthreaded processor providing 10% - 14%
+         * speedup over original xxhash depending on processor. We could have used
+         * aligned loads but we actually want the unaligned penalty. It helps to
+         * interleave better for a slight benefit over aligned loads here!
+         */
+        do {
+            __m128i mem = _mm_loadu_si128((__m128i *)p);
+            p += 16;
+            mem = _mm_mullo_epi32(mem, prime2);
+            accum = _mm_add_epi32(accum, mem);
+            accum = _x_mm_rotl_epi32(accum, 13);
+            accum = _mm_mullo_epi32(accum, prime1);
+
+            mem = _mm_loadu_si128((__m128i *)p);
+            p += 16;
+            mem = _mm_mullo_epi32(mem, prime2);
+            accum1 = _mm_add_epi32(accum1, mem);
+            accum1 = _x_mm_rotl_epi32(accum1, 13);
+            accum1 = _mm_mullo_epi32(accum1, prime1);
+        } while (p<=limit);
+
+        _mm_storeu_si128((__m128i *)vx, accum);
+        _mm_storeu_si128((__m128i *)vx1, accum1);
+
+        /*
+         * Combine the two accumulators into a single hash value.
+         */
+        v1 = vx[0];
+        v2 = vx[1];
+        v3 = vx[2];
+        v4 = vx[3];
+        v1 += vx1[0] * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1;
+        v2 += vx1[1] * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1;
+        v3 += vx1[2] * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1;
+        v4 += vx1[3] * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1;
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+#else
+        unsigned int vx1 = seed + PRIME32_1 + PRIME32_2;
+        unsigned int vx2 = seed + PRIME32_2;
+        unsigned int vx3 = seed + 0;
+        unsigned int vx4 = seed - PRIME32_1;
+
+        do
+        {
+            v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+            v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+            v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+            v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+
+            vx1 += XXH_LE32(p) * PRIME32_2; vx1 = XXH_rotl32(vx1, 13); vx1 *= PRIME32_1; p+=4;
+            vx2 += XXH_LE32(p) * PRIME32_2; vx2 = XXH_rotl32(vx2, 13); vx2 *= PRIME32_1; p+=4;
+            vx3 += XXH_LE32(p) * PRIME32_2; vx3 = XXH_rotl32(vx3, 13); vx3 *= PRIME32_1; p+=4;
+            vx4 += XXH_LE32(p) * PRIME32_2; vx4 = XXH_rotl32(vx4, 13); vx4 *= PRIME32_1; p+=4;
+        } while (p<=limit) ;
+        v1 += vx1 * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1;
+        v2 += vx2 * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1;
+        v3 += vx3 * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1;
+        v4 += vx4 * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1;
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+#endif
+    } else if (len>=16)
     {
         const unsigned char* const limit = bEnd - 16;
         unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
@@ -161,7 +246,6 @@ unsigned int XXH32(const void* input, int len, unsigned int seed)
             v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
             v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
         } while (p<=limit) ;
-
         h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
     }
     else
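
Review note on the utils/xxhash.c hunks: SSE2/SSE4.1 have no packed 32-bit rotate instruction, so the patch emulates XXH_rotl32() with two shifts and an OR in _x_mm_rotl_epi32(); it is _mm_mullo_epi32(), the per-lane 32-bit multiply, that makes the path SSE4.1-only, hence the __SSE4_1__ guard around the <smmintrin.h> include. The following is a minimal, hypothetical self-check, not part of the patch, showing that the shift-and-or emulation matches a plain scalar rotate lane for lane; the file name, helper name and main() harness are illustrative assumptions, and the -msse4.1 build flag mirrors the patch's guard even though the check itself only needs SSE2.

/* rotl_check.c: illustrative only, not part of the patch.
 * Build (assumption): gcc -O2 -msse4.1 rotl_check.c -o rotl_check */
#include <stdio.h>
#include <stdint.h>
#include <smmintrin.h>

static inline uint32_t rotl32(uint32_t x, int bits)
{
    return ((x << bits) | (x >> (32 - bits)));
}

static inline __m128i x_mm_rotl_epi32(const __m128i a, int bits)
{
    __m128i tmp1 = _mm_slli_epi32(a, bits);      /* each lane << bits */
    __m128i tmp2 = _mm_srli_epi32(a, 32 - bits); /* each lane >> (32 - bits) */
    return (_mm_or_si128(tmp1, tmp2));           /* OR the halves: rotate left */
}

int
main(void)
{
    uint32_t in[4] = { 0x01234567, 0x89abcdef, 0xdeadbeef, 0x0badc0de };
    uint32_t res[4];
    int i;

    __m128i v = _mm_loadu_si128((__m128i *)in);
    v = x_mm_rotl_epi32(v, 13);
    _mm_storeu_si128((__m128i *)res, v);

    for (i = 0; i < 4; i++)
        printf("%08x %s\n", res[i],
            res[i] == rotl32(in[i], 13) ? "ok" : "MISMATCH");
    return (0);
}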
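
Review note on the rabin/rabin_dedup.c hunks: the rolling checksum update (multiply, add the incoming byte, subtract the outgoing byte's table entry) still runs for every input byte, but moving the cur_pos_checksum XOR with ir[] below the minimum-block-size test means the extra XOR and the break-pattern mask test are only paid once a chunk is long enough to be cut. The now-unused j = 0 bookkeeping is dropped, and the similarity_hash assignment is hoisted out of the if/else since both branches stored the same cur_sketch. Below is a simplified sketch of the restructured loop shape only; MIN_BLOCK, AVG_MASK, BREAK_PATT and the zeroed out[]/ir[] tables are made-up stand-ins, not pcompress's real values.

/* chunk_cut_sketch.c: illustrative only, not pcompress code. */
#include <stdint.h>
#include <stddef.h>

#define WIN_SIZE   16                /* rolling window size, power of two */
#define POLY_CONST 153191            /* stand-in for RAB_POLYNOMIAL_CONST */
#define POLY_MASK  0xffffffffffULL   /* stand-in for POLY_MASK */
#define MIN_BLOCK  2048              /* stand-in for rabin_poly_min_block_size */
#define AVG_MASK   4095              /* stand-in for rabin_avg_block_mask */
#define BREAK_PATT 0                 /* stand-in for rabin_break_patt */

static uint64_t out[256];            /* outgoing-byte terms (precomputed in pcompress, zeroed here) */
static uint64_t ir[256];             /* irreducible-polynomial mixing table (zeroed here) */

size_t
find_cut(const unsigned char *buf, size_t len)
{
    unsigned char win[WIN_SIZE] = {0};
    uint64_t roll = 0;
    unsigned int wpos = 0;
    size_t i, length = 0;

    for (i = 0; i < len; i++) {
        int cur_byte = buf[i];
        int pushed_out = win[wpos];

        /* The rolling-checksum update is paid on every byte. */
        win[wpos] = cur_byte;
        roll = (roll * POLY_CONST) & POLY_MASK;
        roll += cur_byte;
        roll -= out[pushed_out];
        wpos = (wpos + 1) & (WIN_SIZE - 1);
        ++length;

        /* Chunk still below the minimum size: skip the XOR and the
         * break-pattern test, which is what the patch's reordering buys. */
        if (length < MIN_BLOCK)
            continue;

        uint64_t pos_cksum = roll ^ ir[pushed_out];
        if ((pos_cksum & AVG_MASK) == BREAK_PATT)
            return (i + 1);          /* cut the chunk after this byte */
    }
    return (len);                    /* no break pattern seen; keep the remainder */
}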