From 13d9378acdf543e3f641d96e1a4d5f93498f2b11 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Mon, 31 Dec 2012 11:53:47 +0530 Subject: [PATCH] Update to latest XXHash version. --- rabin/rabin_dedup.c | 15 +- utils/xxhash.c | 388 +++++++++++++++++++++++++++++--------------- utils/xxhash.h | 100 +++++++++--- 3 files changed, 336 insertions(+), 167 deletions(-) diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index c13385b..e3fd2fb 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -328,7 +328,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of ctx->blocks[i]->index = i; // Need to store for sorting ctx->blocks[i]->length = length; ctx->blocks[i]->similar = 0; - ctx->blocks[i]->hash = XXH_fast32(buf1+last_offset, length, 0); + ctx->blocks[i]->hash = XXH32(buf1+last_offset, length, 0); ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash; last_offset += length; } @@ -448,7 +448,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of reset_heap(&heap, pc[ctx->delta_flag]); ksmallest((int32_t *)fplist, j, &heap); ctx->blocks[blknum]->similarity_hash = - XXH_fast32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0); + XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0); memset(fplist, 0, ary_sz); } blknum++; @@ -478,11 +478,11 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of reset_heap(&heap, pc[ctx->delta_flag]); ksmallest((int32_t *)fplist, j, &heap); cur_sketch = - XXH_fast32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0); + XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0); } else { if (j == 0) j = 1; cur_sketch = - XXH_fast32((const uchar_t *)fplist, (j*4)/2, 0); + XXH32((const uchar_t *)fplist, (j*4)/2, 0); } ctx->blocks[blknum]->similarity_hash = cur_sketch; } @@ -516,12 +516,12 @@ process_blocks: */ if (ctx->delta_flag) { for (i=0; iblocks[i]->hash = XXH_fast32(buf1+ctx->blocks[i]->offset, + ctx->blocks[i]->hash = 
XXH32(buf1+ctx->blocks[i]->offset, ctx->blocks[i]->length, 0); } } else { for (i=0; iblocks[i]->hash = XXH_fast32(buf1+ctx->blocks[i]->offset, + ctx->blocks[i]->hash = XXH32(buf1+ctx->blocks[i]->offset, ctx->blocks[i]->length, 0); ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash; } @@ -618,6 +618,9 @@ process_blocks: dedupe_index_sz = (uint64_t)blknum * RABIN_ENTRY_SIZE; if (matchlen < dedupe_index_sz) { + DEBUG_STAT_EN(en = get_wtime_millis()); + DEBUG_STAT_EN(fprintf(stderr, "Chunking speed %.3f MB/s, Overall Dedupe speed %.3f MB/s\n", + get_mb_s(*size, strt, en_1), get_mb_s(*size, strt, en))); DEBUG_STAT_EN(fprintf(stderr, "No Dedupe possible.\n")); ctx->valid = 0; return (0); diff --git a/utils/xxhash.c b/utils/xxhash.c index 5bd8ebc..23669d0 100644 --- a/utils/xxhash.c +++ b/utils/xxhash.c @@ -1,26 +1,3 @@ -/* - * This file is a part of Pcompress, a chunked parallel multi- - * algorithm lossless compression and decompression program. - * - * Copyright (C) 2012 Moinak Ghosh. All rights reserved. - * Use is subject to license terms. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * moinakg@belenix.org, http://moinakg.wordpress.com/ - * - * This program includes partly-modified public domain source - * code from the LZMA SDK: http://www.7-zip.org/sdk.html - */ - /* xxHash - Fast Hash algorithm Copyright (C) 2012, Yann Collet. 
@@ -54,23 +31,82 @@ */ + +//************************************** +// Tuning parameters +//************************************** +// FORCE_NATIVE_FORMAT : +// By default, xxHash library provides endian-independent Hash values. +// Results are therefore identical for big-endian and little-endian CPU. +// This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. +// Should endian-independence be of no importance to your application, you may uncomment the #define below +// It will improve speed for Big-endian CPU. +// This option has no impact on Little_Endian CPU. +//#define FORCE_NATIVE_FORMAT 1 + + + //************************************** // Includes //************************************** +#include <stdlib.h> // for malloc(), free() +#include <string.h> // for memcpy() #include "xxhash.h" //************************************** -// Compiler Options +// CPU Feature Detection //************************************** -#ifdef _MSC_VER // Visual Studio -#define inline __forceinline // Visual is not C99, but supports some kind of inline +// Little Endian or Big Endian ? +// You can overwrite the #define below if you know your architecture endianness +#if defined(FORCE_NATIVE_FORMAT) && (FORCE_NATIVE_FORMAT==1) +// Force native format. The result will be endian-dependent. 
+# define XXH_BIG_ENDIAN 0 +#elif defined (__GLIBC__) +# include <endian.h> +# if (__BYTE_ORDER == __BIG_ENDIAN) +# define XXH_BIG_ENDIAN 1 +# endif +#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) +# define XXH_BIG_ENDIAN 1 +#elif defined(__sparc) || defined(__sparc__) \ + || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \ + || defined(__hpux) || defined(__hppa) \ + || defined(_MIPSEB) || defined(__s390__) +# define XXH_BIG_ENDIAN 1 #endif -// GCC does not support _rotl outside of Windows -#if !defined(_WIN32) -#define _rotl(x,r) ((x << r) | (x >> (32 - r))) +#if !defined(XXH_BIG_ENDIAN) +// Little Endian assumed. PDP Endian and other very rare endian formats are unsupported. +# define XXH_BIG_ENDIAN 0 +#endif + + + +//************************************** +// Compiler-specific Options & Functions +//************************************** +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// Note : under GCC, it may sometimes be faster to enable the (2nd) macro definition, instead of using win32 intrinsic +#if defined(_WIN32) +# define XXH_rotl32(x,r) _rotl(x,r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#endif + +#if defined(_MSC_VER) // Visual Studio +# define XXH_swap32 _byteswap_ulong +#elif GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static inline unsigned int XXH_swap32 (unsigned int x) { + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); + } #endif @@ -78,147 +114,229 @@ //************************************** // Constants //************************************** -#define PRIME1 2654435761U -#define PRIME2 2246822519U -#define PRIME3 3266489917U -#define PRIME4 668265263U 
-#define PRIME5 0x165667b1 +#define PRIME32_1 2654435761U +#define PRIME32_2 2246822519U +#define PRIME32_3 3266489917U +#define PRIME32_4 668265263U +#define PRIME32_5 374761393U + + + +//************************************** +// Macros +//************************************** +#define XXH_LE32(p) (XXH_BIG_ENDIAN ? XXH_swap32(*(unsigned int*)(p)) : *(unsigned int*)(p)) //**************************** -// Private functions +// Simple Hash Functions //**************************** -// This version is for very small inputs (< 16 bytes) -inline unsigned int XXH_small(const void* key, int len, unsigned int seed) +unsigned int XXH32(const void* input, int len, unsigned int seed) { - const unsigned char* p = (unsigned char*)key; - const unsigned char* const bEnd = p + len; - unsigned int idx = seed + PRIME1; - unsigned int crc = PRIME5; - const unsigned char* const limit = bEnd - 4; +#if 0 + // Simple version, good for code maintenance, but unfortunately slow for small inputs + void* state = XXH32_init(seed); + XXH32_feed(state, input, len); + return XXH32_result(state); +#else - while (p=16) { - crc += ((*(unsigned int*)p) + idx++); - crc += _rotl(crc, 17) * PRIME4; - crc *= PRIME1; + const unsigned char* const limit = bEnd - 16; + unsigned int v1 = seed + PRIME32_1 + PRIME32_2; + unsigned int v2 = seed + PRIME32_2; + unsigned int v3 = seed + 0; + unsigned int v4 = seed - PRIME32_1; + + do + { + v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; + v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; + v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; + v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; + } while (p<=limit) ; + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } + else + { + h32 = seed + PRIME32_5; + } + + h32 += (unsigned int) len; + + while (p<=bEnd-4) + { + h32 += XXH_LE32(p) * PRIME32_3; + h32 = 
XXH_rotl32(h32, 17) * PRIME32_4 ; p+=4; } while (p> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; - crc ^= crc >> 15; - crc *= PRIME2; - crc ^= crc >> 13; - crc *= PRIME3; - crc ^= crc >> 16; + return h32; - return crc; +#endif } +//**************************** +// Advanced Hash Functions +//**************************** -//****************************** -// Hash functions -//****************************** -unsigned int XXH_fast32(const void* input, int len, unsigned int seed) +struct XXH_state32_t { - // Special case, for small inputs - if (len < 16) return XXH_small(input, len, seed); + unsigned int seed; + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + unsigned long long total_len; + char memory[16]; + int memsize; +}; + +void* XXH32_init (unsigned int seed) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) malloc ( sizeof(struct XXH_state32_t)); + state->seed = seed; + state->v1 = seed + PRIME32_1 + PRIME32_2; + state->v2 = seed + PRIME32_2; + state->v3 = seed + 0; + state->v4 = seed - PRIME32_1; + state->total_len = 0; + state->memsize = 0; + + return (void*)state; +} + + +int XXH32_feed (void* state_in, const void* input, int len) +{ + struct XXH_state32_t * state = state_in; + const unsigned char* p = (const unsigned char*)input; + const unsigned char* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 16) // fill in tmp buffer { - const unsigned char* p = (const unsigned char*)input; - const unsigned char* const bEnd = p + len; - unsigned int v1 = seed + PRIME1; - unsigned int v2 = v1 * PRIME2 + len; - unsigned int v3 = v2 * PRIME3; - unsigned int v4 = v3 * PRIME4; - const unsigned char* const limit = bEnd - 16; - unsigned int crc; - - while (p> 11; - crc += (PRIME4+len) * PRIME1; - crc ^= crc >> 15; - crc *= PRIME2; - crc ^= crc >> 13; - - return crc; + memcpy(state->memory + state->memsize, input, len); + state->memsize += len; + return 0; } -} - - 
- -unsigned int XXH_strong32(const void* input, int len, unsigned int seed) -{ - // Special case, for small inputs - if (len < 16) return XXH_small(input, len, seed); - + if (state->memsize) // some data left from previous feed { - const unsigned char* p = (const unsigned char*)input; - const unsigned char* const bEnd = p + len; - unsigned int v1 = seed + PRIME1; - unsigned int v2 = v1 * PRIME2 + len; - unsigned int v3 = v2 * PRIME3; - unsigned int v4 = v3 * PRIME4; - const unsigned char* const limit = bEnd - 16; - unsigned int crc; - - while (pmemory + state->memsize, input, 16-state->memsize); { - v1 += _rotl(v1, 13); v1 *= PRIME1; v1 += (*(unsigned int*)p); p+=4; - v2 += _rotl(v2, 11); v2 *= PRIME1; v2 += (*(unsigned int*)p); p+=4; - v3 += _rotl(v3, 17); v3 *= PRIME1; v3 += (*(unsigned int*)p); p+=4; - v4 += _rotl(v4, 19); v4 *= PRIME1; v4 += (*(unsigned int*)p); p+=4; - } - - p = bEnd - 16; - v1 += _rotl(v1, 17); v2 += _rotl(v2, 19); v3 += _rotl(v3, 13); v4 += _rotl(v4, 11); - v1 *= PRIME1; v2 *= PRIME1; v3 *= PRIME1; v4 *= PRIME1; - v1 += *(unsigned int*)p; p+=4; v2 += *(unsigned int*)p; p+=4; v3 += *(unsigned int*)p; p+=4; v4 += *(unsigned int*)p; // p+=4; - v1 *= PRIME2; v2 *= PRIME2; v3 *= PRIME2; v4 *= PRIME2; - v1 += _rotl(v1, 11); v2 += _rotl(v2, 17); v3 += _rotl(v3, 19); v4 += _rotl(v4, 13); - v1 *= PRIME3; v2 *= PRIME3; v3 *= PRIME3; v4 *= PRIME3; - - crc = v1 + _rotl(v2, 3) + _rotl(v3, 6) + _rotl(v4, 9); - crc ^= crc >> 11; - crc += (PRIME4+len) * PRIME1; - crc ^= crc >> 15; - crc *= PRIME2; - crc ^= crc >> 13; - - return crc; + const unsigned int* p32 = (const unsigned int*)state->memory; + state->v1 += XXH_LE32(p32) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++; + state->v2 += XXH_LE32(p32) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; + state->v3 += XXH_LE32(p32) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++; + state->v4 += XXH_LE32(p32) * 
PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++; + } + p += 16-state->memsize; + state->memsize = 0; } + { + const unsigned char* const limit = bEnd - 16; + unsigned int v1 = state->v1; + unsigned int v2 = state->v2; + unsigned int v3 = state->v3; + unsigned int v4 = state->v4; + + while (p<=limit) + { + v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; + v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; + v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; + v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; + } + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) + { + memcpy(state->memory, p, bEnd-p); + state->memsize = bEnd-p; + } + + return 0; } +unsigned int XXH32_getIntermediateResult (void* state_in) +{ + struct XXH_state32_t * state = state_in; + unsigned char * p = (unsigned char*)state->memory; + unsigned char* bEnd = (unsigned char*)state->memory + state->memsize; + unsigned int h32; + if (state->total_len >= 16) + { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); + } + else + { + h32 = state->seed + PRIME32_5; + } + h32 += (unsigned int) state->total_len; + + while (p<=bEnd-4) + { + h32 += XXH_LE32(p) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + p+=4; + } + + while (p> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +unsigned int XXH32_result (void* state_in) +{ + unsigned int h32 = XXH32_getIntermediateResult(state_in); + + free(state_in); + + return h32; +} diff --git a/utils/xxhash.h b/utils/xxhash.h index 3cf8803..d253fa0 100644 --- a/utils/xxhash.h +++ b/utils/xxhash.h @@ -1,23 +1,3 @@ -/* - * This file is a part of Pcompress, a chunked parallel multi- - * algorithm lossless compression and decompression program. 
- * - * Copyright (C) 2012 Moinak Ghosh. All rights reserved. - * Use is subject to license terms. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * moinakg@belenix.org, http://moinakg.wordpress.com/ - */ - /* xxHash - Fast Hash algorithm Header File @@ -50,6 +30,33 @@ You can contact the author at : - xxHash source repository : http://code.google.com/p/xxhash/ */ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MurmurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing the SMHasher test set. +10 is a perfect score. 
+*/ + #pragma once #if defined (__cplusplus) @@ -58,19 +65,60 @@ extern "C" { //**************************** -// Hash Functions +// Simple Hash Functions //**************************** -unsigned int XXH_fast32 (const void* input, int len, unsigned int seed); -unsigned int XXH_strong32(const void* input, int len, unsigned int seed); +unsigned int XXH32 (const void* input, int len, unsigned int seed); /* -XXH_fast32() : +XXH32() : Calculate the 32-bits hash of "input", of length "len" "seed" can be used to alter the result + This function successfully passes all SMHasher tests. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s + Note that "len" is type "int", which means it is limited to 2^31-1. + If your data is larger, use the advanced functions below. +*/ -XXH_strong32() : - Same as XXH_fast(), but the resulting hash has stronger properties + + +//**************************** +// Advanced Hash Functions +//**************************** + +void* XXH32_init (unsigned int seed); +int XXH32_feed (void* state, const void* input, int len); +unsigned int XXH32_result (void* state); + +/* +These functions calculate the xxhash of an input provided in several small packets, +as opposed to an input provided as a single block. + +You must start with : +void* XXH32_init() +The function returns a pointer which holds the state of calculation. + +This pointer must be provided as "void* state" parameter for XXH32_feed(). +XXH32_feed() can be called as many times as necessary. +The function returns an error code, with 0 meaning OK, and all other values meaning there is an error. +Note that "len" is type "int", which means it is limited to 2^31-1. +If your data is larger, it is recommended +to chunk your data into blocks of size 2^30 (1GB) to avoid any "int" overflow issue. + +Finally, you can end the calculation anytime, by using XXH32_result(). +This function returns the final 32-bits hash. 
+You must provide the same "void* state" parameter created by XXH32_init(). + +Memory will be freed by XXH32_result(). +*/ + + +unsigned int XXH32_getIntermediateResult (void* state); +/* +This function does the same as XXH32_result(), generating a 32-bit hash, +but preserves the memory context. +This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_feed(). +To free the memory context, use XXH32_result(). */