diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c
index c13385b..e3fd2fb 100755
--- a/rabin/rabin_dedup.c
+++ b/rabin/rabin_dedup.c
@@ -328,7 +328,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 			ctx->blocks[i]->index = i; // Need to store for sorting
 			ctx->blocks[i]->length = length;
 			ctx->blocks[i]->similar = 0;
-			ctx->blocks[i]->hash = XXH_fast32(buf1+last_offset, length, 0);
+			ctx->blocks[i]->hash = XXH32(buf1+last_offset, length, 0);
 			ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash;
 			last_offset += length;
 		}
@@ -448,7 +448,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 				reset_heap(&heap, pc[ctx->delta_flag]);
 				ksmallest((int32_t *)fplist, j, &heap);
 				ctx->blocks[blknum]->similarity_hash =
-					XXH_fast32((const uchar_t *)fplist,  pc[ctx->delta_flag]*4, 0);
+					XXH32((const uchar_t *)fplist,  pc[ctx->delta_flag]*4, 0);
 				memset(fplist, 0, ary_sz);
 			}
 			blknum++;
@@ -478,11 +478,11 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
 				reset_heap(&heap, pc[ctx->delta_flag]);
 				ksmallest((int32_t *)fplist, j, &heap);
 				cur_sketch =
-				    XXH_fast32((const uchar_t *)fplist,  pc[ctx->delta_flag]*4, 0);
+				    XXH32((const uchar_t *)fplist,  pc[ctx->delta_flag]*4, 0);
 			} else {
 				if (j == 0) j = 1;
 				cur_sketch =
-				    XXH_fast32((const uchar_t *)fplist, (j*4)/2, 0);
+				    XXH32((const uchar_t *)fplist, (j*4)/2, 0);
 			}
 			ctx->blocks[blknum]->similarity_hash = cur_sketch;
 		}
@@ -516,12 +516,12 @@ process_blocks:
 		 */
 		if (ctx->delta_flag) {
 			for (i=0; i<blknum; i++) {
-				ctx->blocks[i]->hash = XXH_fast32(buf1+ctx->blocks[i]->offset,
+				ctx->blocks[i]->hash = XXH32(buf1+ctx->blocks[i]->offset,
 								    ctx->blocks[i]->length, 0);
 			}
 		} else {
 			for (i=0; i<blknum; i++) {
-				ctx->blocks[i]->hash = XXH_fast32(buf1+ctx->blocks[i]->offset,
+				ctx->blocks[i]->hash = XXH32(buf1+ctx->blocks[i]->offset,
 								    ctx->blocks[i]->length, 0);
 				ctx->blocks[i]->similarity_hash = ctx->blocks[i]->hash;
 			}
@@ -618,6 +618,9 @@ process_blocks:
 
 		dedupe_index_sz = (uint64_t)blknum * RABIN_ENTRY_SIZE;
 		if (matchlen < dedupe_index_sz) {
+			DEBUG_STAT_EN(en = get_wtime_millis());
+			DEBUG_STAT_EN(fprintf(stderr, "Chunking speed %.3f MB/s, Overall Dedupe speed %.3f MB/s\n",
+					      get_mb_s(*size, strt, en_1), get_mb_s(*size, strt, en)));
 			DEBUG_STAT_EN(fprintf(stderr, "No Dedupe possible.\n"));
 			ctx->valid = 0;
 			return (0);
diff --git a/utils/xxhash.c b/utils/xxhash.c
index 5bd8ebc..23669d0 100644
--- a/utils/xxhash.c
+++ b/utils/xxhash.c
@@ -1,26 +1,3 @@
-/*
- * This file is a part of Pcompress, a chunked parallel multi-
- * algorithm lossless compression and decompression program.
- *
- * Copyright (C) 2012 Moinak Ghosh. All rights reserved.
- * Use is subject to license terms.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 3 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * moinakg@belenix.org, http://moinakg.wordpress.com/
- *      
- * This program includes partly-modified public domain source
- * code from the LZMA SDK: http://www.7-zip.org/sdk.html
- */
-
 /*
    xxHash - Fast Hash algorithm
    Copyright (C) 2012, Yann Collet.
@@ -54,23 +31,82 @@
 */
 
 
+
+//**************************************
+// Tuning parameters
+//**************************************
+// FORCE_NATIVE_FORMAT :
+// By default, the xxHash library produces endian-independent hash values.
+// Results are therefore identical on big-endian and little-endian CPUs.
+// This comes at a performance cost on big-endian CPUs, since some byte swapping is required to emulate the little-endian format.
+// Should endian-independence be of no importance to your application, you may uncomment the #define below.
+// It will improve speed on big-endian CPUs.
+// This option has no impact on little-endian CPUs.
+//#define FORCE_NATIVE_FORMAT 1
+
+
+
 //**************************************
 // Includes
 //**************************************
+#include <stdlib.h>    // for malloc(), free()
+#include <string.h>    // for memcpy()
 #include "xxhash.h"
 
 
 
 //**************************************
-// Compiler Options
+// CPU Feature Detection
 //**************************************
-#ifdef _MSC_VER              // Visual Studio
-#define inline __forceinline // Visual is not C99, but supports some kind of inline
+// Little Endian or Big Endian ?
+// You can override the #define below if you know your architecture's endianness
+#if defined(FORCE_NATIVE_FORMAT) && (FORCE_NATIVE_FORMAT==1)
+// Force native format. The result will be endian-dependent.
+#  define XXH_BIG_ENDIAN 0
+#elif defined (__GLIBC__)
+#  include <endian.h>
+#  if (__BYTE_ORDER == __BIG_ENDIAN)
+#     define XXH_BIG_ENDIAN 1
+#  endif
+#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
+#  define XXH_BIG_ENDIAN 1
+#elif defined(__sparc) || defined(__sparc__) \
+   || defined(__ppc__) || defined(_POWER) || defined(__powerpc__) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__PPC) || defined(PPC) || defined(__powerpc__) || defined(__powerpc) || defined(powerpc) \
+   || defined(__hpux)  || defined(__hppa) \
+   || defined(_MIPSEB) || defined(__s390__)
+#  define XXH_BIG_ENDIAN 1
 #endif
 
-// GCC does not support _rotl outside of Windows
-#if !defined(_WIN32)
-#define _rotl(x,r) ((x << r) | (x >> (32 - r)))
+#if !defined(XXH_BIG_ENDIAN)
+// Little-endian assumed. PDP-endian and other very rare endian formats are unsupported.
+#  define XXH_BIG_ENDIAN 0
+#endif
+
+
+
+//**************************************
+// Compiler-specific Options & Functions
+//**************************************
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+// Note : under GCC, it may sometimes be faster to enable the (2nd) macro definition, instead of using win32 intrinsic
+#if defined(_WIN32)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#else
+#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#endif
+
+#if defined(_MSC_VER)     // Visual Studio
+#  define XXH_swap32 _byteswap_ulong
+#elif GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static inline unsigned int XXH_swap32 (unsigned int x) {
+                        return  ((x << 24) & 0xff000000 ) |
+                                ((x <<  8) & 0x00ff0000 ) |
+                                ((x >>  8) & 0x0000ff00 ) |
+                                ((x >> 24) & 0x000000ff );
+                 }
 #endif
 
 
@@ -78,147 +114,229 @@
 //**************************************
 // Constants
 //**************************************
-#define PRIME1   2654435761U
-#define PRIME2   2246822519U
-#define PRIME3   3266489917U
-#define PRIME4    668265263U
-#define PRIME5   0x165667b1
+#define PRIME32_1   2654435761U
+#define PRIME32_2   2246822519U
+#define PRIME32_3   3266489917U
+#define PRIME32_4    668265263U
+#define PRIME32_5    374761393U
+
+
+
+//**************************************
+// Macros
+//**************************************
+#define XXH_LE32(p)  (XXH_BIG_ENDIAN ? XXH_swap32(*(unsigned int*)(p)) : *(unsigned int*)(p))
 
 
 
 //****************************
-// Private functions
+// Simple Hash Functions
 //****************************
 
-// This version is for very small inputs (< 16  bytes)
-inline unsigned int XXH_small(const void* key, int len, unsigned int seed)
+unsigned int XXH32(const void* input, int len, unsigned int seed)
 {
-	const unsigned char* p = (unsigned char*)key;
-	const unsigned char* const bEnd = p + len;
-	unsigned int idx = seed + PRIME1;
-	unsigned int crc = PRIME5;
-	const unsigned char* const limit = bEnd - 4;
+#if 0
+	// Simple version, good for code maintenance, but unfortunately slow for small inputs
+	void* state = XXH32_init(seed);
+	XXH32_feed(state, input, len);
+	return XXH32_result(state);
+#else
 
-	while (p<limit)
+	const unsigned char* p = (const unsigned char*)input;
+	const unsigned char* const bEnd = p + len;
+	unsigned int h32;
+
+	if (len>=16)
 	{
-		crc += ((*(unsigned int*)p) + idx++);
-		crc += _rotl(crc, 17) * PRIME4;
-		crc *= PRIME1;
+		const unsigned char* const limit = bEnd - 16;
+		unsigned int v1 = seed + PRIME32_1 + PRIME32_2;
+		unsigned int v2 = seed + PRIME32_2;
+		unsigned int v3 = seed + 0;
+		unsigned int v4 = seed - PRIME32_1;
+
+		do
+		{
+			v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+			v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+		} while (p<=limit) ;
+
+		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+	}
+	else
+	{
+		h32  = seed + PRIME32_5;
+	}
+
+	h32 += (unsigned int) len;
+	
+	while (p<=bEnd-4)
+	{
+		h32 += XXH_LE32(p) * PRIME32_3;
+		h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
 		p+=4;
 	}
 
 	while (p<bEnd)
 	{
-		crc += ((*p) + idx++);
-		crc *= PRIME1;
+		h32 += (*p) * PRIME32_5;
+		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
 		p++;
 	}
 
-	crc += len;
+	h32 ^= h32 >> 15;
+	h32 *= PRIME32_2;
+	h32 ^= h32 >> 13;
+	h32 *= PRIME32_3;
+	h32 ^= h32 >> 16;
 
-	crc ^= crc >> 15;
-	crc *= PRIME2;
-	crc ^= crc >> 13;
-	crc *= PRIME3;
-	crc ^= crc >> 16;
+	return h32;
 
-	return crc;
+#endif
 }
 
 
+//****************************
+// Advanced Hash Functions
+//****************************
 
-//******************************
-// Hash functions
-//******************************
-unsigned int XXH_fast32(const void* input, int len, unsigned int seed)
+struct XXH_state32_t
 {
-	// Special case, for small inputs
-	if (len < 16) return XXH_small(input, len, seed);
+	unsigned int seed;
+	unsigned int v1;
+	unsigned int v2;
+	unsigned int v3;
+	unsigned int v4;
+	unsigned long long total_len;
+	char memory[16];
+	int memsize;
+};
 
+
+void* XXH32_init (unsigned int seed)   // Creates a streaming hash state for 'seed'; returns NULL if allocation fails.
+{
+	struct XXH_state32_t * state = malloc (sizeof *state);   // ownership passes to the caller; XXH32_result() frees it
+	if (state == NULL) return NULL;   // out of memory: report it instead of writing through a NULL pointer
+	state->seed = seed;
+	state->v1 = seed + PRIME32_1 + PRIME32_2;   // the four lanes start from the same values XXH32() uses
+	state->v2 = seed + PRIME32_2;
+	state->v3 = seed + 0;
+	state->v4 = seed - PRIME32_1;
+	state->total_len = 0;
+	state->memsize = 0;   // no bytes buffered yet
+	return (void*)state;
+}
+
+
+int   XXH32_feed (void* state_in, const void* input, int len)
+{
+	struct XXH_state32_t * state = state_in;
+	const unsigned char* p = (const unsigned char*)input;
+	const unsigned char* const bEnd = p + len;
+
+	state->total_len += len;
+	
+	if (state->memsize + len < 16)   // fill in tmp buffer
 	{
-		const unsigned char* p = (const unsigned char*)input;
-		const unsigned char* const bEnd = p + len;
-		unsigned int v1 = seed + PRIME1;
-		unsigned int v2 = v1 * PRIME2 + len;
-		unsigned int v3 = v2 * PRIME3;
-		unsigned int v4 = v3 * PRIME4;	
-		const unsigned char* const limit = bEnd - 16;
-		unsigned int crc;
-
-		while (p<limit)
-		{
-			v1 = _rotl(v1, 13) + (*(unsigned int*)p); p+=4;
-			v2 = _rotl(v2, 11) + (*(unsigned int*)p); p+=4;
-			v3 = _rotl(v3, 17) + (*(unsigned int*)p); p+=4;
-			v4 = _rotl(v4, 19) + (*(unsigned int*)p); p+=4;
-		} 
-
-		p = bEnd - 16;
-		v1 += _rotl(v1, 17); v2 += _rotl(v2, 19); v3 += _rotl(v3, 13); v4 += _rotl(v4, 11); 
-		v1 *= PRIME1; v2 *= PRIME1; v3 *= PRIME1; v4 *= PRIME1; 
-		v1 += *(unsigned int*)p; p+=4; v2 += *(unsigned int*)p; p+=4; v3 += *(unsigned int*)p; p+=4; v4 += *(unsigned int*)p;   // p+=4;
-		v1 *= PRIME2; v2 *= PRIME2; v3 *= PRIME2; v4 *= PRIME2; 
-		v1 += _rotl(v1, 11); v2 += _rotl(v2, 17); v3 += _rotl(v3, 19); v4 += _rotl(v4, 13); 
-		v1 *= PRIME3; v2 *= PRIME3; v3 *= PRIME3; v4 *= PRIME3;
-
-		crc = v1 + _rotl(v2, 3) + _rotl(v3, 6) + _rotl(v4, 9);
-		crc ^= crc >> 11;
-		crc += (PRIME4+len) * PRIME1;
-		crc ^= crc >> 15;
-		crc *= PRIME2;
-		crc ^= crc >> 13;
-
-		return crc;
+		memcpy(state->memory + state->memsize, input, len);
+		state->memsize +=  len;
+		return 0;
 	}
 
-}
-
-
-
-unsigned int XXH_strong32(const void* input, int len, unsigned int seed)
-{
-	// Special case, for small inputs
-	if (len < 16) return XXH_small(input, len, seed);
-
+	if (state->memsize)   // some data left from previous feed
 	{
-		const unsigned char* p = (const unsigned char*)input;
-		const unsigned char* const bEnd = p + len;
-		unsigned int v1 = seed + PRIME1;
-		unsigned int v2 = v1 * PRIME2 + len;
-		unsigned int v3 = v2 * PRIME3;
-		unsigned int v4 = v3 * PRIME4;	
-		const unsigned char* const limit = bEnd - 16;
-		unsigned int crc;
-
-		while (p<limit)
+		memcpy(state->memory + state->memsize, input, 16-state->memsize);
 		{
-			v1 += _rotl(v1, 13); v1 *= PRIME1; v1 += (*(unsigned int*)p); p+=4;
-			v2 += _rotl(v2, 11); v2 *= PRIME1; v2 += (*(unsigned int*)p); p+=4;
-			v3 += _rotl(v3, 17); v3 *= PRIME1; v3 += (*(unsigned int*)p); p+=4;
-			v4 += _rotl(v4, 19); v4 *= PRIME1; v4 += (*(unsigned int*)p); p+=4;
-		} 
-
-		p = bEnd - 16;
-		v1 += _rotl(v1, 17); v2 += _rotl(v2, 19); v3 += _rotl(v3, 13); v4 += _rotl(v4, 11); 
-		v1 *= PRIME1; v2 *= PRIME1; v3 *= PRIME1; v4 *= PRIME1; 
-		v1 += *(unsigned int*)p; p+=4; v2 += *(unsigned int*)p; p+=4; v3 += *(unsigned int*)p; p+=4; v4 += *(unsigned int*)p;   // p+=4;
-		v1 *= PRIME2; v2 *= PRIME2; v3 *= PRIME2; v4 *= PRIME2; 
-		v1 += _rotl(v1, 11); v2 += _rotl(v2, 17); v3 += _rotl(v3, 19); v4 += _rotl(v4, 13); 
-		v1 *= PRIME3; v2 *= PRIME3; v3 *= PRIME3; v4 *= PRIME3;
-
-		crc = v1 + _rotl(v2, 3) + _rotl(v3, 6) + _rotl(v4, 9);
-		crc ^= crc >> 11;
-		crc += (PRIME4+len) * PRIME1;
-		crc ^= crc >> 15;
-		crc *= PRIME2;
-		crc ^= crc >> 13;
-
-		return crc;
+			const unsigned int* p32 = (const unsigned int*)state->memory;
+			state->v1 += XXH_LE32(p32) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
+			state->v2 += XXH_LE32(p32) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; 
+			state->v3 += XXH_LE32(p32) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++;
+			state->v4 += XXH_LE32(p32) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++;
+		}
+		p += 16-state->memsize;
+		state->memsize = 0;
 	}
 
+	{
+		const unsigned char* const limit = bEnd - 16;
+		unsigned int v1 = state->v1;
+		unsigned int v2 = state->v2;
+		unsigned int v3 = state->v3;
+		unsigned int v4 = state->v4;
+
+		while (p<=limit)
+		{
+			v1 += XXH_LE32(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+			v2 += XXH_LE32(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+			v3 += XXH_LE32(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+			v4 += XXH_LE32(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+		}  
+
+		state->v1 = v1;
+		state->v2 = v2;
+		state->v3 = v3;
+		state->v4 = v4;
+	}
+
+	if (p < bEnd)
+	{
+		memcpy(state->memory, p, bEnd-p);
+		state->memsize = bEnd-p;
+	}
+
+	return 0;
 }
 
 
+unsigned int XXH32_getIntermediateResult (void* state_in)
+{
+	struct XXH_state32_t * state = state_in;
+	unsigned char * p   = (unsigned char*)state->memory;
+	unsigned char* bEnd = (unsigned char*)state->memory + state->memsize;
+	unsigned int h32;
 
 
+	if (state->total_len >= 16)
+	{
+		h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+	}
+	else
+	{
+		h32  = state->seed + PRIME32_5;
+	}
 
+	h32 += (unsigned int) state->total_len;
+	
+	while (p<=bEnd-4)
+	{
+		h32 += XXH_LE32(p) * PRIME32_3;
+		h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
+		p+=4;
+	}
+
+	while (p<bEnd)
+	{
+		h32 += (*p) * PRIME32_5;
+		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+		p++;
+	}
+
+	h32 ^= h32 >> 15;
+	h32 *= PRIME32_2;
+	h32 ^= h32 >> 13;
+	h32 *= PRIME32_3;
+	h32 ^= h32 >> 16;
+
+	return h32;
+}
+
+
+unsigned int XXH32_result (void* state_in)
+{
+	/* Read the digest first: the state is released below and must not be used afterwards. */
+	const unsigned int digest = XXH32_getIntermediateResult(state_in);
+
+	free(state_in);
+	return digest;
+}
diff --git a/utils/xxhash.h b/utils/xxhash.h
index 3cf8803..d253fa0 100644
--- a/utils/xxhash.h
+++ b/utils/xxhash.h
@@ -1,23 +1,3 @@
-/*
- * This file is a part of Pcompress, a chunked parallel multi-
- * algorithm lossless compression and decompression program.
- *
- * Copyright (C) 2012 Moinak Ghosh. All rights reserved.
- * Use is subject to license terms.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 3 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * moinakg@belenix.org, http://moinakg.wordpress.com/
- */
-
 /*
    xxHash - Fast Hash algorithm
    Header File
@@ -50,6 +30,33 @@
 	You can contact the author at :
 	- xxHash source repository : http://code.google.com/p/xxhash/
 */
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function. 
+It depends on successfully passing SMHasher test set. 
+10 is a perfect score.
+*/
+
 #pragma once
 
 #if defined (__cplusplus)
@@ -58,19 +65,60 @@ extern "C" {
 
 
 //****************************
-// Hash Functions
+// Simple Hash Functions
 //****************************
 
-unsigned int XXH_fast32  (const void* input, int len, unsigned int seed);
-unsigned int XXH_strong32(const void* input, int len, unsigned int seed);
+unsigned int XXH32 (const void* input, int len, unsigned int seed);
 
 /*
-XXH_fast32() :
+XXH32() :
 	Calculate the 32-bits hash of "input", of length "len"
 	"seed" can be used to alter the result
+	This function successfully passes all SMHasher tests.
+	Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+	Note that "len" is type "int", which means it is limited to 2^31-1.
+	If your data is larger, use the advanced functions below.
+*/
 
-XXH_strong32() :
-	Same as XXH_fast(), but the resulting hash has stronger properties
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+void*        XXH32_init   (unsigned int seed);
+int          XXH32_feed   (void* state, const void* input, int len);
+unsigned int XXH32_result (void* state);
+
+/*
+These functions calculate the xxhash of an input provided in several small packets,
+as opposed to an input provided as a single block.
+
+You must start with :
+void* XXH32_init()
+The function returns a pointer which holds the state of calculation.
+
+This pointer must be provided as "void* state" parameter for XXH32_feed().
+XXH32_feed() can be called as many times as necessary.
+The function returns an error code, with 0 meaning OK, and all other values meaning there is an error.
+Note that "len" is type "int", which means it is limited to 2^31-1. 
+If your data is larger, it is recommended
+to chunk your data into blocks of size 2^30 (1GB) to avoid any "int" overflow issue.
+
+Finally, you can end the calculation anytime, by using XXH32_result().
+This function returns the final 32-bits hash.
+You must provide the same "void* state" parameter created by XXH32_init().
+
+Memory will be freed by XXH32_result().
+*/
+
+
+unsigned int XXH32_getIntermediateResult (void* state);
+/*
+This function does the same as XXH32_result(), generating a 32-bit hash,
+but preserves the memory context.
+This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_feed().
+To free memory context, use XXH32_result().
 */