Use 2-stage Merkle Tree hashing for parallel hashes for better crypto properties.

Update xxhash comment.
2013-02-01 22:07:28 +05:30 · 2013-02-01 22:07:28 +05:30 · 6bfd044311
commit 6bfd044311
parent af4c6e1d84
3 changed files with 323 additions and 139 deletions
--- a/crypto/sha2_utils.c
+++ b/crypto/sha2_utils.c
@ -30,9 +30,21 @@
 #endif
 #include <utils.h>

+#define	BLKSZ		(2048)
+
 /*
 * Helper functions for single-call SHA2 hashing. Both serial and
- * parallel versions are provided.
+ * parallel versions are provided. Parallel versions use 2-stage
+ * Merkle Tree hashing.
+ * 
+ * At the leaf level data is split into BLKSZ blocks and 4 threads
+ * compute 4 hashes of interleaved block streams. At 2nd level two
+ * new hashes are generated from hashing the 2 pairs of hash values.
+ * In the final stage the 2 hash values are hashed to the final digest.
+ * 
+ * References:
+ * http://eprint.iacr.org/2012/476.pdf
+ * http://gva.noekeon.org/papers/bdpv09tree.html
 */
 void
 ossl_SHA256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
@ -47,44 +59,64 @@ ossl_SHA256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 void
 ossl_SHA256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][32];
-	int i;
-	SHA256_CTX *mctx;
+	uchar_t cksum[6][32];
+	SHA256_CTX ctx[4];
+	int i, rem;
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
-		mctx = (SHA256_CTX *)malloc(sizeof (SHA256_CTX));
-		SHA256_Init(mctx);
-		SHA256_Update(mctx, buf, bytes);
-		SHA256_Final(cksum_buf, mctx);
-		free(mctx);
+	if (bytes <= BLKSZ * 2) {
+		SHA256_Init(&ctx[0]);
+		SHA256_Update(&ctx[0], buf, bytes);
+		SHA256_Final(cksum_buf, &ctx[0]);
 		return;
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	buf += bytes/2;
-	pos[1] = buf;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Leaf level: 4 interleaved block streams; stream 0 also takes the tail.
+	 */
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 #	pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		SHA256_CTX ctx;
-		SHA256_Init(&ctx);
-		SHA256_Update(&ctx, pos[i], len[i]);
-		SHA256_Final(cksum[i], &ctx);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		SHA256_Init(&ctx[i]);
+		while (byt < _bytes) {
+			SHA256_Update(&ctx[i], buf + byt, BLKSZ);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			SHA256_Final(cksum[i], &ctx[i]);
 	}
-	mctx = (SHA256_CTX *)malloc(sizeof (SHA256_CTX));
-	SHA256_Init(mctx);
-	SHA256_Update(mctx, cksum, 2 * 32);
-	SHA256_Final(cksum_buf, mctx);
-	free(mctx);
+	if (rem > 0) {
+		SHA256_Update(&ctx[0], buf + bytes - rem, rem);
+	}
+	SHA256_Final(cksum[0], &ctx[0]);
+
+	/*
+	 * Second level: hash digest pairs (0,1) and (2,3) into rows 4 and 5.
+	 */
+	SHA256_Init(&ctx[0]);
+	SHA256_Init(&ctx[1]);
+	SHA256_Update(&ctx[0], &cksum[0], 2 * 32);
+	SHA256_Update(&ctx[1], &cksum[2], 2 * 32);
+	SHA256_Final(cksum[4], &ctx[0]);
+	SHA256_Final(cksum[5], &ctx[1]);
+
+	/*
+	 * Final hash over the two second-level digests (rows 4 and 5).
+	 */
+	SHA256_Init(&ctx[0]);
+	SHA256_Update(&ctx[0], &cksum[4], 2 * 32);
+	SHA256_Final(cksum_buf, &ctx[0]);
 }

 void
@ -100,43 +132,64 @@ ossl_SHA512(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 void
 ossl_SHA512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][64];
-	int i;
-	SHA512_CTX *mctx;
+	uchar_t cksum[6][64];
+	SHA512_CTX ctx[4];
+	int i, rem;
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
-	 * If not then just do a simple hashing.
+	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
-		mctx = (SHA512_CTX *)malloc(sizeof (SHA512_CTX));
-		SHA512_Init(mctx);
-		SHA512_Update(mctx, buf, bytes);
-		SHA512_Final(cksum_buf, mctx);
-		free(mctx);
+	if (bytes <= BLKSZ * 2) {
+		SHA512_Init(&ctx[0]);
+		SHA512_Update(&ctx[0], buf, bytes);
+		SHA512_Final(cksum_buf, &ctx[0]);
 		return;
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Leaf level: 4 interleaved block streams; stream 0 also takes the tail.
+	 */
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 #	pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		SHA512_CTX ctx;
-		SHA512_Init(&ctx);
-		SHA512_Update(&ctx, pos[i], len[i]);
-		SHA512_Final(cksum[i], &ctx);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		SHA512_Init(&ctx[i]);
+		while (byt < _bytes) {
+			SHA512_Update(&ctx[i], buf + byt, BLKSZ);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			SHA512_Final(cksum[i], &ctx[i]);
 	}
-	mctx = (SHA512_CTX *)malloc(sizeof (SHA512_CTX));
-	SHA512_Init(mctx);
-	SHA512_Update(mctx, cksum, 2 * 64);
-	SHA512_Final(cksum_buf, mctx);
-	free(mctx);
+	if (rem > 0) {
+		SHA512_Update(&ctx[0], buf + bytes - rem, rem);
+	}
+	SHA512_Final(cksum[0], &ctx[0]);
+
+	/*
+	 * Second level: hash digest pairs (0,1) and (2,3) into rows 4 and 5.
+	 */
+	SHA512_Init(&ctx[0]);
+	SHA512_Init(&ctx[1]);
+	SHA512_Update(&ctx[0], &cksum[0], 2 * 64);
+	SHA512_Update(&ctx[1], &cksum[2], 2 * 64);
+	SHA512_Final(cksum[4], &ctx[0]);
+	SHA512_Final(cksum[5], &ctx[1]);
+
+	/*
+	 * Final hash over the two 64-byte second-level digests (rows 4 and 5).
+	 */
+	SHA512_Init(&ctx[0]);
+	SHA512_Update(&ctx[0], &cksum[4], 2 * 64);
+	SHA512_Final(cksum_buf, &ctx[0]);
 }

 void
@ -152,43 +205,64 @@ opt_SHA512t256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 void
 opt_SHA512t256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][32];
-	int i;
-	SHA512_Context *mctx;
+	uchar_t cksum[6][32];
+	SHA512_Context ctx[4];
+	int i, rem;
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
-		mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
-		opt_SHA512t256_Init(mctx);
-		opt_SHA512t256_Update(mctx, buf, bytes);
-		opt_SHA512t256_Final(mctx, cksum_buf);
-		free(mctx);
+	if (bytes <= BLKSZ * 2) {
+		opt_SHA512t256_Init(&ctx[0]);
+		opt_SHA512t256_Update(&ctx[0], buf, bytes);
+		opt_SHA512t256_Final(&ctx[0], cksum_buf);
 		return;
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Leaf level: 4 interleaved block streams; stream 0 also takes the tail.
+	 */
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 #	pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		SHA512_Context ctx;
-		opt_SHA512t256_Init(&ctx);
-		opt_SHA512t256_Update(&ctx, pos[i], len[i]);
-		opt_SHA512t256_Final(&ctx, cksum[i]);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		opt_SHA512t256_Init(&ctx[i]);
+		while (byt < _bytes) {
+			opt_SHA512t256_Update(&ctx[i], buf + byt, BLKSZ);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			opt_SHA512t256_Final(&ctx[i], cksum[i]);
 	}
-	mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
-	opt_SHA512t256_Init(mctx);
-	opt_SHA512t256_Update(mctx, cksum, 2 * 32);
-	opt_SHA512t256_Final(mctx, cksum_buf);
-	free(mctx);
+	if (rem > 0) {
+		opt_SHA512t256_Update(&ctx[0], buf + bytes - rem, rem);
+	}
+	opt_SHA512t256_Final(&ctx[0], cksum[0]);
+
+	/*
+	 * Second level: hash digest pairs (0,1) and (2,3) into rows 4 and 5.
+	 */
+	opt_SHA512t256_Init(&ctx[0]);
+	opt_SHA512t256_Init(&ctx[1]);
+	opt_SHA512t256_Update(&ctx[0], &cksum[0], 2 * 32);
+	opt_SHA512t256_Update(&ctx[1], &cksum[2], 2 * 32);
+	opt_SHA512t256_Final(&ctx[0], cksum[4]);
+	opt_SHA512t256_Final(&ctx[1], cksum[5]);
+
+	/*
+	 * Final hash over the two second-level digests (rows 4 and 5).
+	 */
+	opt_SHA512t256_Init(&ctx[0]);
+	opt_SHA512t256_Update(&ctx[0], &cksum[4], 2 * 32);
+	opt_SHA512t256_Final(&ctx[0], cksum_buf);
 }

 void
@ -204,42 +278,62 @@ opt_SHA512(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 void
 opt_SHA512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][64];
-	int i;
-	SHA512_Context *mctx;
+	uchar_t cksum[6][64];
+	SHA512_Context ctx[4];
+	int i, rem;
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
-		mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
-		opt_SHA512_Init(mctx);
-		opt_SHA512_Update(mctx, buf, bytes);
-		opt_SHA512_Final(mctx, cksum_buf);
-		free(mctx);
+	if (bytes <= BLKSZ * 2) {
+		opt_SHA512_Init(&ctx[0]);
+		opt_SHA512_Update(&ctx[0], buf, bytes);
+		opt_SHA512_Final(&ctx[0], cksum_buf);
 		return;
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Leaf level: 4 interleaved block streams; stream 0 also takes the tail.
+	 */
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 #	pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		SHA512_Context ctx;
-		opt_SHA512_Init(&ctx);
-		opt_SHA512_Update(&ctx, pos[i], len[i]);
-		opt_SHA512_Final(&ctx, cksum[i]);
-	}
-	mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
-	opt_SHA512_Init(mctx);
-	opt_SHA512_Update(mctx, cksum, 2 * 64);
-	opt_SHA512_Final(mctx, cksum_buf);
-	free(mctx);
-}
+		uint64_t byt;

+		byt = i * BLKSZ;
+		opt_SHA512_Init(&ctx[i]);
+		while (byt < _bytes) {
+			opt_SHA512_Update(&ctx[i], buf + byt, BLKSZ);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			opt_SHA512_Final(&ctx[i], cksum[i]);
+	}
+	if (rem > 0) {
+		opt_SHA512_Update(&ctx[0], buf + bytes - rem, rem);
+	}
+	opt_SHA512_Final(&ctx[0], cksum[0]);
+
+	/*
+	 * Second level: hash digest pairs (0,1) and (2,3) into rows 4 and 5.
+	 */
+	opt_SHA512_Init(&ctx[0]);
+	opt_SHA512_Init(&ctx[1]);
+	opt_SHA512_Update(&ctx[0], &cksum[0], 2 * 64);
+	opt_SHA512_Update(&ctx[1], &cksum[2], 2 * 64);
+	opt_SHA512_Final(&ctx[0], cksum[4]);
+	opt_SHA512_Final(&ctx[1], cksum[5]);
+
+	/*
+	 * Final hash over the two second-level digests (rows 4 and 5).
+	 */
+	opt_SHA512_Init(&ctx[0]);
+	opt_SHA512_Update(&ctx[0], &cksum[4], 2 * 64);
+	opt_SHA512_Final(&ctx[0], cksum_buf);
+}
--- a/crypto/sha3_utils.c
+++ b/crypto/sha3_utils.c
@ -30,10 +30,21 @@
 #include <utils.h>

 #define	KECCAK_BLOCK_SIZE	1024
+#define	BLKSZ			(2048)

 /*
 * Helper functions for single-call SHA3 (Keccak) hashing. Both serial
- * and parallel versions are provided.
+ * and parallel versions are provided. Parallel versions use 2-stage
+ * Merkle Tree hashing.
+ * 
+ * At the leaf level data is split into BLKSZ blocks and 4 threads
+ * compute 4 hashes of interleaved block streams. At 2nd level two
+ * new hashes are generated from hashing the 2 pairs of hash values.
+ * In the final stage the 2 hash values are hashed to the final digest.
+ * 
+ * References:
+ * http://eprint.iacr.org/2012/476.pdf
+ * http://gva.noekeon.org/papers/bdpv09tree.html
 */

 int
@ -45,32 +56,69 @@ Keccak256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 int
 Keccak256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][32];
-	int i, rv[2];
+	uchar_t cksum[6][32];
+	hashState ctx[4];
+	int i, rem, rv[4];
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= KECCAK_BLOCK_SIZE * 2) {
+	if (bytes <= BLKSZ) {
 		return (Keccak_Hash(256, buf, bytes * 8, cksum_buf));
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Leaf level: 4 interleaved block streams; stream 0 also takes the tail.
+	 */
+	for (i = 0; i < 4; ++i) rv[i] = 0;
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 #	pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		rv[i] = Keccak_Hash(256, pos[i], len[i] * 8, cksum[i]);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		rv[i] |= Keccak_Init(&ctx[i], 256);
+		while (byt < _bytes) {
+			rv[i] |= Keccak_Update(&ctx[i], buf + byt, BLKSZ * 8);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			rv[i] |= Keccak_Final(&ctx[i], cksum[i]);
 	}
-	if (rv[0] != 0 || rv[1] != 0)
-		return (-1);
-	return (Keccak_Hash(256, (const BitSequence *)cksum, 2 * 32 * 8, cksum_buf));
+	if (rem > 0) {
+		rv[0] |= Keccak_Update(&ctx[0], buf + bytes - rem, rem * 8);
+	}
+	rv[0] |= Keccak_Final(&ctx[0], cksum[0]);
+
+	for (i = 0; i < 4; ++i) if (rv[i] != 0) return (-1);
+	rv[0] = 0;
+	rv[1] = 0;
+
+	/*
+	 * Second level: hash digest pairs (0,1) and (2,3) into rows 4 and 5.
+	 */
+	rv[0] |= Keccak_Init(&ctx[0], 256);
+	rv[1] |= Keccak_Init(&ctx[1], 256);
+	rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[0], 2 * 32 * 8);
+	rv[1] |= Keccak_Update(&ctx[1], (const BitSequence *)&cksum[2], 2 * 32 * 8);
+	rv[0] |= Keccak_Final(&ctx[0], cksum[4]);
+	rv[1] |= Keccak_Final(&ctx[1], cksum[5]);
+	for (i = 0; i < 2; ++i) if (rv[i] != 0) return (-1);
+
+	/*
+	 * Final hash over the two second-level digests (rows 4 and 5).
+	 */
+	rv[0] = 0;
+	rv[0] |= Keccak_Init(&ctx[0], 256);
+	rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[4], 2 * 32 * 8);
+	rv[0] |= Keccak_Final(&ctx[0], cksum_buf);
+	return (rv[0]);
 }

 int
@ -82,30 +130,67 @@ Keccak512(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 int
 Keccak512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][64];
-	int i, rv[2];
+	uchar_t cksum[6][64];
+	hashState ctx[4];
+	int i, rem, rv[4];
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= KECCAK_BLOCK_SIZE * 2) {
+	if (bytes <= BLKSZ) {
 		return (Keccak_Hash(512, buf, bytes * 8, cksum_buf));
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Leaf level: 4 interleaved block streams; stream 0 also takes the tail.
+	 */
+	for (i = 0; i < 4; ++i) rv[i] = 0;
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 #	pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		rv[i] = Keccak_Hash(512, pos[i], len[i] * 8, cksum[i]);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		rv[i] |= Keccak_Init(&ctx[i], 512);
+		while (byt < _bytes) {
+			rv[i] |= Keccak_Update(&ctx[i], buf + byt, BLKSZ * 8);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			rv[i] |= Keccak_Final(&ctx[i], cksum[i]);
 	}
-	if (rv[0] != 0 || rv[1] != 0)
-		return (-1);
-	return (Keccak_Hash(512, (const BitSequence *)cksum, 2 * 64 * 8, cksum_buf));
+	if (rem > 0) {
+		rv[0] |= Keccak_Update(&ctx[0], buf + bytes - rem, rem * 8);
+	}
+	rv[0] |= Keccak_Final(&ctx[0], cksum[0]);
+
+	for (i = 0; i < 4; ++i) if (rv[i] != 0) return (-1);
+	rv[0] = 0;
+	rv[1] = 0;
+
+	/*
+	 * Second level: hash digest pairs (0,1) and (2,3) into rows 4 and 5.
+	 */
+	rv[0] |= Keccak_Init(&ctx[0], 512);
+	rv[1] |= Keccak_Init(&ctx[1], 512);
+	rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[0], 2 * 64 * 8);
+	rv[1] |= Keccak_Update(&ctx[1], (const BitSequence *)&cksum[2], 2 * 64 * 8);
+	rv[0] |= Keccak_Final(&ctx[0], cksum[4]);
+	rv[1] |= Keccak_Final(&ctx[1], cksum[5]);
+	for (i = 0; i < 2; ++i) if (rv[i] != 0) return (-1);
+
+	/*
+	 * Final hash over the two second-level digests (rows 4 and 5).
+	 */
+	rv[0] = 0;
+	rv[0] |= Keccak_Init(&ctx[0], 512);
+	rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[4], 2 * 64 * 8);
+	rv[0] |= Keccak_Final(&ctx[0], cksum_buf);
+	return (rv[0]);
 }
--- a/utils/xxhash.c
+++ b/utils/xxhash.c
@ -30,7 +30,12 @@
 	- xxHash source repository : http://code.google.com/p/xxhash/
 */

-
+/*
+ * Modified by Moinak Ghosh for pcompress. The new hashing approach
+ * with interleaved blocks is derived from the following paper:
+ * 
+ * http://eprint.iacr.org/2012/476.pdf
+ */

 //**************************************
 // Tuning parameters
@ -356,10 +361,10 @@ int   CPUCAP_NM(XXH32_feed) (void* state_in, const void* input, int len)

 		/*
 		 * 4-way SIMD calculations with 4 ints in two blocks for 2 accumulators will
-		 * interleave to some extent on a hyperthreaded processor providing 10% - 14%
-		 * speedup over original xxhash depending on processor. We could have used
-		 * aligned loads but we actually want the unaligned penalty. It helps to
-		 * interleave better for a slight benefit over aligned loads here!
+		 * interleave to some extent on the superscalar x86 processor providing
+		 * 10% - 14% speedup over original xxhash depending on processor model. We
+		 * could have used aligned loads but we actually want the unaligned penalty.
+		 * It helps to interleave better for a slight benefit over aligned loads here!
 		 */
 		do {
 			__m128i mem = _mm_loadu_si128((__m128i *)p);