From 6bfd04431189020c8779f406186d434f141d226b Mon Sep 17 00:00:00 2001
From: Moinak Ghosh
Date: Fri, 1 Feb 2013 22:07:28 +0530
Subject: [PATCH] Use 2-stage Merkle Tree hashing for parallel hashes for better crypto properties. Update xxhash comment.

---
Review notes (text between the "---" marker and the first "diff --git"
line is not part of the commit message):

 * Second-level hashing in all six *_par functions: the second pair is
   now read from &cksum[2], so the two intermediate digests cover leaf
   digests (0,1) and (2,3). With &cksum[1] the pairs were (0,1) and
   (1,2): leaf digest 3, i.e. every fourth BLKSZ block of the input,
   never contributed to the final digest.
 * ossl_SHA512_par: SHA-512 digests are 64 bytes, so cksum is declared
   as [6][64] and the second-level / final updates hash 2 * 64 bytes.
   With [6][32], SHA512_Final() wrote past each row and past the end of
   the array, and only half of the digest material was hashed.
 * Only tokens inside existing "+" lines were changed; hunk line counts
   and the diffstat below are unaffected. Digests differ from those the
   unfixed patch would have produced.

 crypto/sha2_utils.c | 304 +++++++++++++++++++++++++++++---------------
 crypto/sha3_utils.c | 143 ++++++++++++++++-----
 utils/xxhash.c      |  15 ++-
 3 files changed, 323 insertions(+), 139 deletions(-)

diff --git a/crypto/sha2_utils.c b/crypto/sha2_utils.c
index b746f1b..c47cc73 100644
--- a/crypto/sha2_utils.c
+++ b/crypto/sha2_utils.c
@@ -30,9 +30,21 @@
 #endif
 #include

+#define BLKSZ (2048)
+
 /*
  * Helper functions for single-call SHA2 hashing. Both serial and
- * parallel versions are provided.
+ * parallel versions are provided. Parallel versions use 2-stage
+ * Merkle Tree hashing.
+ *
+ * At the leaf level data is split into BLKSZ blocks and 4 threads
+ * compute 4 hashes of interleaved block streams. At 2nd level two
+ * new hashes are generated from hashing the 2 pairs of hash values.
+ * In the final stage the 2 hash values are hashed to the final digest.
+ *
+ * References:
+ * http://eprint.iacr.org/2012/476.pdf
+ * http://gva.noekeon.org/papers/bdpv09tree.html
  */
 void
 ossl_SHA256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
@@ -47,44 +59,64 @@ ossl_SHA256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 void
 ossl_SHA256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][32];
-	int i;
-	SHA256_CTX *mctx;
+	uchar_t cksum[6][32];
+	SHA256_CTX ctx[4];
+	int i, rem;
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
-		mctx = (SHA256_CTX *)malloc(sizeof (SHA256_CTX));
-		SHA256_Init(mctx);
-		SHA256_Update(mctx, buf, bytes);
-		SHA256_Final(cksum_buf, mctx);
-		free(mctx);
+	if (bytes <= BLKSZ * 2) {
+		SHA256_Init(&ctx[0]);
+		SHA256_Update(&ctx[0], buf, bytes);
+		SHA256_Final(cksum_buf, &ctx[0]);
 		return;
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	buf += bytes/2;
-	pos[1] = buf;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Do first level hashes in parallel.
+	 */
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 # pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		SHA256_CTX ctx;
-		SHA256_Init(&ctx);
-		SHA256_Update(&ctx, pos[i], len[i]);
-		SHA256_Final(cksum[i], &ctx);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		SHA256_Init(&ctx[i]);
+		while (byt < _bytes) {
+			SHA256_Update(&ctx[i], buf + byt, BLKSZ);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			SHA256_Final(cksum[i], &ctx[i]);
 	}
-	mctx = (SHA256_CTX *)malloc(sizeof (SHA256_CTX));
-	SHA256_Init(mctx);
-	SHA256_Update(mctx, cksum, 2 * 32);
-	SHA256_Final(cksum_buf, mctx);
-	free(mctx);
+	if (rem > 0) {
+		SHA256_Update(&ctx[0], buf + bytes - rem, rem);
+	}
+	SHA256_Final(cksum[0], &ctx[0]);
+
+	/*
+	 * Second level hashes.
+	 */
+	SHA256_Init(&ctx[0]);
+	SHA256_Init(&ctx[1]);
+	SHA256_Update(&ctx[0], &cksum[0], 2 * 32);
+	SHA256_Update(&ctx[1], &cksum[2], 2 * 32);
+	SHA256_Final(cksum[4], &ctx[0]);
+	SHA256_Final(cksum[5], &ctx[1]);
+
+	/*
+	 * Final hash.
+	 */
+	SHA256_Init(&ctx[0]);
+	SHA256_Update(&ctx[0], &cksum[4], 2 * 32);
+	SHA256_Final(cksum_buf, &ctx[0]);
 }

 void
@@ -100,43 +132,64 @@ ossl_SHA512(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 void
 ossl_SHA512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][64];
-	int i;
-	SHA512_CTX *mctx;
+	uchar_t cksum[6][64];
+	SHA512_CTX ctx[4];
+	int i, rem;
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
-	 * If not then just do a simple hashing.
+	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
-		mctx = (SHA512_CTX *)malloc(sizeof (SHA512_CTX));
-		SHA512_Init(mctx);
-		SHA512_Update(mctx, buf, bytes);
-		SHA512_Final(cksum_buf, mctx);
-		free(mctx);
+	if (bytes <= BLKSZ * 2) {
+		SHA512_Init(&ctx[0]);
+		SHA512_Update(&ctx[0], buf, bytes);
+		SHA512_Final(cksum_buf, &ctx[0]);
 		return;
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Do first level hashes in parallel.
+	 */
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 # pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		SHA512_CTX ctx;
-		SHA512_Init(&ctx);
-		SHA512_Update(&ctx, pos[i], len[i]);
-		SHA512_Final(cksum[i], &ctx);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		SHA512_Init(&ctx[i]);
+		while (byt < _bytes) {
+			SHA512_Update(&ctx[i], buf + byt, BLKSZ);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			SHA512_Final(cksum[i], &ctx[i]);
 	}
-	mctx = (SHA512_CTX *)malloc(sizeof (SHA512_CTX));
-	SHA512_Init(mctx);
-	SHA512_Update(mctx, cksum, 2 * 64);
-	SHA512_Final(cksum_buf, mctx);
-	free(mctx);
+	if (rem > 0) {
+		SHA512_Update(&ctx[0], buf + bytes - rem, rem);
+	}
+	SHA512_Final(cksum[0], &ctx[0]);
+
+	/*
+	 * Second level hashes.
+	 */
+	SHA512_Init(&ctx[0]);
+	SHA512_Init(&ctx[1]);
+	SHA512_Update(&ctx[0], &cksum[0], 2 * 64);
+	SHA512_Update(&ctx[1], &cksum[2], 2 * 64);
+	SHA512_Final(cksum[4], &ctx[0]);
+	SHA512_Final(cksum[5], &ctx[1]);
+
+	/*
+	 * Final hash.
+	 */
+	SHA512_Init(&ctx[0]);
+	SHA512_Update(&ctx[0], &cksum[4], 2 * 64);
+	SHA512_Final(cksum_buf, &ctx[0]);
 }

 void
@@ -152,43 +205,64 @@ opt_SHA512t256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 void
 opt_SHA512t256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][32];
-	int i;
-	SHA512_Context *mctx;
+	uchar_t cksum[6][32];
+	SHA512_Context ctx[4];
+	int i, rem;
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
-		mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
-		opt_SHA512t256_Init(mctx);
-		opt_SHA512t256_Update(mctx, buf, bytes);
-		opt_SHA512t256_Final(mctx, cksum_buf);
-		free(mctx);
+	if (bytes <= BLKSZ * 2) {
+		opt_SHA512t256_Init(&ctx[0]);
+		opt_SHA512t256_Update(&ctx[0], buf, bytes);
+		opt_SHA512t256_Final(&ctx[0], cksum_buf);
 		return;
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Do first level hashes in parallel.
+	 */
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 # pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		SHA512_Context ctx;
-		opt_SHA512t256_Init(&ctx);
-		opt_SHA512t256_Update(&ctx, pos[i], len[i]);
-		opt_SHA512t256_Final(&ctx, cksum[i]);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		opt_SHA512t256_Init(&ctx[i]);
+		while (byt < _bytes) {
+			opt_SHA512t256_Update(&ctx[i], buf + byt, BLKSZ);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			opt_SHA512t256_Final(&ctx[i], cksum[i]);
 	}
-	mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
-	opt_SHA512t256_Init(mctx);
-	opt_SHA512t256_Update(mctx, cksum, 2 * 32);
-	opt_SHA512t256_Final(mctx, cksum_buf);
-	free(mctx);
+	if (rem > 0) {
+		opt_SHA512t256_Update(&ctx[0], buf + bytes - rem, rem);
+	}
+	opt_SHA512t256_Final(&ctx[0], cksum[0]);
+
+	/*
+	 * Second level hashes.
+	 */
+	opt_SHA512t256_Init(&ctx[0]);
+	opt_SHA512t256_Init(&ctx[1]);
+	opt_SHA512t256_Update(&ctx[0], &cksum[0], 2 * 32);
+	opt_SHA512t256_Update(&ctx[1], &cksum[2], 2 * 32);
+	opt_SHA512t256_Final(&ctx[0], cksum[4]);
+	opt_SHA512t256_Final(&ctx[1], cksum[5]);
+
+	/*
+	 * Final hash.
+	 */
+	opt_SHA512t256_Init(&ctx[0]);
+	opt_SHA512t256_Update(&ctx[0], &cksum[4], 2 * 32);
+	opt_SHA512t256_Final(&ctx[0], cksum_buf);
 }

 void
@@ -204,42 +278,62 @@ opt_SHA512(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 void
 opt_SHA512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][64];
-	int i;
-	SHA512_Context *mctx;
+	uchar_t cksum[6][64];
+	SHA512_Context ctx[4];
+	int i, rem;
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
-		mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
-		opt_SHA512_Init(mctx);
-		opt_SHA512_Update(mctx, buf, bytes);
-		opt_SHA512_Final(mctx, cksum_buf);
-		free(mctx);
+	if (bytes <= BLKSZ * 2) {
+		opt_SHA512_Init(&ctx[0]);
+		opt_SHA512_Update(&ctx[0], buf, bytes);
+		opt_SHA512_Final(&ctx[0], cksum_buf);
 		return;
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Do first level hashes in parallel.
+	 */
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 # pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		SHA512_Context ctx;
-		opt_SHA512_Init(&ctx);
-		opt_SHA512_Update(&ctx, pos[i], len[i]);
-		opt_SHA512_Final(&ctx, cksum[i]);
-	}
-	mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
-	opt_SHA512_Init(mctx);
-	opt_SHA512_Update(mctx, cksum, 2 * 64);
-	opt_SHA512_Final(mctx, cksum_buf);
-	free(mctx);
-}
+		uint64_t byt;

+		byt = i * BLKSZ;
+		opt_SHA512_Init(&ctx[i]);
+		while (byt < _bytes) {
+			opt_SHA512_Update(&ctx[i], buf + byt, BLKSZ);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			opt_SHA512_Final(&ctx[i], cksum[i]);
+	}
+	if (rem > 0) {
+		opt_SHA512_Update(&ctx[0], buf + bytes - rem, rem);
+	}
+	opt_SHA512_Final(&ctx[0], cksum[0]);
+
+	/*
+	 * Second level hashes.
+	 */
+	opt_SHA512_Init(&ctx[0]);
+	opt_SHA512_Init(&ctx[1]);
+	opt_SHA512_Update(&ctx[0], &cksum[0], 2 * 64);
+	opt_SHA512_Update(&ctx[1], &cksum[2], 2 * 64);
+	opt_SHA512_Final(&ctx[0], cksum[4]);
+	opt_SHA512_Final(&ctx[1], cksum[5]);
+
+	/*
+	 * Final hash.
+	 */
+	opt_SHA512_Init(&ctx[0]);
+	opt_SHA512_Update(&ctx[0], &cksum[4], 2 * 64);
+	opt_SHA512_Final(&ctx[0], cksum_buf);
+}
diff --git a/crypto/sha3_utils.c b/crypto/sha3_utils.c
index 836329b..60b4e57 100644
--- a/crypto/sha3_utils.c
+++ b/crypto/sha3_utils.c
@@ -30,10 +30,21 @@
 #include

 #define KECCAK_BLOCK_SIZE 1024
+#define BLKSZ (2048)

 /*
  * Helper functions for single-call SHA3 (Keccak) hashing. Both serial
- * and parallel versions are provided.
+ * and parallel versions are provided. Parallel versions use 2-stage
+ * Merkle Tree hashing.
+ *
+ * At the leaf level data is split into BLKSZ blocks and 4 threads
+ * compute 4 hashes of interleaved block streams. At 2nd level two
+ * new hashes are generated from hashing the 2 pairs of hash values.
+ * In the final stage the 2 hash values are hashed to the final digest.
+ *
+ * References:
+ * http://eprint.iacr.org/2012/476.pdf
+ * http://gva.noekeon.org/papers/bdpv09tree.html
  */

 int
@@ -45,32 +56,69 @@ Keccak256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 int
 Keccak256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][32];
-	int i, rv[2];
+	uchar_t cksum[6][32];
+	hashState ctx[4];
+	int i, rem, rv[4];
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= KECCAK_BLOCK_SIZE * 2) {
+	if (bytes <= BLKSZ) {
 		return (Keccak_Hash(256, buf, bytes * 8, cksum_buf));
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Do first level hashes in parallel.
+	 */
+	for (i = 0; i < 4; ++i) rv[i] = 0;
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 # pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		rv[i] = Keccak_Hash(256, pos[i], len[i] * 8, cksum[i]);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		rv[i] |= Keccak_Init(&ctx[i], 256);
+		while (byt < _bytes) {
+			rv[i] |= Keccak_Update(&ctx[i], buf + byt, BLKSZ * 8);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			rv[i] |= Keccak_Final(&ctx[i], cksum[i]);
 	}
-	if (rv[0] != 0 || rv[1] != 0)
-		return (-1);
-	return (Keccak_Hash(256, (const BitSequence *)cksum, 2 * 32 * 8, cksum_buf));
+	if (rem > 0) {
+		rv[0] |= Keccak_Update(&ctx[0], buf + bytes - rem, rem * 8);
+	}
+	rv[0] |= Keccak_Final(&ctx[0], cksum[0]);
+
+	for (i = 0; i < 4; ++i) if (rv[i] != 0) return (-1);
+	rv[0] = 0;
+	rv[1] = 0;
+
+	/*
+	 * Second level hashes.
+	 */
+	rv[0] |= Keccak_Init(&ctx[0], 256);
+	rv[1] |= Keccak_Init(&ctx[1], 256);
+	rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[0], 2 * 32 * 8);
+	rv[1] |= Keccak_Update(&ctx[1], (const BitSequence *)&cksum[2], 2 * 32 * 8);
+	rv[0] |= Keccak_Final(&ctx[0], cksum[4]);
+	rv[1] |= Keccak_Final(&ctx[1], cksum[5]);
+	for (i = 0; i < 2; ++i) if (rv[i] != 0) return (-1);
+
+	/*
+	 * Final hash.
+	 */
+	rv[0] = 0;
+	rv[0] |= Keccak_Init(&ctx[0], 256);
+	rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[4], 2 * 32 * 8);
+	rv[0] |= Keccak_Final(&ctx[0], cksum_buf);
+	return (rv[0]);
 }

 int
@@ -82,30 +130,67 @@ Keccak512(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 int
 Keccak512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
 {
-	uchar_t *pos[2];
-	uint64_t len[2];
-	uchar_t cksum[2][64];
-	int i, rv[2];
+	uchar_t cksum[6][64];
+	hashState ctx[4];
+	int i, rem, rv[4];
+	uint64_t _bytes;

 	/*
 	 * Is it worth doing the overhead of parallelism ? Buffer large enough ?
 	 * If not then just do a simple serial hashing.
 	 */
-	if (bytes / 2 <= KECCAK_BLOCK_SIZE * 2) {
+	if (bytes <= BLKSZ) {
 		return (Keccak_Hash(512, buf, bytes * 8, cksum_buf));
 	}
-	pos[0] = buf;
-	len[0] = bytes/2;
-	pos[1] = buf + bytes/2;
-	len[1] = bytes - bytes/2;
+
+	/*
+	 * Do first level hashes in parallel.
+	 */
+	for (i = 0; i < 4; ++i) rv[i] = 0;
+	_bytes = (bytes / BLKSZ) * BLKSZ;
+	rem = bytes - _bytes;
 #if defined(_OPENMP)
 # pragma omp parallel for
 #endif
-	for(i = 0; i < 2; ++i)
+	for(i = 0; i < 4; ++i)
 	{
-		rv[i] = Keccak_Hash(512, pos[i], len[i] * 8, cksum[i]);
+		uint64_t byt;
+
+		byt = i * BLKSZ;
+		rv[i] |= Keccak_Init(&ctx[i], 512);
+		while (byt < _bytes) {
+			rv[i] |= Keccak_Update(&ctx[i], buf + byt, BLKSZ * 8);
+			byt += 4 * BLKSZ;
+		}
+		if (i>0)
+			rv[i] |= Keccak_Final(&ctx[i], cksum[i]);
 	}
-	if (rv[0] != 0 || rv[1] != 0)
-		return (-1);
-	return (Keccak_Hash(512, (const BitSequence *)cksum, 2 * 64 * 8, cksum_buf));
+	if (rem > 0) {
+		rv[0] |= Keccak_Update(&ctx[0], buf + bytes - rem, rem * 8);
+	}
+	rv[0] |= Keccak_Final(&ctx[0], cksum[0]);
+
+	for (i = 0; i < 4; ++i) if (rv[i] != 0) return (-1);
+	rv[0] = 0;
+	rv[1] = 0;
+
+	/*
+	 * Second level hashes.
+	 */
+	rv[0] |= Keccak_Init(&ctx[0], 512);
+	rv[1] |= Keccak_Init(&ctx[1], 512);
+	rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[0], 2 * 64 * 8);
+	rv[1] |= Keccak_Update(&ctx[1], (const BitSequence *)&cksum[2], 2 * 64 * 8);
+	rv[0] |= Keccak_Final(&ctx[0], cksum[4]);
+	rv[1] |= Keccak_Final(&ctx[1], cksum[5]);
+	for (i = 0; i < 2; ++i) if (rv[i] != 0) return (-1);
+
+	/*
+	 * Final hash.
+	 */
+	rv[0] = 0;
+	rv[0] |= Keccak_Init(&ctx[0], 512);
+	rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[4], 2 * 64 * 8);
+	rv[0] |= Keccak_Final(&ctx[0], cksum_buf);
+	return (rv[0]);
 }
diff --git a/utils/xxhash.c b/utils/xxhash.c
index 26966ba..76f3576 100644
--- a/utils/xxhash.c
+++ b/utils/xxhash.c
@@ -30,7 +30,12 @@
 	- xxHash source repository : http://code.google.com/p/xxhash/
 */

-
+/*
+ * Modified by Moinak Ghosh for pcompress. The new hashing approach
+ * with interleaved blocks is derived from the following paper:
+ *
+ * http://eprint.iacr.org/2012/476.pdf
+ */

 //**************************************
 // Tuning parameters
@@ -356,10 +361,10 @@ int CPUCAP_NM(XXH32_feed) (void* state_in, const void* input, int len)

 		/*
 		 * 4-way SIMD calculations with 4 ints in two blocks for 2 accumulators will
-		 * interleave to some extent on a hyperthreaded processor providing 10% - 14%
-		 * speedup over original xxhash depending on processor. We could have used
-		 * aligned loads but we actually want the unaligned penalty. It helps to
-		 * interleave better for a slight benefit over aligned loads here!
+		 * interleave to some extent on the superscalar x86 processor providing
+		 * 10% - 14% speedup over original xxhash depending on processor model. We
+		 * could have used aligned loads but we actually want the unaligned penalty.
+		 * It helps to interleave better for a slight benefit over aligned loads here!
 		 */
 		do {
 			__m128i mem = _mm_loadu_si128((__m128i *)p);