Use 2-stage Merkle Tree hashing for parallel hashes for better crypto properties.
Update xxhash comment.
This commit is contained in:
parent
af4c6e1d84
commit
6bfd044311
3 changed files with 323 additions and 139 deletions
|
@ -30,9 +30,21 @@
|
||||||
#endif
|
#endif
|
||||||
#include <utils.h>
|
#include <utils.h>
|
||||||
|
|
||||||
|
#define BLKSZ (2048)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Helper functions for single-call SHA2 hashing. Both serial and
|
* Helper functions for single-call SHA2 hashing. Both serial and
|
||||||
* parallel versions are provided.
|
* parallel versions are provided. Parallel versions use 2-stage
|
||||||
|
* Merkle Tree hashing.
|
||||||
|
*
|
||||||
|
* At the leaf level data is split into BLKSZ blocks and 4 threads
|
||||||
|
* compute 4 hashes of interleaved block streams. At 2nd level two
|
||||||
|
* new hashes are generated from hashing the 2 pairs of hash values.
|
||||||
|
* In the final stage the 2 hash values are hashed to the final digest.
|
||||||
|
*
|
||||||
|
* References:
|
||||||
|
* http://eprint.iacr.org/2012/476.pdf
|
||||||
|
* http://gva.noekeon.org/papers/bdpv09tree.html
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
ossl_SHA256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
ossl_SHA256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
|
@ -47,44 +59,64 @@ ossl_SHA256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
void
|
void
|
||||||
ossl_SHA256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
ossl_SHA256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
{
|
{
|
||||||
uchar_t *pos[2];
|
uchar_t cksum[6][32];
|
||||||
uint64_t len[2];
|
SHA256_CTX ctx[4];
|
||||||
uchar_t cksum[2][32];
|
int i, rem;
|
||||||
int i;
|
uint64_t _bytes;
|
||||||
SHA256_CTX *mctx;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
||||||
* If not then just do a simple serial hashing.
|
* If not then just do a simple serial hashing.
|
||||||
*/
|
*/
|
||||||
if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
|
if (bytes <= BLKSZ * 2) {
|
||||||
mctx = (SHA256_CTX *)malloc(sizeof (SHA256_CTX));
|
SHA256_Init(&ctx[0]);
|
||||||
SHA256_Init(mctx);
|
SHA256_Update(&ctx[0], buf, bytes);
|
||||||
SHA256_Update(mctx, buf, bytes);
|
SHA256_Final(cksum_buf, &ctx[0]);
|
||||||
SHA256_Final(cksum_buf, mctx);
|
|
||||||
free(mctx);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
pos[0] = buf;
|
|
||||||
len[0] = bytes/2;
|
/*
|
||||||
buf += bytes/2;
|
* Do first level hashes in parallel.
|
||||||
pos[1] = buf;
|
*/
|
||||||
len[1] = bytes - bytes/2;
|
_bytes = (bytes / BLKSZ) * BLKSZ;
|
||||||
|
rem = bytes - _bytes;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
# pragma omp parallel for
|
# pragma omp parallel for
|
||||||
#endif
|
#endif
|
||||||
for(i = 0; i < 2; ++i)
|
for(i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
SHA256_CTX ctx;
|
uint64_t byt;
|
||||||
SHA256_Init(&ctx);
|
|
||||||
SHA256_Update(&ctx, pos[i], len[i]);
|
byt = i * BLKSZ;
|
||||||
SHA256_Final(cksum[i], &ctx);
|
SHA256_Init(&ctx[i]);
|
||||||
|
while (byt < _bytes) {
|
||||||
|
SHA256_Update(&ctx[i], buf + byt, BLKSZ);
|
||||||
|
byt += 4 * BLKSZ;
|
||||||
|
}
|
||||||
|
if (i>0)
|
||||||
|
SHA256_Final(cksum[i], &ctx[i]);
|
||||||
}
|
}
|
||||||
mctx = (SHA256_CTX *)malloc(sizeof (SHA256_CTX));
|
if (rem > 0) {
|
||||||
SHA256_Init(mctx);
|
SHA256_Update(&ctx[0], buf + bytes - rem, rem);
|
||||||
SHA256_Update(mctx, cksum, 2 * 32);
|
}
|
||||||
SHA256_Final(cksum_buf, mctx);
|
SHA256_Final(cksum[0], &ctx[0]);
|
||||||
free(mctx);
|
|
||||||
|
/*
|
||||||
|
* Second level hashes.
|
||||||
|
*/
|
||||||
|
SHA256_Init(&ctx[0]);
|
||||||
|
SHA256_Init(&ctx[1]);
|
||||||
|
SHA256_Update(&ctx[0], &cksum[0], 2 * 32);
|
||||||
|
SHA256_Update(&ctx[1], &cksum[1], 2 * 32);
|
||||||
|
SHA256_Final(cksum[4], &ctx[0]);
|
||||||
|
SHA256_Final(cksum[5], &ctx[1]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Final hash.
|
||||||
|
*/
|
||||||
|
SHA256_Init(&ctx[0]);
|
||||||
|
SHA256_Update(&ctx[0], &cksum[4], 2 * 32);
|
||||||
|
SHA256_Final(cksum_buf, &ctx[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -100,43 +132,64 @@ ossl_SHA512(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
void
|
void
|
||||||
ossl_SHA512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
ossl_SHA512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
{
|
{
|
||||||
uchar_t *pos[2];
|
uchar_t cksum[6][32];
|
||||||
uint64_t len[2];
|
SHA512_CTX ctx[4];
|
||||||
uchar_t cksum[2][64];
|
int i, rem;
|
||||||
int i;
|
uint64_t _bytes;
|
||||||
SHA512_CTX *mctx;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
||||||
* If not then just do a simple hashing.
|
* If not then just do a simple serial hashing.
|
||||||
*/
|
*/
|
||||||
if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
|
if (bytes <= BLKSZ * 2) {
|
||||||
mctx = (SHA512_CTX *)malloc(sizeof (SHA512_CTX));
|
SHA512_Init(&ctx[0]);
|
||||||
SHA512_Init(mctx);
|
SHA512_Update(&ctx[0], buf, bytes);
|
||||||
SHA512_Update(mctx, buf, bytes);
|
SHA512_Final(cksum_buf, &ctx[0]);
|
||||||
SHA512_Final(cksum_buf, mctx);
|
|
||||||
free(mctx);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
pos[0] = buf;
|
|
||||||
len[0] = bytes/2;
|
/*
|
||||||
pos[1] = buf + bytes/2;
|
* Do first level hashes in parallel.
|
||||||
len[1] = bytes - bytes/2;
|
*/
|
||||||
|
_bytes = (bytes / BLKSZ) * BLKSZ;
|
||||||
|
rem = bytes - _bytes;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
# pragma omp parallel for
|
# pragma omp parallel for
|
||||||
#endif
|
#endif
|
||||||
for(i = 0; i < 2; ++i)
|
for(i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
SHA512_CTX ctx;
|
uint64_t byt;
|
||||||
SHA512_Init(&ctx);
|
|
||||||
SHA512_Update(&ctx, pos[i], len[i]);
|
byt = i * BLKSZ;
|
||||||
SHA512_Final(cksum[i], &ctx);
|
SHA512_Init(&ctx[i]);
|
||||||
|
while (byt < _bytes) {
|
||||||
|
SHA512_Update(&ctx[i], buf + byt, BLKSZ);
|
||||||
|
byt += 4 * BLKSZ;
|
||||||
|
}
|
||||||
|
if (i>0)
|
||||||
|
SHA512_Final(cksum[i], &ctx[i]);
|
||||||
}
|
}
|
||||||
mctx = (SHA512_CTX *)malloc(sizeof (SHA512_CTX));
|
if (rem > 0) {
|
||||||
SHA512_Init(mctx);
|
SHA512_Update(&ctx[0], buf + bytes - rem, rem);
|
||||||
SHA512_Update(mctx, cksum, 2 * 64);
|
}
|
||||||
SHA512_Final(cksum_buf, mctx);
|
SHA512_Final(cksum[0], &ctx[0]);
|
||||||
free(mctx);
|
|
||||||
|
/*
|
||||||
|
* Second level hashes.
|
||||||
|
*/
|
||||||
|
SHA512_Init(&ctx[0]);
|
||||||
|
SHA512_Init(&ctx[1]);
|
||||||
|
SHA512_Update(&ctx[0], &cksum[0], 2 * 32);
|
||||||
|
SHA512_Update(&ctx[1], &cksum[1], 2 * 32);
|
||||||
|
SHA512_Final(cksum[4], &ctx[0]);
|
||||||
|
SHA512_Final(cksum[5], &ctx[1]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Final hash.
|
||||||
|
*/
|
||||||
|
SHA512_Init(&ctx[0]);
|
||||||
|
SHA512_Update(&ctx[0], &cksum[4], 2 * 32);
|
||||||
|
SHA512_Final(cksum_buf, &ctx[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -152,43 +205,64 @@ opt_SHA512t256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
void
|
void
|
||||||
opt_SHA512t256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
opt_SHA512t256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
{
|
{
|
||||||
uchar_t *pos[2];
|
uchar_t cksum[6][32];
|
||||||
uint64_t len[2];
|
SHA512_Context ctx[4];
|
||||||
uchar_t cksum[2][32];
|
int i, rem;
|
||||||
int i;
|
uint64_t _bytes;
|
||||||
SHA512_Context *mctx;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
||||||
* If not then just do a simple serial hashing.
|
* If not then just do a simple serial hashing.
|
||||||
*/
|
*/
|
||||||
if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
|
if (bytes <= BLKSZ * 2) {
|
||||||
mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
|
opt_SHA512t256_Init(&ctx[0]);
|
||||||
opt_SHA512t256_Init(mctx);
|
opt_SHA512t256_Update(&ctx[0], buf, bytes);
|
||||||
opt_SHA512t256_Update(mctx, buf, bytes);
|
opt_SHA512t256_Final(&ctx[0], cksum_buf);
|
||||||
opt_SHA512t256_Final(mctx, cksum_buf);
|
|
||||||
free(mctx);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
pos[0] = buf;
|
|
||||||
len[0] = bytes/2;
|
/*
|
||||||
pos[1] = buf + bytes/2;
|
* Do first level hashes in parallel.
|
||||||
len[1] = bytes - bytes/2;
|
*/
|
||||||
|
_bytes = (bytes / BLKSZ) * BLKSZ;
|
||||||
|
rem = bytes - _bytes;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
# pragma omp parallel for
|
# pragma omp parallel for
|
||||||
#endif
|
#endif
|
||||||
for(i = 0; i < 2; ++i)
|
for(i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
SHA512_Context ctx;
|
uint64_t byt;
|
||||||
opt_SHA512t256_Init(&ctx);
|
|
||||||
opt_SHA512t256_Update(&ctx, pos[i], len[i]);
|
byt = i * BLKSZ;
|
||||||
opt_SHA512t256_Final(&ctx, cksum[i]);
|
opt_SHA512t256_Init(&ctx[i]);
|
||||||
|
while (byt < _bytes) {
|
||||||
|
opt_SHA512t256_Update(&ctx[i], buf + byt, BLKSZ);
|
||||||
|
byt += 4 * BLKSZ;
|
||||||
|
}
|
||||||
|
if (i>0)
|
||||||
|
opt_SHA512t256_Final(&ctx[i], cksum[i]);
|
||||||
}
|
}
|
||||||
mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
|
if (rem > 0) {
|
||||||
opt_SHA512t256_Init(mctx);
|
opt_SHA512t256_Update(&ctx[0], buf + bytes - rem, rem);
|
||||||
opt_SHA512t256_Update(mctx, cksum, 2 * 32);
|
}
|
||||||
opt_SHA512t256_Final(mctx, cksum_buf);
|
opt_SHA512t256_Final(&ctx[0], cksum[0]);
|
||||||
free(mctx);
|
|
||||||
|
/*
|
||||||
|
* Second level hashes.
|
||||||
|
*/
|
||||||
|
opt_SHA512t256_Init(&ctx[0]);
|
||||||
|
opt_SHA512t256_Init(&ctx[1]);
|
||||||
|
opt_SHA512t256_Update(&ctx[0], &cksum[0], 2 * 32);
|
||||||
|
opt_SHA512t256_Update(&ctx[1], &cksum[1], 2 * 32);
|
||||||
|
opt_SHA512t256_Final(&ctx[0], cksum[4]);
|
||||||
|
opt_SHA512t256_Final(&ctx[1], cksum[5]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Final hash.
|
||||||
|
*/
|
||||||
|
opt_SHA512t256_Init(&ctx[0]);
|
||||||
|
opt_SHA512t256_Update(&ctx[0], &cksum[4], 2 * 32);
|
||||||
|
opt_SHA512t256_Final(&ctx[0], cksum_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -204,42 +278,62 @@ opt_SHA512(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
void
|
void
|
||||||
opt_SHA512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
opt_SHA512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
{
|
{
|
||||||
uchar_t *pos[2];
|
uchar_t cksum[6][64];
|
||||||
uint64_t len[2];
|
SHA512_Context ctx[4];
|
||||||
uchar_t cksum[2][64];
|
int i, rem;
|
||||||
int i;
|
uint64_t _bytes;
|
||||||
SHA512_Context *mctx;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
||||||
* If not then just do a simple serial hashing.
|
* If not then just do a simple serial hashing.
|
||||||
*/
|
*/
|
||||||
if (bytes / 2 <= SHA512_BLOCK_SIZE * 4) {
|
if (bytes <= BLKSZ * 2) {
|
||||||
mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
|
opt_SHA512_Init(&ctx[0]);
|
||||||
opt_SHA512_Init(mctx);
|
opt_SHA512_Update(&ctx[0], buf, bytes);
|
||||||
opt_SHA512_Update(mctx, buf, bytes);
|
opt_SHA512_Final(&ctx[0], cksum_buf);
|
||||||
opt_SHA512_Final(mctx, cksum_buf);
|
|
||||||
free(mctx);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
pos[0] = buf;
|
|
||||||
len[0] = bytes/2;
|
/*
|
||||||
pos[1] = buf + bytes/2;
|
* Do first level hashes in parallel.
|
||||||
len[1] = bytes - bytes/2;
|
*/
|
||||||
|
_bytes = (bytes / BLKSZ) * BLKSZ;
|
||||||
|
rem = bytes - _bytes;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
# pragma omp parallel for
|
# pragma omp parallel for
|
||||||
#endif
|
#endif
|
||||||
for(i = 0; i < 2; ++i)
|
for(i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
SHA512_Context ctx;
|
uint64_t byt;
|
||||||
opt_SHA512_Init(&ctx);
|
|
||||||
opt_SHA512_Update(&ctx, pos[i], len[i]);
|
|
||||||
opt_SHA512_Final(&ctx, cksum[i]);
|
|
||||||
}
|
|
||||||
mctx = (SHA512_Context *)malloc(sizeof (SHA512_Context));
|
|
||||||
opt_SHA512_Init(mctx);
|
|
||||||
opt_SHA512_Update(mctx, cksum, 2 * 64);
|
|
||||||
opt_SHA512_Final(mctx, cksum_buf);
|
|
||||||
free(mctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
byt = i * BLKSZ;
|
||||||
|
opt_SHA512_Init(&ctx[i]);
|
||||||
|
while (byt < _bytes) {
|
||||||
|
opt_SHA512_Update(&ctx[i], buf + byt, BLKSZ);
|
||||||
|
byt += 4 * BLKSZ;
|
||||||
|
}
|
||||||
|
if (i>0)
|
||||||
|
opt_SHA512_Final(&ctx[i], cksum[i]);
|
||||||
|
}
|
||||||
|
if (rem > 0) {
|
||||||
|
opt_SHA512_Update(&ctx[0], buf + bytes - rem, rem);
|
||||||
|
}
|
||||||
|
opt_SHA512_Final(&ctx[0], cksum[0]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Second level hashes.
|
||||||
|
*/
|
||||||
|
opt_SHA512_Init(&ctx[0]);
|
||||||
|
opt_SHA512_Init(&ctx[1]);
|
||||||
|
opt_SHA512_Update(&ctx[0], &cksum[0], 2 * 64);
|
||||||
|
opt_SHA512_Update(&ctx[1], &cksum[1], 2 * 64);
|
||||||
|
opt_SHA512_Final(&ctx[0], cksum[4]);
|
||||||
|
opt_SHA512_Final(&ctx[1], cksum[5]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Final hash.
|
||||||
|
*/
|
||||||
|
opt_SHA512_Init(&ctx[0]);
|
||||||
|
opt_SHA512_Update(&ctx[0], &cksum[4], 2 * 64);
|
||||||
|
opt_SHA512_Final(&ctx[0], cksum_buf);
|
||||||
|
}
|
||||||
|
|
|
@ -30,10 +30,21 @@
|
||||||
#include <utils.h>
|
#include <utils.h>
|
||||||
|
|
||||||
#define KECCAK_BLOCK_SIZE 1024
|
#define KECCAK_BLOCK_SIZE 1024
|
||||||
|
#define BLKSZ (2048)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Helper functions for single-call SHA3 (Keccak) hashing. Both serial
|
* Helper functions for single-call SHA3 (Keccak) hashing. Both serial
|
||||||
* and parallel versions are provided.
|
* and parallel versions are provided. Parallel versions use 2-stage
|
||||||
|
* Merkle Tree hashing.
|
||||||
|
*
|
||||||
|
* At the leaf level data is split into BLKSZ blocks and 4 threads
|
||||||
|
* compute 4 hashes of interleaved block streams. At 2nd level two
|
||||||
|
* new hashes are generated from hashing the 2 pairs of hash values.
|
||||||
|
* In the final stage the 2 hash values are hashed to the final digest.
|
||||||
|
*
|
||||||
|
* References:
|
||||||
|
* http://eprint.iacr.org/2012/476.pdf
|
||||||
|
* http://gva.noekeon.org/papers/bdpv09tree.html
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int
|
int
|
||||||
|
@ -45,32 +56,69 @@ Keccak256(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
int
|
int
|
||||||
Keccak256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
Keccak256_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
{
|
{
|
||||||
uchar_t *pos[2];
|
uchar_t cksum[6][32];
|
||||||
uint64_t len[2];
|
hashState ctx[4];
|
||||||
uchar_t cksum[2][32];
|
int i, rem, rv[4];
|
||||||
int i, rv[2];
|
uint64_t _bytes;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
||||||
* If not then just do a simple serial hashing.
|
* If not then just do a simple serial hashing.
|
||||||
*/
|
*/
|
||||||
if (bytes / 2 <= KECCAK_BLOCK_SIZE * 2) {
|
if (bytes <= BLKSZ) {
|
||||||
return (Keccak_Hash(256, buf, bytes * 8, cksum_buf));
|
return (Keccak_Hash(256, buf, bytes * 8, cksum_buf));
|
||||||
}
|
}
|
||||||
pos[0] = buf;
|
|
||||||
len[0] = bytes/2;
|
/*
|
||||||
pos[1] = buf + bytes/2;
|
* Do first level hashes in parallel.
|
||||||
len[1] = bytes - bytes/2;
|
*/
|
||||||
|
for (i = 0; i < 4; ++i) rv[i] = 0;
|
||||||
|
_bytes = (bytes / BLKSZ) * BLKSZ;
|
||||||
|
rem = bytes - _bytes;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
# pragma omp parallel for
|
# pragma omp parallel for
|
||||||
#endif
|
#endif
|
||||||
for(i = 0; i < 2; ++i)
|
for(i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
rv[i] = Keccak_Hash(256, pos[i], len[i] * 8, cksum[i]);
|
uint64_t byt;
|
||||||
|
|
||||||
|
byt = i * BLKSZ;
|
||||||
|
rv[i] |= Keccak_Init(&ctx[i], 256);
|
||||||
|
while (byt < _bytes) {
|
||||||
|
rv[i] |= Keccak_Update(&ctx[i], buf + byt, BLKSZ * 8);
|
||||||
|
byt += 4 * BLKSZ;
|
||||||
|
}
|
||||||
|
if (i>0)
|
||||||
|
rv[i] |= Keccak_Final(&ctx[i], cksum[i]);
|
||||||
}
|
}
|
||||||
if (rv[0] != 0 || rv[1] != 0)
|
if (rem > 0) {
|
||||||
return (-1);
|
rv[0] |= Keccak_Update(&ctx[0], buf + bytes - rem, rem * 8);
|
||||||
return (Keccak_Hash(256, (const BitSequence *)cksum, 2 * 32 * 8, cksum_buf));
|
}
|
||||||
|
rv[0] |= Keccak_Final(&ctx[0], cksum[0]);
|
||||||
|
|
||||||
|
for (i = 0; i < 4; ++i) if (rv[i] != 0) return (-1);
|
||||||
|
rv[0] = 0;
|
||||||
|
rv[1] = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Second level hashes.
|
||||||
|
*/
|
||||||
|
rv[0] |= Keccak_Init(&ctx[0], 256);
|
||||||
|
rv[1] |= Keccak_Init(&ctx[1], 256);
|
||||||
|
rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[0], 2 * 32 * 8);
|
||||||
|
rv[1] |= Keccak_Update(&ctx[1], (const BitSequence *)&cksum[1], 2 * 32 * 8);
|
||||||
|
rv[0] |= Keccak_Final(&ctx[0], cksum[4]);
|
||||||
|
rv[1] |= Keccak_Final(&ctx[1], cksum[5]);
|
||||||
|
for (i = 0; i < 2; ++i) if (rv[i] != 0) return (-1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Final hash.
|
||||||
|
*/
|
||||||
|
rv[0] = 0;
|
||||||
|
rv[0] |= Keccak_Init(&ctx[0], 256);
|
||||||
|
rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[4], 2 * 32 * 8);
|
||||||
|
rv[0] |= Keccak_Final(&ctx[0], cksum_buf);
|
||||||
|
return (rv[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
@ -82,30 +130,67 @@ Keccak512(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
int
|
int
|
||||||
Keccak512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
Keccak512_par(uchar_t *cksum_buf, uchar_t *buf, uint64_t bytes)
|
||||||
{
|
{
|
||||||
uchar_t *pos[2];
|
uchar_t cksum[6][64];
|
||||||
uint64_t len[2];
|
hashState ctx[4];
|
||||||
uchar_t cksum[2][64];
|
int i, rem, rv[4];
|
||||||
int i, rv[2];
|
uint64_t _bytes;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
* Is it worth doing the overhead of parallelism ? Buffer large enough ?
|
||||||
* If not then just do a simple serial hashing.
|
* If not then just do a simple serial hashing.
|
||||||
*/
|
*/
|
||||||
if (bytes / 2 <= KECCAK_BLOCK_SIZE * 2) {
|
if (bytes <= BLKSZ) {
|
||||||
return (Keccak_Hash(512, buf, bytes * 8, cksum_buf));
|
return (Keccak_Hash(512, buf, bytes * 8, cksum_buf));
|
||||||
}
|
}
|
||||||
pos[0] = buf;
|
|
||||||
len[0] = bytes/2;
|
/*
|
||||||
pos[1] = buf + bytes/2;
|
* Do first level hashes in parallel.
|
||||||
len[1] = bytes - bytes/2;
|
*/
|
||||||
|
for (i = 0; i < 4; ++i) rv[i] = 0;
|
||||||
|
_bytes = (bytes / BLKSZ) * BLKSZ;
|
||||||
|
rem = bytes - _bytes;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
# pragma omp parallel for
|
# pragma omp parallel for
|
||||||
#endif
|
#endif
|
||||||
for(i = 0; i < 2; ++i)
|
for(i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
rv[i] = Keccak_Hash(512, pos[i], len[i] * 8, cksum[i]);
|
uint64_t byt;
|
||||||
|
|
||||||
|
byt = i * BLKSZ;
|
||||||
|
rv[i] |= Keccak_Init(&ctx[i], 512);
|
||||||
|
while (byt < _bytes) {
|
||||||
|
rv[i] |= Keccak_Update(&ctx[i], buf + byt, BLKSZ * 8);
|
||||||
|
byt += 4 * BLKSZ;
|
||||||
|
}
|
||||||
|
if (i>0)
|
||||||
|
rv[i] |= Keccak_Final(&ctx[i], cksum[i]);
|
||||||
}
|
}
|
||||||
if (rv[0] != 0 || rv[1] != 0)
|
if (rem > 0) {
|
||||||
return (-1);
|
rv[0] |= Keccak_Update(&ctx[0], buf + bytes - rem, rem * 8);
|
||||||
return (Keccak_Hash(512, (const BitSequence *)cksum, 2 * 64 * 8, cksum_buf));
|
}
|
||||||
|
rv[0] |= Keccak_Final(&ctx[0], cksum[0]);
|
||||||
|
|
||||||
|
for (i = 0; i < 4; ++i) if (rv[i] != 0) return (-1);
|
||||||
|
rv[0] = 0;
|
||||||
|
rv[1] = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Second level hashes.
|
||||||
|
*/
|
||||||
|
rv[0] |= Keccak_Init(&ctx[0], 512);
|
||||||
|
rv[1] |= Keccak_Init(&ctx[1], 512);
|
||||||
|
rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[0], 2 * 64 * 8);
|
||||||
|
rv[1] |= Keccak_Update(&ctx[1], (const BitSequence *)&cksum[1], 2 * 64 * 8);
|
||||||
|
rv[0] |= Keccak_Final(&ctx[0], cksum[4]);
|
||||||
|
rv[1] |= Keccak_Final(&ctx[1], cksum[5]);
|
||||||
|
for (i = 0; i < 2; ++i) if (rv[i] != 0) return (-1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Final hash.
|
||||||
|
*/
|
||||||
|
rv[0] = 0;
|
||||||
|
rv[0] |= Keccak_Init(&ctx[0], 512);
|
||||||
|
rv[0] |= Keccak_Update(&ctx[0], (const BitSequence *)&cksum[4], 2 * 64 * 8);
|
||||||
|
rv[0] |= Keccak_Final(&ctx[0], cksum_buf);
|
||||||
|
return (rv[0]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,7 +30,12 @@
|
||||||
- xxHash source repository : http://code.google.com/p/xxhash/
|
- xxHash source repository : http://code.google.com/p/xxhash/
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Modified by Moinak Ghosh for pcompress. The new hashing approach
|
||||||
|
* with interleaved blocks is derived from the following paper:
|
||||||
|
*
|
||||||
|
* http://eprint.iacr.org/2012/476.pdf
|
||||||
|
*/
|
||||||
|
|
||||||
//**************************************
|
//**************************************
|
||||||
// Tuning parameters
|
// Tuning parameters
|
||||||
|
@ -356,10 +361,10 @@ int CPUCAP_NM(XXH32_feed) (void* state_in, const void* input, int len)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 4-way SIMD calculations with 4 ints in two blocks for 2 accumulators will
|
* 4-way SIMD calculations with 4 ints in two blocks for 2 accumulators will
|
||||||
* interleave to some extent on a hyperthreaded processor providing 10% - 14%
|
* interleave to some extent on the superscalar x86 processor providing
|
||||||
* speedup over original xxhash depending on processor. We could have used
|
* 10% - 14% speedup over original xxhash depending on processor model. We
|
||||||
* aligned loads but we actually want the unaligned penalty. It helps to
|
* could have used aligned loads but we actually want the unaligned penalty.
|
||||||
* interleave better for a slight benefit over aligned loads here!
|
* It helps to interleave better for a slight benefit over aligned loads here!
|
||||||
*/
|
*/
|
||||||
do {
|
do {
|
||||||
__m128i mem = _mm_loadu_si128((__m128i *)p);
|
__m128i mem = _mm_loadu_si128((__m128i *)p);
|
||||||
|
|
Loading…
Reference in a new issue