Major changes to use Intel's optimized SHA512 code for SHA512 and SHA512/256.
Remove the earlier SHA256 code, which is slower than SHA512/256 on 64-bit CPUs. Use the HMAC from Allan Saddi's implementation for cleaner, faster code.
This commit is contained in:
parent
26bb137257
commit
43af97042a
15 changed files with 1391 additions and 1540 deletions
21
Makefile.in
21
Makefile.in
|
@ -102,12 +102,11 @@ SKEINHDRS = crypto/skein/brg_endian.h crypto/skein/SHA3api_ref.h \
|
|||
crypto/skein/skein_debug.h crypto/skein/skein_iv.h
|
||||
SKEINOBJS = $(SKEINSRCS:.c=.o)
|
||||
|
||||
SHA256_SRCS = crypto/sha2/sha256.c
|
||||
SHA256_HDRS = crypto/sha2/sha256.h
|
||||
SHA256ASM_SRCS = crypto/sha2/intel/sha256_avx1.asm \
|
||||
crypto/sha2/intel/sha256_sse4.asm
|
||||
SHA256ASM_OBJS = $(SHA256ASM_SRCS:.asm=.o)
|
||||
SHA256_OBJS = $(SHA256_SRCS:.c=.o)
|
||||
SHA2_SRCS = crypto/sha2/sha512.c
|
||||
SHA2_HDRS = crypto/sha2/sha512.h
|
||||
SHA2ASM_SRCS = crypto/sha2/intel/sha512_avx.asm crypto/sha2/intel/sha512_sse4.asm
|
||||
SHA2ASM_OBJS = $(SHA2ASM_SRCS:.asm=.o)
|
||||
SHA2_OBJS = $(SHA2_SRCS:.c=.o)
|
||||
|
||||
YASM = @YASM@ -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX
|
||||
LIBBSCWRAP = libbsc_compress.c
|
||||
|
@ -161,7 +160,7 @@ LDLIBS = -ldl -L./buildtmp -Wl,-R@LIBBZ2_DIR@ -lbz2 -L./buildtmp -Wl,-R@LIBZ_DIR
|
|||
-L./buildtmp -Wl,-R@OPENSSL_LIBDIR@ -lcrypto -lrt $(EXTRA_LDFLAGS)
|
||||
OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
|
||||
$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
|
||||
$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
|
||||
$(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
|
||||
$(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS)
|
||||
|
||||
DEBUG_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@
|
||||
|
@ -199,7 +198,7 @@ SSE3_OPT_FLAG = -mssse3
|
|||
SSE2_OPT_FLAG = -msse2
|
||||
|
||||
SKEIN_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
|
||||
SHA256_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
|
||||
SHA2_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
|
||||
KECCAK_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
|
||||
|
||||
all: $(PROG)
|
||||
|
@ -237,10 +236,10 @@ $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
|
|||
$(SKEINOBJS): $(SKEINSRCS) $(SKEINHDRS)
|
||||
$(COMPILE) $(SKEIN_FLAGS) $(@:.o=.c) -o $@
|
||||
|
||||
$(SHA256_OBJS): $(SHA256_SRCS) $(SHA256_HDRS)
|
||||
$(COMPILE) $(SHA256_FLAGS) $(@:.o=.c) -o $@
|
||||
$(SHA2_OBJS): $(SHA2_SRCS) $(SHA2_HDRS)
|
||||
$(COMPILE) $(SHA2_FLAGS) $(@:.o=.c) -o $@
|
||||
|
||||
$(SHA256ASM_OBJS): $(SHA256ASM_SRCS)
|
||||
$(SHA2ASM_OBJS): $(SHA2ASM_SRCS)
|
||||
$(YASM) -o $@ $(@:.o=.asm)
|
||||
|
||||
$(KECCAK_OBJS): $(KECCAK_SRCS) $(KECCAK_HDRS)
|
||||
|
|
8
config
8
config
|
@ -236,8 +236,8 @@ then
|
|||
# Minimum yasm version 1.1
|
||||
[ $major -lt 1 -o $minor -lt 1 ] && continue
|
||||
yasm=${bindir}/yasm
|
||||
sha256asmobjs='\$\(SHA256ASM_OBJS\)'
|
||||
sha256objs='\$\(SHA256_OBJS\)'
|
||||
sha256asmobjs='\$\(SHA2ASM_OBJS\)'
|
||||
sha256objs='\$\(SHA2_OBJS\)'
|
||||
fi
|
||||
done
|
||||
if [ "x${yasm}" = "x" ]
|
||||
|
@ -492,8 +492,8 @@ libbsclflagsvar="LIBBSCLFLAGS"
|
|||
libbscwrapobjvar="LIBBSCWRAPOBJ"
|
||||
libbscgenoptvar="LIBBSCGEN_OPT"
|
||||
libbsccppflagsvar="LIBBSCCPPFLAGS"
|
||||
sha256asmobjsvar="SHA256ASM_OBJS"
|
||||
sha256objsvar="SHA256_OBJS"
|
||||
sha256asmobjsvar="SHA2ASM_OBJS"
|
||||
sha256objsvar="SHA2_OBJS"
|
||||
yasmvar="YASM"
|
||||
fptr_flag_var="FPTR_FLAG"
|
||||
extra_opt_flags_var="EXTRA_OPT_FLAGS"
|
||||
|
|
|
@ -36,7 +36,8 @@
|
|||
#include <openssl/rand.h>
|
||||
#include <openssl/evp.h>
|
||||
#include <openssl/hmac.h>
|
||||
#include <sha256.h>
|
||||
//#include <sha256.h>
|
||||
#include <sha512.h>
|
||||
#include <crypto_aes.h>
|
||||
#include <KeccakNISTInterface.h>
|
||||
#include <utils.h>
|
||||
|
@ -46,7 +47,7 @@
|
|||
#define PROVIDER_OPENSSL 0
|
||||
#define PROVIDER_X64_OPT 1
|
||||
|
||||
static void init_sha256(void);
|
||||
static void init_sha512(void);
|
||||
static int geturandom_bytes(uchar_t rbytes[32]);
|
||||
/*
|
||||
* Checksum properties
|
||||
|
@ -66,9 +67,9 @@ static struct {
|
|||
{"SKEIN512", "512-bit SKEIN",
|
||||
CKSUM_SKEIN512, 64, 64, NULL},
|
||||
{"SHA256", "Intel's optimized (SSE,AVX) 256-bit SHA2 implementation for x86.",
|
||||
CKSUM_SHA256, 32, 32, init_sha256},
|
||||
CKSUM_SHA256, 32, 32, init_sha512},
|
||||
{"SHA512", "512-bit SHA2 from OpenSSL's crypto library.",
|
||||
CKSUM_SHA512, 64, 64, NULL},
|
||||
CKSUM_SHA512, 64, 64, init_sha512},
|
||||
{"KECCAK256", "Official 256-bit NIST SHA3 optimized implementation.",
|
||||
CKSUM_KECCAK256, 32, 32, NULL},
|
||||
{"KECCAK512", "Official 512-bit NIST SHA3 optimized implementation.",
|
||||
|
@ -190,18 +191,26 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes)
|
|||
SHA256_Update(&ctx, buf, bytes);
|
||||
SHA256_Final(cksum_buf, &ctx);
|
||||
} else {
|
||||
SHA256_Context ctx;
|
||||
SHA512_Context ctx;
|
||||
|
||||
opt_SHA256_Init(&ctx);
|
||||
opt_SHA256_Update(&ctx, buf, bytes);
|
||||
opt_SHA256_Final(&ctx, cksum_buf);
|
||||
opt_SHA512t256_Init(&ctx);
|
||||
opt_SHA512t256_Update(&ctx, buf, bytes);
|
||||
opt_SHA512t256_Final(&ctx, cksum_buf);
|
||||
}
|
||||
} else if (cksum == CKSUM_SHA512) {
|
||||
SHA512_CTX ctx;
|
||||
if (cksum_provider == PROVIDER_OPENSSL) {
|
||||
SHA512_CTX ctx;
|
||||
|
||||
SHA512_Init(&ctx);
|
||||
SHA512_Update(&ctx, buf, bytes);
|
||||
SHA512_Final(cksum_buf, &ctx);
|
||||
SHA512_Init(&ctx);
|
||||
SHA512_Update(&ctx, buf, bytes);
|
||||
SHA512_Final(cksum_buf, &ctx);
|
||||
} else {
|
||||
SHA512_Context ctx;
|
||||
|
||||
opt_SHA512_Init(&ctx);
|
||||
opt_SHA512_Update(&ctx, buf, bytes);
|
||||
opt_SHA512_Final(&ctx, cksum_buf);
|
||||
}
|
||||
|
||||
} else if (cksum == CKSUM_KECCAK256) {
|
||||
if (Keccak_Hash(256, buf, bytes * 8, cksum_buf) != 0)
|
||||
|
@ -219,7 +228,7 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes)
|
|||
}
|
||||
|
||||
static void
|
||||
init_sha256(void)
|
||||
init_sha512(void)
|
||||
{
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
cksum_provider = PROVIDER_OPENSSL;
|
||||
|
@ -227,7 +236,7 @@ init_sha256(void)
|
|||
#ifdef __x86_64__
|
||||
cksum_provider = PROVIDER_OPENSSL;
|
||||
if (proc_info.proc_type == PROC_X64_INTEL || proc_info.proc_type == PROC_X64_AMD) {
|
||||
if (opt_Init_SHA(&proc_info) == 0) {
|
||||
if (opt_Init_SHA512(&proc_info) == 0) {
|
||||
cksum_provider = PROVIDER_X64_OPT;
|
||||
}
|
||||
}
|
||||
|
@ -355,7 +364,7 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx)
|
|||
}
|
||||
mctx->mac_ctx_reinit = ctx;
|
||||
} else {
|
||||
HMAC_SHA256_Context *ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context));
|
||||
/* HMAC_SHA256_Context *ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context));
|
||||
if (!ctx) return (-1);
|
||||
opt_HMAC_SHA256_Init(ctx, actx->pkey, KEYLEN);
|
||||
mctx->mac_ctx = ctx;
|
||||
|
@ -366,26 +375,54 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx)
|
|||
return (-1);
|
||||
}
|
||||
memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA256_Context));
|
||||
mctx->mac_ctx_reinit = ctx;*/
|
||||
|
||||
HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
|
||||
if (!ctx) return (-1);
|
||||
opt_HMAC_SHA512t256_Init(ctx, actx->pkey, KEYLEN);
|
||||
mctx->mac_ctx = ctx;
|
||||
|
||||
ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
|
||||
if (!ctx) {
|
||||
free(mctx->mac_ctx);
|
||||
return (-1);
|
||||
}
|
||||
memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA512_Context));
|
||||
mctx->mac_ctx_reinit = ctx;
|
||||
}
|
||||
} else if (cksum == CKSUM_SHA512) {
|
||||
HMAC_CTX *ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX));
|
||||
if (!ctx) return (-1);
|
||||
HMAC_CTX_init(ctx);
|
||||
HMAC_Init_ex(ctx, actx->pkey, KEYLEN, EVP_sha512(), NULL);
|
||||
mctx->mac_ctx = ctx;
|
||||
if (cksum_provider == PROVIDER_OPENSSL) {
|
||||
HMAC_CTX *ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX));
|
||||
if (!ctx) return (-1);
|
||||
HMAC_CTX_init(ctx);
|
||||
HMAC_Init_ex(ctx, actx->pkey, KEYLEN, EVP_sha512(), NULL);
|
||||
mctx->mac_ctx = ctx;
|
||||
|
||||
ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX));
|
||||
if (!ctx) {
|
||||
free(mctx->mac_ctx);
|
||||
return (-1);
|
||||
ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX));
|
||||
if (!ctx) {
|
||||
free(mctx->mac_ctx);
|
||||
return (-1);
|
||||
}
|
||||
if (!HMAC_CTX_copy(ctx, (HMAC_CTX *)(mctx->mac_ctx))) {
|
||||
free(ctx);
|
||||
free(mctx->mac_ctx);
|
||||
return (-1);
|
||||
}
|
||||
mctx->mac_ctx_reinit = ctx;
|
||||
} else {
|
||||
HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
|
||||
if (!ctx) return (-1);
|
||||
opt_HMAC_SHA512_Init(ctx, actx->pkey, KEYLEN);
|
||||
mctx->mac_ctx = ctx;
|
||||
|
||||
ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
|
||||
if (!ctx) {
|
||||
free(mctx->mac_ctx);
|
||||
return (-1);
|
||||
}
|
||||
memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA512_Context));
|
||||
mctx->mac_ctx_reinit = ctx;
|
||||
}
|
||||
if (!HMAC_CTX_copy(ctx, (HMAC_CTX *)(mctx->mac_ctx))) {
|
||||
free(ctx);
|
||||
free(mctx->mac_ctx);
|
||||
return (-1);
|
||||
}
|
||||
mctx->mac_ctx_reinit = ctx;
|
||||
|
||||
} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
|
||||
hashState *ctx = (hashState *)malloc(sizeof (hashState));
|
||||
|
@ -423,16 +460,13 @@ hmac_reinit(mac_ctx_t *mctx)
|
|||
if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) {
|
||||
memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (Skein_512_Ctxt_t));
|
||||
|
||||
} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) {
|
||||
} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) {
|
||||
if (cksum_provider == PROVIDER_OPENSSL) {
|
||||
HMAC_CTX_copy((HMAC_CTX *)(mctx->mac_ctx),
|
||||
(HMAC_CTX *)(mctx->mac_ctx_reinit));
|
||||
} else {
|
||||
memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (HMAC_SHA256_Context));
|
||||
memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (HMAC_SHA512_Context));
|
||||
}
|
||||
} else if (cksum == CKSUM_SHA512) {
|
||||
HMAC_CTX_copy((HMAC_CTX *)(mctx->mac_ctx), (HMAC_CTX *)(mctx->mac_ctx_reinit));
|
||||
|
||||
} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
|
||||
memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (hashState));
|
||||
} else {
|
||||
|
@ -458,15 +492,19 @@ hmac_update(mac_ctx_t *mctx, uchar_t *data, uint64_t len)
|
|||
HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len);
|
||||
#endif
|
||||
} else {
|
||||
opt_HMAC_SHA256_Update((HMAC_SHA256_Context *)(mctx->mac_ctx), data, len);
|
||||
opt_HMAC_SHA512t256_Update((HMAC_SHA512_Context *)(mctx->mac_ctx), data, len);
|
||||
}
|
||||
} else if (cksum == CKSUM_SHA512) {
|
||||
if (cksum_provider == PROVIDER_OPENSSL) {
|
||||
#ifndef __OSSL_OLD__
|
||||
if (HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len) == 0)
|
||||
return (-1);
|
||||
if (HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len) == 0)
|
||||
return (-1);
|
||||
#else
|
||||
HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len);
|
||||
HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len);
|
||||
#endif
|
||||
} else {
|
||||
opt_HMAC_SHA512_Update((HMAC_SHA512_Context *)(mctx->mac_ctx), data, len);
|
||||
}
|
||||
|
||||
} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
|
||||
// Keccak takes data length in bits so we have to scale
|
||||
|
@ -503,12 +541,16 @@ hmac_final(mac_ctx_t *mctx, uchar_t *hash, unsigned int *len)
|
|||
if (cksum_provider == PROVIDER_OPENSSL) {
|
||||
HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len);
|
||||
} else {
|
||||
opt_HMAC_SHA256_Final((HMAC_SHA256_Context *)(mctx->mac_ctx), hash);
|
||||
opt_HMAC_SHA512t256_Final((HMAC_SHA512_Context *)(mctx->mac_ctx), hash);
|
||||
*len = 32;
|
||||
}
|
||||
} else if (cksum == CKSUM_SHA512) {
|
||||
HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len);
|
||||
|
||||
if (cksum_provider == PROVIDER_OPENSSL) {
|
||||
HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len);
|
||||
} else {
|
||||
opt_HMAC_SHA512_Final((HMAC_SHA512_Context *)(mctx->mac_ctx), hash);
|
||||
*len = 64;
|
||||
}
|
||||
} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
|
||||
if (Keccak_Final((hashState *)(mctx->mac_ctx), hash) != 0)
|
||||
return (-1);
|
||||
|
@ -531,18 +573,14 @@ hmac_cleanup(mac_ctx_t *mctx)
|
|||
memset(mctx->mac_ctx, 0, sizeof (Skein_512_Ctxt_t));
|
||||
memset(mctx->mac_ctx_reinit, 0, sizeof (Skein_512_Ctxt_t));
|
||||
|
||||
} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) {
|
||||
} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) {
|
||||
if (cksum_provider == PROVIDER_OPENSSL) {
|
||||
HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx));
|
||||
HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx_reinit));
|
||||
} else {
|
||||
memset(mctx->mac_ctx, 0, sizeof (HMAC_SHA256_Context));
|
||||
memset(mctx->mac_ctx_reinit, 0, sizeof (HMAC_SHA256_Context));
|
||||
memset(mctx->mac_ctx, 0, sizeof (HMAC_SHA512_Context));
|
||||
memset(mctx->mac_ctx_reinit, 0, sizeof (HMAC_SHA512_Context));
|
||||
}
|
||||
} else if (cksum == CKSUM_SHA512) {
|
||||
HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx));
|
||||
HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx_reinit));
|
||||
|
||||
} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
|
||||
memset(mctx->mac_ctx, 0, sizeof (hashState));
|
||||
memset(mctx->mac_ctx_reinit, 0, sizeof (hashState));
|
||||
|
|
|
@ -33,7 +33,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
#define MAX_PW_LEN 16
|
||||
#define CKSUM_MASK 0x800
|
||||
#define CKSUM_MASK 0x700
|
||||
#define CKSUM_MAX_BYTES 64
|
||||
#define DEFAULT_CKSUM "SKEIN256"
|
||||
|
||||
|
|
84
crypto/sha2/_hmac.c
Normal file
84
crypto/sha2/_hmac.c
Normal file
|
@ -0,0 +1,84 @@
|
|||
/*-
|
||||
* Copyright (c) 2010, 2011 Allan Saddi <allan@saddi.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
void
|
||||
HMAC_INIT(HMAC_CONTEXT *ctxt, const void *key, size_t keyLen)
|
||||
{
|
||||
HASH_CONTEXT keyCtxt;
|
||||
unsigned int i;
|
||||
uint8_t pkey[HASH_BLOCK_SIZE], okey[HASH_BLOCK_SIZE], ikey[HASH_BLOCK_SIZE];
|
||||
|
||||
/* Ensure key is zero-padded */
|
||||
memset(pkey, 0, sizeof(pkey));
|
||||
|
||||
if (keyLen > sizeof(pkey)) {
|
||||
/* Hash key if > HASH_BLOCK_SIZE */
|
||||
HASH_INIT(&keyCtxt);
|
||||
HASH_UPDATE(&keyCtxt, key, keyLen);
|
||||
HASH_FINAL(&keyCtxt, pkey);
|
||||
}
|
||||
else {
|
||||
memcpy(pkey, key, keyLen);
|
||||
}
|
||||
|
||||
/* XOR with opad, ipad */
|
||||
for (i = 0; i < sizeof(okey); i++) {
|
||||
okey[i] = pkey[i] ^ 0x5c;
|
||||
}
|
||||
for (i = 0; i < sizeof(ikey); i++) {
|
||||
ikey[i] = pkey[i] ^ 0x36;
|
||||
}
|
||||
|
||||
/* Initialize hash contexts */
|
||||
HASH_INIT(&ctxt->outer);
|
||||
HASH_UPDATE(&ctxt->outer, okey, sizeof(okey));
|
||||
HASH_INIT(&ctxt->inner);
|
||||
HASH_UPDATE(&ctxt->inner, ikey, sizeof(ikey));
|
||||
|
||||
/* Burn the stack */
|
||||
memset(ikey, 0, sizeof(ikey));
|
||||
memset(okey, 0, sizeof(okey));
|
||||
memset(pkey, 0, sizeof(pkey));
|
||||
memset(&keyCtxt, 0, sizeof(keyCtxt));
|
||||
}
|
||||
|
||||
void
|
||||
HMAC_UPDATE(HMAC_CONTEXT *ctxt, const void *data, size_t len)
|
||||
{
|
||||
HASH_UPDATE(&ctxt->inner, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
HMAC_FINAL(HMAC_CONTEXT *ctxt, uint8_t hmac[HASH_SIZE])
|
||||
{
|
||||
uint8_t ihash[HASH_SIZE];
|
||||
|
||||
HASH_FINAL(&ctxt->inner, ihash);
|
||||
HASH_UPDATE(&ctxt->outer, ihash, sizeof(ihash));
|
||||
HASH_FINAL(&ctxt->outer, hmac);
|
||||
|
||||
memset(ihash, 0, sizeof(ihash));
|
||||
}
|
|
@ -1,577 +0,0 @@
|
|||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright 2012 Intel Corporation All Rights Reserved.
|
||||
;
|
||||
; The source code contained or described herein and all documents
|
||||
; related to the source code ("Material") are owned by Intel Corporation
|
||||
; or its suppliers or licensors. Title to the Material remains with
|
||||
; Intel Corporation or its suppliers and licensors. The Material may
|
||||
; contain trade secrets and proprietary and confidential information of
|
||||
; Intel Corporation and its suppliers and licensors, and is protected by
|
||||
; worldwide copyright and trade secret laws and treaty provisions. No
|
||||
; part of the Material may be used, copied, reproduced, modified,
|
||||
; published, uploaded, posted, transmitted, distributed, or disclosed in
|
||||
; any way without Intel's prior express written permission.
|
||||
;
|
||||
; No license under any patent, copyright, trade secret or other
|
||||
; intellectual property right is granted to or conferred upon you by
|
||||
; disclosure or delivery of the Materials, either expressly, by
|
||||
; implication, inducement, estoppel or otherwise. Any license under such
|
||||
; intellectual property rights must be express and approved by Intel in
|
||||
; writing.
|
||||
;
|
||||
; Unless otherwise agreed by Intel in writing, you may not remove or
|
||||
; alter this notice or any other notice embedded in Materials by Intel
|
||||
; or Intel's suppliers or licensors in any way.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; Example YASM command lines:
|
||||
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
|
||||
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; This code is described in an Intel White-Paper:
|
||||
; "Fast SHA-256 Implementations on Intel Architecture Processors"
|
||||
;
|
||||
; To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
; and search for that title.
|
||||
; The paper is expected to be released roughly at the end of April, 2012
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; This code schedules 1 block at a time, with 4 lanes per block
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define VMOVDQ vmovdqu ;; assume buffers not aligned
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
|
||||
|
||||
; addm [mem], reg
|
||||
; Add reg to mem using reg-mem add and store
|
||||
%macro addm 2
|
||||
add %2, %1
|
||||
mov %1, %2
|
||||
%endm
|
||||
|
||||
%macro MY_ROR 2
|
||||
shld %1,%1,(32-(%2))
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
|
||||
; Load xmm with mem and byte swap each dword
|
||||
%macro COPY_XMM_AND_BSWAP 3
|
||||
VMOVDQ %1, %2
|
||||
vpshufb %1, %1, %3
|
||||
%endmacro
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define X0 xmm4
|
||||
%define X1 xmm5
|
||||
%define X2 xmm6
|
||||
%define X3 xmm7
|
||||
|
||||
%define XTMP0 xmm0
|
||||
%define XTMP1 xmm1
|
||||
%define XTMP2 xmm2
|
||||
%define XTMP3 xmm3
|
||||
%define XTMP4 xmm8
|
||||
%define XFER xmm9
|
||||
%define XTMP5 xmm11
|
||||
|
||||
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
|
||||
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
|
||||
%define BYTE_FLIP_MASK xmm13
|
||||
|
||||
%ifdef LINUX
|
||||
%define NUM_BLKS rdx ; 3rd arg
|
||||
%define CTX rsi ; 2nd arg
|
||||
%define INP rdi ; 1st arg
|
||||
|
||||
%define SRND rdi ; clobbers INP
|
||||
%define c ecx
|
||||
%define d r8d
|
||||
%define e edx
|
||||
%else
|
||||
%define NUM_BLKS r8 ; 3rd arg
|
||||
%define CTX rdx ; 2nd arg
|
||||
%define INP rcx ; 1st arg
|
||||
|
||||
%define SRND rcx ; clobbers INP
|
||||
%define c edi
|
||||
%define d esi
|
||||
%define e r8d
|
||||
|
||||
%endif
|
||||
%define TBL rbp
|
||||
%define a eax
|
||||
%define b ebx
|
||||
|
||||
%define f r9d
|
||||
%define g r10d
|
||||
%define h r11d
|
||||
|
||||
%define y0 r13d
|
||||
%define y1 r14d
|
||||
%define y2 r15d
|
||||
|
||||
|
||||
_INP_END_SIZE equ 8
|
||||
_INP_SIZE equ 8
|
||||
_XFER_SIZE equ 8
|
||||
%ifdef LINUX
|
||||
_XMM_SAVE_SIZE equ 0
|
||||
%else
|
||||
_XMM_SAVE_SIZE equ 8*16
|
||||
%endif
|
||||
; STACK_SIZE plus pushes must be an odd multiple of 8
|
||||
_ALIGN_SIZE equ 8
|
||||
|
||||
_INP_END equ 0
|
||||
_INP equ _INP_END + _INP_END_SIZE
|
||||
_XFER equ _INP + _INP_SIZE
|
||||
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
|
||||
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
|
||||
|
||||
; rotate_Xs
|
||||
; Rotate values of symbols X0...X3
|
||||
%macro rotate_Xs 0
|
||||
%xdefine X_ X0
|
||||
%xdefine X0 X1
|
||||
%xdefine X1 X2
|
||||
%xdefine X2 X3
|
||||
%xdefine X3 X_
|
||||
%endm
|
||||
|
||||
; ROTATE_ARGS
|
||||
; Rotate values of symbols a...h
|
||||
%macro ROTATE_ARGS 0
|
||||
%xdefine TMP_ h
|
||||
%xdefine h g
|
||||
%xdefine g f
|
||||
%xdefine f e
|
||||
%xdefine e d
|
||||
%xdefine d c
|
||||
%xdefine c b
|
||||
%xdefine b a
|
||||
%xdefine a TMP_
|
||||
%endm
|
||||
|
||||
%macro FOUR_ROUNDS_AND_SCHED 0
|
||||
;; compute s0 four at a time and s1 two at a time
|
||||
;; compute W[-16] + W[-7] 4 at a time
|
||||
;vmovdqa XTMP0, X3
|
||||
mov y0, e ; y0 = e
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
;vmovdqa XTMP1, X1
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
xor y2, g ; y2 = f^g
|
||||
vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
;; compute s0
|
||||
vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
|
||||
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
|
||||
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
|
||||
vpsrld XTMP2, XTMP1, 7
|
||||
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
|
||||
vpslld XTMP3, XTMP1, (32-7)
|
||||
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
|
||||
vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7
|
||||
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
mov y0, e ; y0 = e
|
||||
mov y1, a ; y1 = a
|
||||
|
||||
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
|
||||
vpsrld XTMP2, XTMP1,18
|
||||
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor y2, g ; y2 = f^g
|
||||
|
||||
vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
|
||||
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
|
||||
vpslld XTMP1, XTMP1, (32-18)
|
||||
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
|
||||
vpxor XTMP3, XTMP3, XTMP1
|
||||
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
|
||||
vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
|
||||
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
|
||||
vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
|
||||
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
;; compute low s1
|
||||
vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
|
||||
|
||||
mov y0, e ; y0 = e
|
||||
mov y1, a ; y1 = a
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
|
||||
;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
|
||||
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
mov y2, f ; y2 = f
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
|
||||
vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
|
||||
|
||||
xor y2, g ; y2 = f^g
|
||||
|
||||
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
|
||||
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
|
||||
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
|
||||
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
vpxor XTMP2, XTMP2, XTMP3
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
|
||||
vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
;; compute high s1
|
||||
vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
|
||||
mov y0, e ; y0 = e
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
|
||||
vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
|
||||
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
xor y2, g ; y2 = f^g
|
||||
|
||||
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
|
||||
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
|
||||
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
|
||||
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
|
||||
vpxor XTMP2, XTMP2, XTMP3
|
||||
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
|
||||
vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
rotate_Xs
|
||||
%endm
|
||||
|
||||
;; input is [rsp + _XFER + %1 * 4]
|
||||
%macro DO_ROUND 1
|
||||
mov y0, e ; y0 = e
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
mov y2, f ; y2 = f
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor y2, g ; y2 = f^g
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
|
||||
;; arg 1 : pointer to input data
|
||||
;; arg 2 : pointer to digest
|
||||
;; arg 3 : Num blocks
|
||||
section .text
|
||||
global sha256_avx
|
||||
align 32
|
||||
;; sha256_avx: hash NUM_BLKS consecutive 64-byte blocks starting at INP and
;; update, in place, the eight 32-bit state words stored at CTX.
;; Prologue/epilogue: rbx, rbp, r13-r15 (plus rsi/rdi when LINUX is not
;; defined) are pushed, STACK_SIZE bytes of scratch are reserved, and on
;; non-LINUX builds xmm6-xmm13 are spilled to the _XMM_SAVE area.
;; NOTE(review): the register aliases and macros used here are defined
;; earlier in the file, outside this excerpt.
sha256_avx:
|
||||
push rbx
|
||||
%ifndef LINUX
|
||||
push rsi
|
||||
push rdi
|
||||
%endif
|
||||
push rbp
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
sub rsp,STACK_SIZE
|
||||
%ifndef LINUX
|
||||
vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
|
||||
vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
|
||||
vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
|
||||
vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
|
||||
vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
|
||||
vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
|
||||
vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
|
||||
vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
|
||||
%endif
|
||||
|
||||
shl NUM_BLKS, 6 ; convert block count to bytes (64 bytes per block)
|
||||
jz done_hash ; zero blocks: skip straight to the epilogue
|
||||
add NUM_BLKS, INP ; pointer to end of data
|
||||
mov [rsp + _INP_END], NUM_BLKS
|
||||
|
||||
;; load initial digest
|
||||
mov a,[4*0 + CTX]
|
||||
mov b,[4*1 + CTX]
|
||||
mov c,[4*2 + CTX]
|
||||
mov d,[4*3 + CTX]
|
||||
mov e,[4*4 + CTX]
|
||||
mov f,[4*5 + CTX]
|
||||
mov g,[4*6 + CTX]
|
||||
mov h,[4*7 + CTX]
|
||||
|
||||
;; shuffle masks are loaded once and reused for every block
vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
|
||||
vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
|
||||
vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
|
||||
|
||||
;; loop0: one iteration per 64-byte block
loop0:
|
||||
lea TBL,[K256 wrt rip] ; restart at the first round constant
|
||||
|
||||
;; byte swap first 16 dwords
|
||||
COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
|
||||
|
||||
mov [rsp + _INP], INP ; spill INP; it is reloaded after the rounds
|
||||
|
||||
;; schedule 48 input dwords, by doing 3 rounds of 16 each
|
||||
mov SRND, 3
|
||||
align 16
|
||||
loop1:
|
||||
vpaddd XFER, X0, [TBL + 0*16] ; XFER = W + K for the next four rounds
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddd XFER, X0, [TBL + 1*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddd XFER, X0, [TBL + 2*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddd XFER, X0, [TBL + 3*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
add TBL, 4*16 ; advance to the next 16 round constants
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
sub SRND, 1
|
||||
jne loop1
|
||||
|
||||
;; last 16 rounds: no more scheduling, 8 rounds per loop2 iteration
mov SRND, 2
|
||||
loop2:
|
||||
vpaddd XFER, X0, [TBL + 0*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
|
||||
vpaddd XFER, X1, [TBL + 1*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
add TBL, 2*16
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
|
||||
vmovdqa X0, X2 ; second pass consumes the words held in X2/X3
|
||||
vmovdqa X1, X3
|
||||
|
||||
sub SRND, 1
|
||||
jne loop2
|
||||
|
||||
|
||||
;; add the working variables back into the digest at CTX
addm [4*0 + CTX],a
|
||||
addm [4*1 + CTX],b
|
||||
addm [4*2 + CTX],c
|
||||
addm [4*3 + CTX],d
|
||||
addm [4*4 + CTX],e
|
||||
addm [4*5 + CTX],f
|
||||
addm [4*6 + CTX],g
|
||||
addm [4*7 + CTX],h
|
||||
|
||||
mov INP, [rsp + _INP]
|
||||
add INP, 64 ; next block
|
||||
cmp INP, [rsp + _INP_END] ; stop once the end-of-data pointer is reached
|
||||
jne loop0
|
||||
|
||||
done_hash:
|
||||
%ifndef LINUX
|
||||
vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
|
||||
vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
|
||||
vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
|
||||
vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
|
||||
vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
|
||||
vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
|
||||
vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
|
||||
vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
|
||||
%endif
|
||||
|
||||
|
||||
add rsp, STACK_SIZE
|
||||
|
||||
;; restore callee-saved registers in reverse push order
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop rbp
|
||||
%ifndef LINUX
|
||||
pop rdi
|
||||
pop rsi
|
||||
%endif
|
||||
pop rbx
|
||||
|
||||
ret
|
||||
|
||||
|
||||
section .data
|
||||
align 64
|
||||
K256:
|
||||
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
; shuffle xBxA -> 00BA
|
||||
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
; shuffle xDxC -> DC00
|
||||
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
|
|
@ -1,535 +0,0 @@
|
|||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright 2012 Intel Corporation All Rights Reserved.
|
||||
;
|
||||
; The source code contained or described herein and all documents
|
||||
; related to the source code ("Material") are owned by Intel Corporation
|
||||
; or its suppliers or licensors. Title to the Material remains with
|
||||
; Intel Corporation or its suppliers and licensors. The Material may
|
||||
; contain trade secrets and proprietary and confidential information of
|
||||
; Intel Corporation and its suppliers and licensors, and is protected by
|
||||
; worldwide copyright and trade secret laws and treaty provisions. No
|
||||
; part of the Material may be used, copied, reproduced, modified,
|
||||
; published, uploaded, posted, transmitted, distributed, or disclosed in
|
||||
; any way without Intel's prior express written permission.
|
||||
;
|
||||
; No license under any patent, copyright, trade secret or other
|
||||
; intellectual property right is granted to or conferred upon you by
|
||||
; disclosure or delivery of the Materials, either expressly, by
|
||||
; implication, inducement, estoppel or otherwise. Any license under such
|
||||
; intellectual property rights must be express and approved by Intel in
|
||||
; writing.
|
||||
;
|
||||
; Unless otherwise agreed by Intel in writing, you may not remove or
|
||||
; alter this notice or any other notice embedded in Materials by Intel
|
||||
; or Intel's suppliers or licensors in any way.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; Example YASM command lines:
|
||||
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
|
||||
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; This code is described in an Intel White-Paper:
|
||||
; "Fast SHA-256 Implementations on Intel Architecture Processors"
|
||||
;
|
||||
; To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
; and search for that title.
|
||||
; The paper is expected to be released roughly at the end of April, 2012
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; This code schedules 1 block at a time, with 4 lanes per block
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define MOVDQ movdqu ;; assume buffers not aligned
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
|
||||
|
||||
; addm [mem], reg
|
||||
; Accumulate through the register: reg = reg + [mem], then [mem] = reg.
; %1 = memory operand, %2 = register; on exit both hold the sum.
|
||||
%macro addm 2
|
||||
add %2, %1 ; reg += [mem]
|
||||
mov %1, %2 ; [mem] = updated reg
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
|
||||
; Load xmm with mem and byte swap each dword
; %1 = destination xmm, %2 = 16-byte memory source, %3 = xmm holding the
; pshufb control mask. MOVDQ is %define'd to movdqu in this file, so the
; source does not have to be 16-byte aligned.
|
||||
%macro COPY_XMM_AND_BSWAP 3
|
||||
MOVDQ %1, %2 ; unaligned 16-byte load
|
||||
pshufb %1, %3 ; permute bytes according to the mask in %3
|
||||
%endmacro
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define X0 xmm4
|
||||
%define X1 xmm5
|
||||
%define X2 xmm6
|
||||
%define X3 xmm7
|
||||
|
||||
%define XTMP0 xmm0
|
||||
%define XTMP1 xmm1
|
||||
%define XTMP2 xmm2
|
||||
%define XTMP3 xmm3
|
||||
%define XTMP4 xmm8
|
||||
%define XFER xmm9
|
||||
|
||||
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
|
||||
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
|
||||
%define BYTE_FLIP_MASK xmm12
|
||||
|
||||
%ifdef LINUX
|
||||
%define NUM_BLKS rdx ; 3rd arg
|
||||
%define CTX rsi ; 2nd arg
|
||||
%define INP rdi ; 1st arg
|
||||
|
||||
%define SRND rdi ; clobbers INP
|
||||
%define c ecx
|
||||
%define d r8d
|
||||
%define e edx
|
||||
%else
|
||||
%define NUM_BLKS r8 ; 3rd arg
|
||||
%define CTX rdx ; 2nd arg
|
||||
%define INP rcx ; 1st arg
|
||||
|
||||
%define SRND rcx ; clobbers INP
|
||||
%define c edi
|
||||
%define d esi
|
||||
%define e r8d
|
||||
|
||||
%endif
|
||||
%define TBL rbp
|
||||
%define a eax
|
||||
%define b ebx
|
||||
|
||||
%define f r9d
|
||||
%define g r10d
|
||||
%define h r11d
|
||||
|
||||
%define y0 r13d
|
||||
%define y1 r14d
|
||||
%define y2 r15d
|
||||
|
||||
|
||||
|
||||
_INP_END_SIZE equ 8
|
||||
_INP_SIZE equ 8
|
||||
_XFER_SIZE equ 8
|
||||
%ifdef LINUX
|
||||
_XMM_SAVE_SIZE equ 0
|
||||
%else
|
||||
_XMM_SAVE_SIZE equ 7*16
|
||||
%endif
|
||||
; STACK_SIZE plus pushes must be an odd multiple of 8
|
||||
_ALIGN_SIZE equ 8
|
||||
|
||||
_INP_END equ 0
|
||||
_INP equ _INP_END + _INP_END_SIZE
|
||||
_XFER equ _INP + _INP_SIZE
|
||||
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
|
||||
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
|
||||
|
||||
; rotate_Xs
|
||||
; Rotate values of symbols X0...X3
; Assembly-time renaming only: no instructions are emitted. After the macro,
; X0 names the register previously called X1, X1 the old X2, X2 the old X3,
; and X3 the old X0. X_ is a scratch symbol used for the swap.
|
||||
%macro rotate_Xs 0
|
||||
%xdefine X_ X0
|
||||
%xdefine X0 X1
|
||||
%xdefine X1 X2
|
||||
%xdefine X2 X3
|
||||
%xdefine X3 X_
|
||||
%endm
|
||||
|
||||
; ROTATE_ARGS
|
||||
; Rotate values of symbols a...h
; Assembly-time renaming only: no instructions are emitted. Each symbol takes
; over the register of its predecessor (b <- a, c <- b, ... h <- g) and `a`
; takes over the register that was `h`. TMP_ is a scratch symbol.
|
||||
%macro ROTATE_ARGS 0
|
||||
%xdefine TMP_ h
|
||||
%xdefine h g
|
||||
%xdefine g f
|
||||
%xdefine f e
|
||||
%xdefine e d
|
||||
%xdefine d c
|
||||
%xdefine c b
|
||||
%xdefine b a
|
||||
%xdefine a TMP_
|
||||
%endm
|
||||
|
||||
;; FOUR_ROUNDS_AND_SCHED
;; Performs four SHA-256 rounds using the four (k + w) dwords previously
;; stored at [rsp + _XFER], interleaved with the SSE computation of the next
;; four message-schedule dwords, which end up in X0 before rotate_Xs renames
;; the X registers.
;; In the scalar comments below ">>" denotes a 32-bit rotate right (the
;; instruction is ror). S0/S1 are the round functions applied to a/e, s0/s1
;; the message-schedule functions.
;; Clobbers y0, y1, y2 and XTMP0-XTMP4.
%macro FOUR_ROUNDS_AND_SCHED 0
|
||||
;; compute s0 four at a time and s1 two at a time
|
||||
;; compute W[-16] + W[-7] 4 at a time
|
||||
movdqa XTMP0, X3 ; XTMP0 = X3, high input of the palignr below
|
||||
mov y0, e ; y0 = e
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
movdqa XTMP1, X1 ; XTMP1 = X1, high input of the palignr below
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13))
|
||||
xor y2, g ; y2 = f^g
|
||||
paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
;; compute s0
|
||||
palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
|
||||
movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
pslld XTMP1, (32-7) ; XTMP1 = W[-15] << 25
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
psrld XTMP2, 7 ; XTMP2 = W[-15] >> 7 (logical shift)
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
|
||||
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
|
||||
mov y0, e ; y0 = e
|
||||
mov y1, a ; y1 = a
|
||||
movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
pslld XTMP3, (32-18) ; XTMP3 = W[-15] << 14
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13))
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor y2, g ; y2 = f^g
|
||||
psrld XTMP2, 18 ; XTMP2 = W[-15] >> 18 (logical shift)
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
|
||||
pxor XTMP1, XTMP3 ; XTMP1 = (W[-15] ror 7) ^ (W[-15] << 14)
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
pxor XTMP1, XTMP4 ; XTMP1 = s0
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
;; compute low s1
|
||||
pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
|
||||
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
|
||||
mov y0, e ; y0 = e
|
||||
mov y1, a ; y1 = a
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
mov y2, f ; y2 = f
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13))
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
|
||||
xor y2, g ; y2 = f^g
|
||||
psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
|
||||
pxor XTMP2, XTMP3 ; XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
|
||||
pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
;; compute high s1
|
||||
pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
|
||||
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
|
||||
mov y0, e ; y0 = e
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13))
|
||||
xor y2, g ; y2 = f^g
|
||||
psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
pxor XTMP2, XTMP3 ; XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
|
||||
pxor X0, XTMP2 ; X0 = s1 {xDxC}
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
rotate_Xs
|
||||
%endm
|
||||
|
||||
;; DO_ROUND n
;; One SHA-256 round without message scheduling.
;; input is [rsp + _XFER + %1 * 4]
;; i.e. dword %1 of the (k + w) values previously stored at _XFER.
;; In the comments ">>" denotes a 32-bit rotate right (the instruction is
;; ror). Clobbers y0, y1, y2; ends by renaming a..h via ROTATE_ARGS.
|
||||
%macro DO_ROUND 1
|
||||
mov y0, e ; y0 = e
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
mov y2, f ; y2 = f
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13))
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor y2, g ; y2 = f^g
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
|
||||
;; arg 1 : pointer to input data
|
||||
;; arg 2 : pointer to digest
|
||||
;; arg 3 : Num blocks
|
||||
section .text
|
||||
global sha256_sse4
|
||||
align 32
|
||||
;; sha256_sse4: hash NUM_BLKS consecutive 64-byte blocks starting at INP and
;; update, in place, the eight 32-bit state words stored at CTX.
;; Prologue/epilogue: rbx, rbp, r13-r15 (plus rsi/rdi when LINUX is not
;; defined) are pushed, STACK_SIZE bytes of scratch are reserved, and on
;; non-LINUX builds xmm6-xmm12 are spilled to the _XMM_SAVE area.
;; Each block runs 3 x 16 rounds with scheduling (loop1) followed by
;; 2 x 8 rounds without scheduling (loop2), 64 rounds in total.
sha256_sse4:
|
||||
push rbx
|
||||
%ifndef LINUX
|
||||
push rsi
|
||||
push rdi
|
||||
%endif
|
||||
push rbp
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
sub rsp,STACK_SIZE
|
||||
%ifndef LINUX
|
||||
movdqa [rsp + _XMM_SAVE + 0*16],xmm6
|
||||
movdqa [rsp + _XMM_SAVE + 1*16],xmm7
|
||||
movdqa [rsp + _XMM_SAVE + 2*16],xmm8
|
||||
movdqa [rsp + _XMM_SAVE + 3*16],xmm9
|
||||
movdqa [rsp + _XMM_SAVE + 4*16],xmm10
|
||||
movdqa [rsp + _XMM_SAVE + 5*16],xmm11
|
||||
movdqa [rsp + _XMM_SAVE + 6*16],xmm12
|
||||
%endif
|
||||
|
||||
shl NUM_BLKS, 6 ; convert block count to bytes (64 bytes per block)
|
||||
jz done_hash ; zero blocks: skip straight to the epilogue
|
||||
add NUM_BLKS, INP ; pointer to end of data
|
||||
mov [rsp + _INP_END], NUM_BLKS
|
||||
|
||||
;; load initial digest
|
||||
mov a,[4*0 + CTX]
|
||||
mov b,[4*1 + CTX]
|
||||
mov c,[4*2 + CTX]
|
||||
mov d,[4*3 + CTX]
|
||||
mov e,[4*4 + CTX]
|
||||
mov f,[4*5 + CTX]
|
||||
mov g,[4*6 + CTX]
|
||||
mov h,[4*7 + CTX]
|
||||
|
||||
;; shuffle masks are loaded once and reused for every block
movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
|
||||
movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
|
||||
movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
|
||||
|
||||
;; loop0: one iteration per 64-byte block
loop0:
|
||||
lea TBL,[K256 wrt rip] ; restart at the first round constant
|
||||
|
||||
;; byte swap first 16 dwords
|
||||
COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
|
||||
|
||||
mov [rsp + _INP], INP ; spill INP: SRND is %define'd to the same register
|
||||
|
||||
;; schedule 48 input dwords, by doing 3 rounds of 16 each
|
||||
mov SRND, 3
|
||||
align 16
|
||||
loop1:
|
||||
movdqa XFER, [TBL + 0*16]
|
||||
paddd XFER, X0 ; XFER = K + W for the next four rounds
|
||||
movdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED ; also renames X0..X3, so X0 below is the next group
|
||||
|
||||
movdqa XFER, [TBL + 1*16]
|
||||
paddd XFER, X0
|
||||
movdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
movdqa XFER, [TBL + 2*16]
|
||||
paddd XFER, X0
|
||||
movdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
movdqa XFER, [TBL + 3*16]
|
||||
paddd XFER, X0
|
||||
movdqa [rsp + _XFER], XFER
|
||||
add TBL, 4*16 ; advance to the next 16 round constants
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
sub SRND, 1
|
||||
jne loop1
|
||||
|
||||
;; last 16 rounds: no more scheduling, 8 rounds per loop2 iteration
mov SRND, 2
|
||||
loop2:
|
||||
paddd X0, [TBL + 0*16] ; X0 is overwritten with K + W
|
||||
movdqa [rsp + _XFER], X0
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
paddd X1, [TBL + 1*16]
|
||||
movdqa [rsp + _XFER], X1
|
||||
add TBL, 2*16
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
|
||||
movdqa X0, X2 ; second pass consumes the words held in X2/X3
|
||||
movdqa X1, X3
|
||||
|
||||
sub SRND, 1
|
||||
jne loop2
|
||||
|
||||
;; add the working variables back into the digest at CTX
addm [4*0 + CTX],a
|
||||
addm [4*1 + CTX],b
|
||||
addm [4*2 + CTX],c
|
||||
addm [4*3 + CTX],d
|
||||
addm [4*4 + CTX],e
|
||||
addm [4*5 + CTX],f
|
||||
addm [4*6 + CTX],g
|
||||
addm [4*7 + CTX],h
|
||||
|
||||
mov INP, [rsp + _INP]
|
||||
add INP, 64 ; next block
|
||||
cmp INP, [rsp + _INP_END] ; stop once the end-of-data pointer is reached
|
||||
jne loop0
|
||||
|
||||
done_hash:
|
||||
%ifndef LINUX
|
||||
movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
|
||||
movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
|
||||
movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
|
||||
movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
|
||||
movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
|
||||
movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
|
||||
movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
|
||||
%endif
|
||||
|
||||
add rsp, STACK_SIZE
|
||||
|
||||
;; restore callee-saved registers in reverse push order
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop rbp
|
||||
%ifndef LINUX
|
||||
pop rdi
|
||||
pop rsi
|
||||
%endif
|
||||
pop rbx
|
||||
|
||||
ret
|
||||
|
||||
|
||||
section .data
|
||||
align 64
|
||||
K256:
|
||||
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
; shuffle xBxA -> 00BA
|
||||
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
; shuffle xDxC -> DC00
|
||||
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
|
409
crypto/sha2/intel/sha512_avx.asm
Normal file
409
crypto/sha2/intel/sha512_avx.asm
Normal file
|
@ -0,0 +1,409 @@
|
|||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright 2012 Intel Corporation All Rights Reserved.
|
||||
;
|
||||
; The source code contained or described herein and all documents
|
||||
; related to the source code ("Material") are owned by Intel Corporation
|
||||
; or its suppliers or licensors. Title to the Material remains with
|
||||
; Intel Corporation or its suppliers and licensors. The Material may
|
||||
; contain trade secrets and proprietary and confidential information of
|
||||
; Intel Corporation and its suppliers and licensors, and is protected by
|
||||
; worldwide copyright and trade secret laws and treaty provisions. No
|
||||
; part of the Material may be used, copied, reproduced, modified,
|
||||
; published, uploaded, posted, transmitted, distributed, or disclosed in
|
||||
; any way without Intel's prior express written permission.
|
||||
;
|
||||
; No license under any patent, copyright, trade secret or other
|
||||
; intellectual property right is granted to or conferred upon you by
|
||||
; disclosure or delivery of the Materials, either expressly, by
|
||||
; implication, inducement, estoppel or otherwise. Any license under such
|
||||
; intellectual property rights must be express and approved by Intel in
|
||||
; writing.
|
||||
;
|
||||
; Unless otherwise agreed by Intel in writing, you may not remove or
|
||||
; alter this notice or any other notice embedded in Materials by Intel
|
||||
; or Intel's suppliers or licensors in any way.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; Example YASM command lines:
|
||||
; Windows: yasm -f x64 -D WINABI sha512_avx.asm
|
||||
; Linux: yasm -f elf64 sha512_avx.asm
|
||||
;
|
||||
|
||||
BITS 64
|
||||
section .text
|
||||
|
||||
; Virtual Registers
|
||||
%ifdef WINABI
|
||||
%define msg rcx ; ARG1
|
||||
%define digest rdx ; ARG2
|
||||
%define msglen r8 ; ARG3
|
||||
%define T1 rsi
|
||||
%define T2 rdi
|
||||
%else
|
||||
%define msg rdi ; ARG1
|
||||
%define digest rsi ; ARG2
|
||||
%define msglen rdx ; ARG3
|
||||
%define T1 rcx
|
||||
%define T2 r8
|
||||
%endif
|
||||
%define a_64 r9
|
||||
%define b_64 r10
|
||||
%define c_64 r11
|
||||
%define d_64 r12
|
||||
%define e_64 r13
|
||||
%define f_64 r14
|
||||
%define g_64 r15
|
||||
%define h_64 rbx
|
||||
%define tmp0 rax
|
||||
|
||||
; Local variables (stack frame)
|
||||
; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
; Without WINABI the frame is (80 + 2 + 5) qwords = 696 bytes; with WINABI it
; is 82 qwords + 4 dqwords + 7 qwords = 776 bytes. Both are odd multiples of 8.
|
||||
struc frame
|
||||
.W: resq 80 ; Message Schedule
|
||||
.WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1]
|
||||
|
||||
%ifdef WINABI
|
||||
.XMMSAVE: resdq 4 ; save slots for xmm6-xmm9
|
||||
.GPRSAVE: resq 7 ; save slots for rbx, r12-r15, rsi, rdi
|
||||
%else
|
||||
.GPRSAVE: resq 5 ; save slots for rbx, r12-r15
|
||||
%endif
|
||||
endstruc
|
||||
|
||||
; Useful QWORD "arrays" for simpler memory references
|
||||
%define MSG(i) msg + 8*(i) ; Input message (arg1)
|
||||
%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
|
||||
%define K_t(i) K512 + 8*(i) wrt rip ; SHA Constants (static mem)
|
||||
%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame)
|
||||
%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
|
||||
; MSG, DIGEST, K_t, W_t are arrays
|
||||
; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
|
||||
|
||||
; RotateState
; Assembly-time renaming of the eight state symbols; emits no instructions.
; Each symbol takes over the register of its predecessor (b_64 <- a_64, ...,
; h_64 <- g_64) and a_64 takes over the register that was h_64.
%macro RotateState 0
|
||||
; Rotate symbols a..h right
|
||||
%xdefine %%TMP h_64
|
||||
%xdefine h_64 g_64
|
||||
%xdefine g_64 f_64
|
||||
%xdefine f_64 e_64
|
||||
%xdefine e_64 d_64
|
||||
%xdefine d_64 c_64
|
||||
%xdefine c_64 b_64
|
||||
%xdefine b_64 a_64
|
||||
%xdefine a_64 %%TMP
|
||||
%endmacro
|
||||
|
||||
; RORQ reg, n
; Rotate the 64-bit register %1 right by %2 bits. Implemented as a double
; shift left of the register into itself by (64 - n), which is a left rotate
; by 64 - n and therefore a right rotate by n.
%macro RORQ 2
|
||||
; shld is faster than ror on Sandybridge
|
||||
shld %1, %1, (64 - %2)
|
||||
%endmacro
|
||||
|
||||
; SHA512_Round t
; One SHA-512 round with no message scheduling. Reads W[t]+K[t] from the
; WK_2 slot selected by t's parity, updates the state registers and renames
; a..h via RotateState.
; S1(e) and S0(a) are built from three chained rotates (23,4,14 and 5,6,28).
; The bare number after each RORQ (41/18/14, 39/34/28) is the total rotation
; that the term introduced at that step has accumulated by the end.
; Clobbers T1, T2 and tmp0.
%macro SHA512_Round 1
|
||||
%assign %%t (%1)
|
||||
|
||||
; Compute Round %%t
|
||||
mov T1, f_64 ; T1 = f
|
||||
mov tmp0, e_64 ; tmp = e
|
||||
xor T1, g_64 ; T1 = f ^ g
|
||||
RORQ tmp0, 23 ; 41 ; tmp = e ror 23
|
||||
and T1, e_64 ; T1 = (f ^ g) & e
|
||||
xor tmp0, e_64 ; tmp = (e ror 23) ^ e
|
||||
xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
|
||||
add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler
|
||||
RORQ tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
|
||||
xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
|
||||
mov T2, a_64 ; T2 = a
|
||||
add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
|
||||
RORQ tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
|
||||
add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
|
||||
mov tmp0, a_64 ; tmp = a
|
||||
xor T2, c_64 ; T2 = a ^ c
|
||||
and tmp0, c_64 ; tmp = a & c
|
||||
and T2, b_64 ; T2 = (a ^ c) & b
|
||||
xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
|
||||
mov tmp0, a_64 ; tmp = a
|
||||
RORQ tmp0, 5 ; 39 ; tmp = a ror 5
|
||||
xor tmp0, a_64 ; tmp = (a ror 5) ^ a
|
||||
add d_64, T1 ; e(next_state) = d + T1
|
||||
RORQ tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
|
||||
xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
|
||||
lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
|
||||
RORQ tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
|
||||
add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
|
||||
RotateState
|
||||
%endmacro
|
||||
|
||||
; SHA512_2Sched_2Round_avx t
; Computes SHA-512 rounds t-2 and t-1 while scheduling message qwords W[t]
; and W[t+1] with AVX. %1 is the (even) schedule index t.
; Clobbers T1, T2, tmp0 and xmm0-xmm9; renames a..h twice via RotateState.
%macro SHA512_2Sched_2Round_avx 1
|
||||
%assign %%t %1
|
||||
; Compute rounds %%t-2 and %%t-1
|
||||
; Compute message schedule QWORDS %%t and %%t+1
|
||||
|
||||
; Two rounds are computed based on the values for K[t-2]+W[t-2] and
|
||||
; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
|
||||
; scheduler.
|
||||
; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
|
||||
; They are then added to their respective SHA512 constants at
|
||||
; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
|
||||
; For brevity, the comments following vectored instructions only refer to
|
||||
; the first of a pair of QWORDS.
|
||||
; E.g. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
|
||||
; The computation of the message schedule and the rounds are tightly
|
||||
; stitched to take advantage of instruction-level parallelism.
|
||||
; For clarity, integer instructions (for the rounds calculation) are indented
|
||||
; by one tab. Vectored instructions (for the message scheduler) are indented
|
||||
; by two tabs.
; Every array index inside this macro uses the macro-local %%t rather than
; the caller's global t, so the macro is correct for any argument.
|
||||
|
||||
vmovdqa xmm4, [W_t(%%t-2)] ; XMM4 = W[t-2]
|
||||
vmovdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
|
||||
mov T1, f_64
|
||||
vpsrlq xmm0, xmm4, 61 ; XMM0 = W[t-2]>>61
|
||||
mov tmp0, e_64
|
||||
vpsrlq xmm6, xmm5, 1 ; XMM6 = W[t-15]>>1
|
||||
xor T1, g_64
|
||||
RORQ tmp0, 23 ; 41
|
||||
vpsrlq xmm1, xmm4, 19 ; XMM1 = W[t-2]>>19
|
||||
and T1, e_64
|
||||
xor tmp0, e_64
|
||||
vpxor xmm0, xmm1 ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19
|
||||
xor T1, g_64
|
||||
add T1, [WK_2(%%t)]; WK_2 indexes mod 2, so this slot holds W[t-2]+K[t-2]
|
||||
vpsrlq xmm7, xmm5, 8 ; XMM7 = W[t-15]>>8
|
||||
RORQ tmp0, 4 ; 18
|
||||
vpsrlq xmm2, xmm4, 6 ; XMM2 = W[t-2]>>6
|
||||
xor tmp0, e_64
|
||||
mov T2, a_64
|
||||
add T1, h_64
|
||||
vpxor xmm6, xmm7 ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8
|
||||
RORQ tmp0, 14 ; 14
|
||||
add T1, tmp0
|
||||
vpsrlq xmm8, xmm5, 7 ; XMM8 = W[t-15]>>7
|
||||
mov tmp0, a_64
|
||||
xor T2, c_64
|
||||
vpsllq xmm3, xmm4, (64-61) ; XMM3 = W[t-2]<<3
|
||||
and tmp0, c_64
|
||||
and T2, b_64
|
||||
vpxor xmm2, xmm3 ; XMM2 = W[t-2]>>6 ^ W[t-2]<<3
|
||||
xor T2, tmp0
|
||||
mov tmp0, a_64
|
||||
vpsllq xmm9, xmm5, (64-1) ; XMM9 = W[t-15]<<63
|
||||
RORQ tmp0, 5 ; 39
|
||||
vpxor xmm8, xmm9 ; XMM8 = W[t-15]>>7 ^ W[t-15]<<63
|
||||
xor tmp0, a_64
|
||||
add d_64, T1
|
||||
RORQ tmp0, 6 ; 34
|
||||
xor tmp0, a_64
|
||||
vpxor xmm6, xmm8 ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63
|
||||
lea h_64, [T1 + T2]
|
||||
RORQ tmp0, 28 ; 28
|
||||
vpsllq xmm4, (64-19) ; XMM4 = W[t-2]<<45
|
||||
add h_64, tmp0
|
||||
RotateState
|
||||
vpxor xmm0, xmm4 ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<45
|
||||
mov T1, f_64
|
||||
vpxor xmm0, xmm2 ; XMM0 = s1(W[t-2])
|
||||
mov tmp0, e_64
|
||||
xor T1, g_64
|
||||
vpaddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + W[t-16]
|
||||
vmovdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
|
||||
RORQ tmp0, 23 ; 41
|
||||
and T1, e_64
|
||||
xor tmp0, e_64
|
||||
xor T1, g_64
|
||||
vpsllq xmm5, (64-8) ; XMM5 = W[t-15]<<56
|
||||
add T1, [WK_2(%%t+1)] ; same slot as W[t-1]+K[t-1] (index taken mod 2)
|
||||
vpxor xmm6, xmm5 ; XMM6 = s0(W[t-15])
|
||||
RORQ tmp0, 4 ; 18
|
||||
vpaddq xmm0, xmm6 ; XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
|
||||
xor tmp0, e_64
|
||||
vpaddq xmm0, xmm1 ; XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
|
||||
mov T2, a_64
|
||||
add T1, h_64
|
||||
RORQ tmp0, 14 ; 14
|
||||
add T1, tmp0
|
||||
vmovdqa [W_t(%%t)], xmm0 ; Store W[t]
|
||||
vpaddq xmm0, [K_t(%%t)] ; Compute W[t]+K[t]
|
||||
vmovdqa [WK_2(%%t)], xmm0 ; Store W[t]+K[t] for next rounds (both old WK values were consumed above)
|
||||
mov tmp0, a_64
|
||||
xor T2, c_64
|
||||
and tmp0, c_64
|
||||
and T2, b_64
|
||||
xor T2, tmp0
|
||||
mov tmp0, a_64
|
||||
RORQ tmp0, 5 ; 39
|
||||
xor tmp0, a_64
|
||||
add d_64, T1
|
||||
RORQ tmp0, 6 ; 34
|
||||
xor tmp0, a_64
|
||||
lea h_64, [T1 + T2]
|
||||
RORQ tmp0, 28 ; 28
|
||||
add h_64, tmp0
|
||||
RotateState
|
||||
%endmacro
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; void sha512_avx(const void* M, void* D, uint64_t L);
|
||||
; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
|
||||
; The size of the message pointed to by M must be an integer multiple of SHA512
|
||||
; message blocks.
|
||||
; L is the message length in SHA512 blocks
;
; Behaviour visible in the body below:
;  - L == 0 returns immediately; neither the stack nor D is touched.
;  - rbx and r12-r15 are saved in the frame and restored on exit; when WINABI
;    is defined rsi, rdi and xmm6-xmm9 are saved and restored as well.
;  - For every block the eight state qwords are loaded from D, 80 rounds are
;    generated by the %rep loop, the working variables are added back into D
;    and M is advanced by 16*8 bytes.
;  - Message qwords are read with vmovdqu, so M needs no particular alignment.
; NOTE(review): the frame layout, the a_64..h_64 register mapping and the
; helper macros are defined earlier in this file, outside this excerpt.
|
||||
global sha512_avx:function
|
||||
sha512_avx:
|
||||
cmp msglen, 0
|
||||
je .nowork
|
||||
|
||||
; Allocate Stack Space
|
||||
sub rsp, frame_size
|
||||
|
||||
; Save GPRs
|
||||
mov [rsp + frame.GPRSAVE + 8 * 0], rbx
|
||||
mov [rsp + frame.GPRSAVE + 8 * 1], r12
|
||||
mov [rsp + frame.GPRSAVE + 8 * 2], r13
|
||||
mov [rsp + frame.GPRSAVE + 8 * 3], r14
|
||||
mov [rsp + frame.GPRSAVE + 8 * 4], r15
|
||||
%ifdef WINABI
|
||||
mov [rsp + frame.GPRSAVE + 8 * 5], rsi
|
||||
mov [rsp + frame.GPRSAVE + 8 * 6], rdi
|
||||
%endif
|
||||
; Save XMMs
|
||||
%ifdef WINABI
|
||||
vmovdqa [rsp + frame.XMMSAVE + 16 * 0], xmm6
|
||||
vmovdqa [rsp + frame.XMMSAVE + 16 * 1], xmm7
|
||||
vmovdqa [rsp + frame.XMMSAVE + 16 * 2], xmm8
|
||||
vmovdqa [rsp + frame.XMMSAVE + 16 * 3], xmm9
|
||||
%endif
|
||||
|
||||
.updateblock:
|
||||
|
||||
; Load state variables
|
||||
mov a_64, [DIGEST(0)]
|
||||
mov b_64, [DIGEST(1)]
|
||||
mov c_64, [DIGEST(2)]
|
||||
mov d_64, [DIGEST(3)]
|
||||
mov e_64, [DIGEST(4)]
|
||||
mov f_64, [DIGEST(5)]
|
||||
mov g_64, [DIGEST(6)]
|
||||
mov h_64, [DIGEST(7)]
|
||||
|
||||
; t is an assemble-time counter: the %rep body is expanded 41 times with
; t = 0, 2, ..., 80, so the whole block is emitted as straight-line code.
%assign t 0
|
||||
%rep 80/2 + 1
|
||||
; (80 rounds) / (2 rounds/iteration) + (1 iteration)
|
||||
; +1 iteration because the scheduler leads hashing by 1 iteration
|
||||
%if t < 2
|
||||
; BSWAP 2 QWORDS
; (t == 0: only prepares W[0..1] and W+K; no round is computed yet)
|
||||
vmovdqa xmm1, [XMM_QWORD_BSWAP wrt rip]
|
||||
vmovdqu xmm0, [MSG(t)]
|
||||
vpshufb xmm0, xmm0, xmm1 ; BSWAP
|
||||
vmovdqa [W_t(t)], xmm0 ; Store Scheduled Pair
|
||||
vpaddq xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
|
||||
vmovdqa [WK_2(t)], xmm0 ; Store into WK for rounds
|
||||
%elif t < 16
|
||||
; BSWAP 2 QWORDS, Compute 2 Rounds
; xmm1 is expected to still hold the byte-swap mask loaded in the t < 2
; branch (relies on SHA512_Round leaving xmm1 alone - TODO confirm).
|
||||
vmovdqu xmm0, [MSG(t)]
|
||||
vpshufb xmm0, xmm0, xmm1 ; BSWAP
|
||||
SHA512_Round t - 2 ; Round t-2
|
||||
vmovdqa [W_t(t)], xmm0 ; Store Scheduled Pair
|
||||
vpaddq xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
|
||||
SHA512_Round t - 1 ; Round t-1
; WK is overwritten only after both rounds above have consumed it.
|
||||
vmovdqa [WK_2(t)], xmm0 ; W[t]+K[t] into WK
|
||||
%elif t < 79
|
||||
; Schedule 2 QWORDS; Compute 2 Rounds
|
||||
SHA512_2Sched_2Round_avx t
|
||||
%else
|
||||
; Compute 2 Rounds
; (t == 80: nothing left to schedule, only rounds 78 and 79 remain)
|
||||
SHA512_Round t - 2
|
||||
SHA512_Round t - 1
|
||||
%endif
|
||||
%assign t t+2
|
||||
%endrep
|
||||
|
||||
; Update digest
|
||||
add [DIGEST(0)], a_64
|
||||
add [DIGEST(1)], b_64
|
||||
add [DIGEST(2)], c_64
|
||||
add [DIGEST(3)], d_64
|
||||
add [DIGEST(4)], e_64
|
||||
add [DIGEST(5)], f_64
|
||||
add [DIGEST(6)], g_64
|
||||
add [DIGEST(7)], h_64
|
||||
|
||||
; Advance to next message block
|
||||
add msg, 16*8
|
||||
dec msglen
|
||||
jnz .updateblock
|
||||
|
||||
; Restore XMMs
|
||||
%ifdef WINABI
|
||||
vmovdqa xmm6, [rsp + frame.XMMSAVE + 16 * 0]
|
||||
vmovdqa xmm7, [rsp + frame.XMMSAVE + 16 * 1]
|
||||
vmovdqa xmm8, [rsp + frame.XMMSAVE + 16 * 2]
|
||||
vmovdqa xmm9, [rsp + frame.XMMSAVE + 16 * 3]
|
||||
%endif
|
||||
; Restore GPRs
|
||||
mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
|
||||
mov r12, [rsp + frame.GPRSAVE + 8 * 1]
|
||||
mov r13, [rsp + frame.GPRSAVE + 8 * 2]
|
||||
mov r14, [rsp + frame.GPRSAVE + 8 * 3]
|
||||
mov r15, [rsp + frame.GPRSAVE + 8 * 4]
|
||||
%ifdef WINABI
|
||||
mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
|
||||
mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
|
||||
%endif
|
||||
; Restore Stack Pointer
|
||||
add rsp, frame_size
|
||||
|
||||
.nowork:
|
||||
ret
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; Binary Data
|
||||
|
||||
section .data
|
||||
|
||||
ALIGN 16
|
||||
|
||||
; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
|
||||
XMM_QWORD_BSWAP:
|
||||
ddq 0x08090a0b0c0d0e0f0001020304050607
|
||||
|
||||
; K[t] used in SHA512 hashing
|
||||
K512:
|
||||
dq 0x428a2f98d728ae22,0x7137449123ef65cd
|
||||
dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
|
||||
dq 0x3956c25bf348b538,0x59f111f1b605d019
|
||||
dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
|
||||
dq 0xd807aa98a3030242,0x12835b0145706fbe
|
||||
dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
|
||||
dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
|
||||
dq 0x9bdc06a725c71235,0xc19bf174cf692694
|
||||
dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
|
||||
dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
|
||||
dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
|
||||
dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
|
||||
dq 0x983e5152ee66dfab,0xa831c66d2db43210
|
||||
dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
|
||||
dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
|
||||
dq 0x06ca6351e003826f,0x142929670a0e6e70
|
||||
dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
|
||||
dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
|
||||
dq 0x650a73548baf63de,0x766a0abb3c77b2a8
|
||||
dq 0x81c2c92e47edaee6,0x92722c851482353b
|
||||
dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
|
||||
dq 0xc24b8b70d0f89791,0xc76c51a30654be30
|
||||
dq 0xd192e819d6ef5218,0xd69906245565a910
|
||||
dq 0xf40e35855771202a,0x106aa07032bbd1b8
|
||||
dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
|
||||
dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
|
||||
dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
|
||||
dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
|
||||
dq 0x748f82ee5defb2fc,0x78a5636f43172f60
|
||||
dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
|
||||
dq 0x90befffa23631e28,0xa4506cebde82bde9
|
||||
dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
|
||||
dq 0xca273eceea26619c,0xd186b8c721c0c207
|
||||
dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
|
||||
dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
|
||||
dq 0x113f9804bef90dae,0x1b710b35131c471b
|
||||
dq 0x28db77f523047d84,0x32caab7b40c72493
|
||||
dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
|
||||
dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
|
||||
dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
|
||||
|
398
crypto/sha2/intel/sha512_sse4.asm
Normal file
398
crypto/sha2/intel/sha512_sse4.asm
Normal file
|
@ -0,0 +1,398 @@
|
|||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright 2012 Intel Corporation All Rights Reserved.
|
||||
;
|
||||
; The source code contained or described herein and all documents
|
||||
; related to the source code ("Material") are owned by Intel Corporation
|
||||
; or its suppliers or licensors. Title to the Material remains with
|
||||
; Intel Corporation or its suppliers and licensors. The Material may
|
||||
; contain trade secrets and proprietary and confidential information of
|
||||
; Intel Corporation and its suppliers and licensors, and is protected by
|
||||
; worldwide copyright and trade secret laws and treaty provisions. No
|
||||
; part of the Material may be used, copied, reproduced, modified,
|
||||
; published, uploaded, posted, transmitted, distributed, or disclosed in
|
||||
; any way without Intel's prior express written permission.
|
||||
;
|
||||
; No license under any patent, copyright, trade secret or other
|
||||
; intellectual property right is granted to or conferred upon you by
|
||||
; disclosure or delivery of the Materials, either expressly, by
|
||||
; implication, inducement, estoppel or otherwise. Any license under such
|
||||
; intellectual property rights must be express and approved by Intel in
|
||||
; writing.
|
||||
;
|
||||
; Unless otherwise agreed by Intel in writing, you may not remove or
|
||||
; alter this notice or any other notice embedded in Materials by Intel
|
||||
; or Intel's suppliers or licensors in any way.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; Example YASM command lines:
|
||||
; Windows: yasm -f x64 -D WINABI sha512_sse4.asm
|
||||
; Linux: yasm -f elf64 sha512_sse4.asm
|
||||
;
|
||||
; Alternative Example YASM command lines:
|
||||
; Windows: yasm -Xvc -f x64 -D WINABI -rnasm -pnasm -o sha512_sse4.obj -g cv8 sha512_sse4.asm
|
||||
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha512_sse4.o sha512_sse4.asm
|
||||
;
|
||||
|
||||
BITS 64
|
||||
section .text
|
||||
|
||||
; Virtual Registers
|
||||
; The three arguments arrive in different registers depending on the ABI;
; T1/T2 are the per-round scratch registers and are chosen so that they do
; not collide with the argument registers of the selected ABI.
%ifdef WINABI
|
||||
%define msg rcx ; ARG1
|
||||
%define digest rdx ; ARG2
|
||||
%define msglen r8 ; ARG3
|
||||
%define T1 rsi
|
||||
%define T2 rdi
|
||||
%else
|
||||
%define msg rdi ; ARG1
|
||||
%define digest rsi ; ARG2
|
||||
%define msglen rdx ; ARG3
|
||||
%define T1 rcx
|
||||
%define T2 r8
|
||||
%endif
|
||||
; SHA512 working variables a..h. These are the *initial* bindings only:
; RotateState re-binds the names after every round.
%define a_64 r9
|
||||
%define b_64 r10
|
||||
%define c_64 r11
|
||||
%define d_64 r12
|
||||
%define e_64 r13
|
||||
%define f_64 r14
|
||||
%define g_64 r15
|
||||
%define h_64 rbx
|
||||
; General scratch register used while building the S0/S1 rotations.
%define tmp0 rax
|
||||
|
||||
; Local variables (stack frame)
|
||||
; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
|
||||
; Stack frame layout, addressed relative to RSP after "sub rsp, frame_size".
; Size is 80 + 2 + 5 = 87 qwords (89 with WINABI), i.e. an odd multiple of
; 8 bytes as the note above requires.
; NOTE(review): presumably RSP is 8 mod 16 on entry (return address pushed by
; the call), which is why an odd qword count yields 16-byte alignment.
struc frame
|
||||
.W: resq 80 ; Message Schedule
|
||||
.WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1]
|
||||
|
||||
; Save area for rbx, r12-r15 (plus rsi, rdi under WINABI).
%ifdef WINABI
|
||||
.GPRSAVE: resq 7
|
||||
%else
|
||||
.GPRSAVE: resq 5
|
||||
%endif
|
||||
endstruc
|
||||
|
||||
; Useful QWORD "arrays" for simpler memory references
|
||||
; Each macro expands to an effective-address expression (without the
; surrounding brackets) for qword number i of the named array.
%define MSG(i) msg + 8*(i) ; Input message (arg1)
|
||||
%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
|
||||
%define K_t(i) K512 + 8*(i) wrt rip ; SHA Constants (static mem)
|
||||
%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame)
|
||||
%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
|
||||
; MSG, DIGEST, K_t, W_t are arrays
|
||||
; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
|
||||
|
||||
; RotateState: assemble-time renaming of the working variables.
; Emits no instructions. After expansion the name a_64 denotes the register
; previously called h_64, b_64 the one previously called a_64, and so on, so
; the value a round has just written into "h_64" becomes the next round's "a".
%macro RotateState 0
|
||||
; Rotate symbols a..h right
|
||||
%xdefine %%TMP h_64
|
||||
%xdefine h_64 g_64
|
||||
%xdefine g_64 f_64
|
||||
%xdefine f_64 e_64
|
||||
%xdefine e_64 d_64
|
||||
%xdefine d_64 c_64
|
||||
%xdefine c_64 b_64
|
||||
%xdefine b_64 a_64
|
||||
%xdefine a_64 %%TMP
|
||||
%endmacro
|
||||
|
||||
; SHA512_Round t
; Computes one SHA512 round using only general-purpose registers.
;   %1 - round number; selects which qword of the WK pair (WK_2) is consumed.
; Reads a_64..h_64 and [WK_2(t)]; clobbers T1, T2 and tmp0; adds T1 into d_64
; and writes the new "a" value into h_64, then calls RotateState so the
; symbolic names line up for the next round.
; Each of S1(e) and S0(a) is built from a nested rotate/xor chain: the three
; ror amounts accumulate to the totals noted after the first ';' on the line.
%macro SHA512_Round 1
|
||||
%assign %%t (%1)
|
||||
|
||||
; Compute Round %%t
|
||||
mov T1, f_64 ; T1 = f
|
||||
mov tmp0, e_64 ; tmp = e
|
||||
xor T1, g_64 ; T1 = f ^ g
|
||||
ror tmp0, 23 ; 41 ; tmp = e ror 23
|
||||
and T1, e_64 ; T1 = (f ^ g) & e
|
||||
xor tmp0, e_64 ; tmp = (e ror 23) ^ e
|
||||
xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
|
||||
add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler
|
||||
ror tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
|
||||
xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
|
||||
mov T2, a_64 ; T2 = a
|
||||
add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
|
||||
ror tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
|
||||
add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
|
||||
mov tmp0, a_64 ; tmp = a
|
||||
xor T2, c_64 ; T2 = a ^ c
|
||||
and tmp0, c_64 ; tmp = a & c
|
||||
and T2, b_64 ; T2 = (a ^ c) & b
|
||||
xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
|
||||
mov tmp0, a_64 ; tmp = a
|
||||
ror tmp0, 5 ; 39 ; tmp = a ror 5
|
||||
xor tmp0, a_64 ; tmp = (a ror 5) ^ a
|
||||
add d_64, T1 ; e(next_state) = d + T1
|
||||
ror tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
|
||||
xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
|
||||
lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
|
||||
ror tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
|
||||
add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
|
||||
RotateState
|
||||
%endmacro
|
||||
|
||||
%macro SHA512_2Sched_2Round_sse 1
%assign %%t (%1)

	; Compute rounds %%t-2 and %%t-1
	; Compute message schedule QWORDS %%t and %%t+1
	;
	; %1 must be even: the 16-byte accesses to W_t/K_t/WK_2 below are
	; aligned moves or memory operands that start at qword %%t.

	; Two rounds are computed based on the values for K[t-2]+W[t-2] and
	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
	; scheduler.
	; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
	; They are then added to their respective SHA512 constants at
	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
	; For brevity, the comments following vectored instructions only refer to
	; the first of a pair of QWORDS.
	; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
	; The computation of the message schedule and the rounds are tightly
	; stitched to take advantage of instruction-level parallelism.
	; For clarity, integer instructions (for the rounds calculation) are indented
	; by one tab. Vectored instructions (for the message scheduler) are indented
	; by two tabs.
	;
	; Clobbers T1, T2, tmp0 and xmm0-xmm5. WK_2 is overwritten only after both
	; rounds have read their W+K values from it.

	mov	T1, f_64
		movdqa	xmm2, [W_t(%%t-2)]	; XMM2 = W[t-2]
	xor	T1, g_64
	and	T1, e_64
		movdqa	xmm0, xmm2		; XMM0 = W[t-2]
	xor	T1, g_64
	add	T1, [WK_2(%%t)]
		movdqu	xmm5, [W_t(%%t-15)]	; XMM5 = W[t-15]
	mov	tmp0, e_64
	ror	tmp0, 23 ; 41
		movdqa	xmm3, xmm5		; XMM3 = W[t-15]
	xor	tmp0, e_64
	ror	tmp0, 4 ; 18
		psrlq	xmm0, 61 - 19		; XMM0 = W[t-2] >> 42
	xor	tmp0, e_64
	ror	tmp0, 14 ; 14
		psrlq	xmm3, (8 - 7)		; XMM3 = W[t-15] >> 1
	add	T1, tmp0
	add	T1, h_64
		pxor	xmm0, xmm2		; XMM0 = (W[t-2] >> 42) ^ W[t-2]
	mov	T2, a_64
	xor	T2, c_64
		pxor	xmm3, xmm5		; XMM3 = (W[t-15] >> 1) ^ W[t-15]
	and	T2, b_64
	mov	tmp0, a_64
		psrlq	xmm0, 19 - 6		; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
	and	tmp0, c_64
	xor	T2, tmp0
		psrlq	xmm3, (7 - 1)		; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
	mov	tmp0, a_64
	ror	tmp0, 5 ; 39
		pxor	xmm0, xmm2		; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
	xor	tmp0, a_64
	ror	tmp0, 6 ; 34
		pxor	xmm3, xmm5		; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
	xor	tmp0, a_64
	ror	tmp0, 28 ; 28
		psrlq	xmm0, 6			; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
	add	T2, tmp0
	add	d_64, T1
		psrlq	xmm3, 1			; XMM3 = ((((W[t-15]>>1)^W[t-15])>>6)^W[t-15])>>1
	lea	h_64, [T1 + T2]
	RotateState
		movdqa	xmm1, xmm2		; XMM1 = W[t-2]
	mov	T1, f_64
	xor	T1, g_64
		movdqa	xmm4, xmm5		; XMM4 = W[t-15]
	and	T1, e_64
	xor	T1, g_64
		psllq	xmm1, (64 - 19) - (64 - 61)	; XMM1 = W[t-2] << 42
	add	T1, [WK_2(%%t+1)]
	mov	tmp0, e_64
		psllq	xmm4, (64 - 1) - (64 - 8)	; XMM4 = W[t-15] << 7
	ror	tmp0, 23 ; 41
	xor	tmp0, e_64
		pxor	xmm1, xmm2		; XMM1 = (W[t-2] << 42)^W[t-2]
	ror	tmp0, 4 ; 18
	xor	tmp0, e_64
		pxor	xmm4, xmm5		; XMM4 = (W[t-15]<<7)^W[t-15]
	ror	tmp0, 14 ; 14
	add	T1, tmp0
		psllq	xmm1, (64 - 61)		; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
	add	T1, h_64
	mov	T2, a_64
		psllq	xmm4, (64 - 8)		; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
	xor	T2, c_64
	and	T2, b_64
		pxor	xmm0, xmm1		; XMM0 = s1(W[t-2])
	mov	tmp0, a_64
	and	tmp0, c_64
		movdqu	xmm1, [W_t(%%t- 7)]	; XMM1 = W[t-7]
	xor	T2, tmp0
		pxor	xmm3, xmm4		; XMM3 = s0(W[t-15])
	mov	tmp0, a_64
		paddq	xmm0, xmm3		; XMM0 = s1(W[t-2]) + s0(W[t-15])
	ror	tmp0, 5 ; 39
		paddq	xmm0, [W_t(%%t-16)]	; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
	xor	tmp0, a_64
		paddq	xmm0, xmm1		; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
	ror	tmp0, 6 ; 34
		movdqa	[W_t(%%t)], xmm0	; Store scheduled qwords
	xor	tmp0, a_64
		; Use the macro's own %%t (not the caller's global t) so the macro
		; does not depend on how the invoking loop names its counter.
		paddq	xmm0, [K_t(%%t)]	; Compute W[t]+K[t]
	ror	tmp0, 28 ; 28
		movdqa	[WK_2(%%t)], xmm0	; Store W[t]+K[t] for next rounds
	add	T2, tmp0
	add	d_64, T1
	lea	h_64, [T1 + T2]
	RotateState
%endmacro
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; void sha512_sse4(const void* M, void* D, uint64_t L);
|
||||
; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
|
||||
; The size of the message pointed to by M must be an integer multiple of SHA512
|
||||
; message blocks.
|
||||
; L is the message length in SHA512 blocks.
;
; Behaviour visible in the body below:
;  - L == 0 returns immediately; neither the stack nor D is touched.
;  - rbx and r12-r15 (plus rsi and rdi when WINABI is defined) are saved in
;    the frame and restored on exit. No XMM registers are saved.
;  - For every block the eight state qwords are loaded from D, 80 rounds are
;    generated by the %rep loop, the working variables are added back into D
;    and M is advanced by 16*8 bytes.
;  - Message qwords are read with movdqu, so M needs no particular alignment;
;    the W/WK stack slots and the K512 table are accessed as aligned 16-byte
;    operands.
|
||||
global sha512_sse4:function
|
||||
sha512_sse4:
|
||||
cmp msglen, 0
|
||||
je .nowork
|
||||
|
||||
; Allocate Stack Space
|
||||
sub rsp, frame_size
|
||||
|
||||
; Save GPRs
|
||||
mov [rsp + frame.GPRSAVE + 8 * 0], rbx
|
||||
mov [rsp + frame.GPRSAVE + 8 * 1], r12
|
||||
mov [rsp + frame.GPRSAVE + 8 * 2], r13
|
||||
mov [rsp + frame.GPRSAVE + 8 * 3], r14
|
||||
mov [rsp + frame.GPRSAVE + 8 * 4], r15
|
||||
%ifdef WINABI
|
||||
mov [rsp + frame.GPRSAVE + 8 * 5], rsi
|
||||
mov [rsp + frame.GPRSAVE + 8 * 6], rdi
|
||||
%endif
|
||||
|
||||
.updateblock:
|
||||
|
||||
; Load state variables
|
||||
mov a_64, [DIGEST(0)]
|
||||
mov b_64, [DIGEST(1)]
|
||||
mov c_64, [DIGEST(2)]
|
||||
mov d_64, [DIGEST(3)]
|
||||
mov e_64, [DIGEST(4)]
|
||||
mov f_64, [DIGEST(5)]
|
||||
mov g_64, [DIGEST(6)]
|
||||
mov h_64, [DIGEST(7)]
|
||||
|
||||
; t is an assemble-time counter: the %rep body is expanded 41 times with
; t = 0, 2, ..., 80, so the whole block is emitted as straight-line code.
; RotateState runs once per round; after 80 rounds (a multiple of 8) the
; names a_64..h_64 are bound to their initial registers again, which is what
; lets the "Update digest" code and the next loop iteration use them as-is.
%assign t 0
|
||||
%rep 80/2 + 1
|
||||
; (80 rounds) / (2 rounds/iteration) + (1 iteration)
|
||||
; +1 iteration because the scheduler leads hashing by 1 iteration
|
||||
%if t < 2
|
||||
; BSWAP 2 QWORDS
; (t == 0: only prepares W[0..1] and W+K; no round is computed yet)
|
||||
movdqa xmm1, [XMM_QWORD_BSWAP wrt rip]
|
||||
movdqu xmm0, [MSG(t)]
|
||||
pshufb xmm0, xmm1 ; BSWAP
|
||||
movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
|
||||
paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
|
||||
movdqa [WK_2(t)], xmm0 ; Store into WK for rounds
|
||||
%elif t < 16
|
||||
; BSWAP 2 QWORDS; Compute 2 Rounds
; xmm1 still holds the byte-swap mask loaded in the t < 2 branch:
; SHA512_Round only uses general-purpose registers.
|
||||
movdqu xmm0, [MSG(t)]
|
||||
pshufb xmm0, xmm1 ; BSWAP
|
||||
SHA512_Round t - 2 ; Round t-2
|
||||
movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
|
||||
paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
|
||||
SHA512_Round t - 1 ; Round t-1
; WK is overwritten only after both rounds above have consumed it.
|
||||
movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK
|
||||
%elif t < 79
|
||||
; Schedule 2 QWORDS; Compute 2 Rounds
|
||||
SHA512_2Sched_2Round_sse t
|
||||
%else
|
||||
; Compute 2 Rounds
; (t == 80: nothing left to schedule, only rounds 78 and 79 remain)
|
||||
SHA512_Round t - 2
|
||||
SHA512_Round t - 1
|
||||
%endif
|
||||
%assign t t+2
|
||||
%endrep
|
||||
|
||||
; Update digest
|
||||
add [DIGEST(0)], a_64
|
||||
add [DIGEST(1)], b_64
|
||||
add [DIGEST(2)], c_64
|
||||
add [DIGEST(3)], d_64
|
||||
add [DIGEST(4)], e_64
|
||||
add [DIGEST(5)], f_64
|
||||
add [DIGEST(6)], g_64
|
||||
add [DIGEST(7)], h_64
|
||||
|
||||
; Advance to next message block
|
||||
add msg, 16*8
|
||||
dec msglen
|
||||
jnz .updateblock
|
||||
|
||||
; Restore GPRs
|
||||
mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
|
||||
mov r12, [rsp + frame.GPRSAVE + 8 * 1]
|
||||
mov r13, [rsp + frame.GPRSAVE + 8 * 2]
|
||||
mov r14, [rsp + frame.GPRSAVE + 8 * 3]
|
||||
mov r15, [rsp + frame.GPRSAVE + 8 * 4]
|
||||
%ifdef WINABI
|
||||
mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
|
||||
mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
|
||||
%endif
|
||||
; Restore Stack Pointer
|
||||
add rsp, frame_size
|
||||
|
||||
.nowork:
|
||||
ret
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; Binary Data
|
||||
|
||||
section .data
|
||||
|
||||
ALIGN 16
|
||||
|
||||
; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
|
||||
XMM_QWORD_BSWAP:
|
||||
ddq 0x08090a0b0c0d0e0f0001020304050607
|
||||
|
||||
; K[t] used in SHA512 hashing
|
||||
K512:
|
||||
dq 0x428a2f98d728ae22,0x7137449123ef65cd
|
||||
dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
|
||||
dq 0x3956c25bf348b538,0x59f111f1b605d019
|
||||
dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
|
||||
dq 0xd807aa98a3030242,0x12835b0145706fbe
|
||||
dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
|
||||
dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
|
||||
dq 0x9bdc06a725c71235,0xc19bf174cf692694
|
||||
dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
|
||||
dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
|
||||
dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
|
||||
dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
|
||||
dq 0x983e5152ee66dfab,0xa831c66d2db43210
|
||||
dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
|
||||
dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
|
||||
dq 0x06ca6351e003826f,0x142929670a0e6e70
|
||||
dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
|
||||
dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
|
||||
dq 0x650a73548baf63de,0x766a0abb3c77b2a8
|
||||
dq 0x81c2c92e47edaee6,0x92722c851482353b
|
||||
dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
|
||||
dq 0xc24b8b70d0f89791,0xc76c51a30654be30
|
||||
dq 0xd192e819d6ef5218,0xd69906245565a910
|
||||
dq 0xf40e35855771202a,0x106aa07032bbd1b8
|
||||
dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
|
||||
dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
|
||||
dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
|
||||
dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
|
||||
dq 0x748f82ee5defb2fc,0x78a5636f43172f60
|
||||
dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
|
||||
dq 0x90befffa23631e28,0xa4506cebde82bde9
|
||||
dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
|
||||
dq 0xca273eceea26619c,0xd186b8c721c0c207
|
||||
dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
|
||||
dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
|
||||
dq 0x113f9804bef90dae,0x1b710b35131c471b
|
||||
dq 0x28db77f523047d84,0x32caab7b40c72493
|
||||
dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
|
||||
dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
|
||||
dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
|
||||
|
|
@ -1,271 +0,0 @@
|
|||
/*-
|
||||
* Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
|
||||
* Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include <config.h>
|
||||
#endif /* HAVE_CONFIG_H */
|
||||
|
||||
#if HAVE_INTTYPES_H
|
||||
# include <inttypes.h>
|
||||
#else
|
||||
# if HAVE_STDINT_H
|
||||
# include <stdint.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
#include <utils.h>
|
||||
#include <sha256.h>
|
||||
|
||||
/*
 * BYTESWAP/BYTESWAP64 convert a host-order 32/64-bit value to big-endian.
 * On big-endian hosts they are the identity.
 */
#ifdef WORDS_BIGENDIAN
|
||||
|
||||
#define BYTESWAP(x) (x)
|
||||
#define BYTESWAP64(x) (x)
|
||||
|
||||
#else /* WORDS_BIGENDIAN */
|
||||
|
||||
/* NOTE(review): htonl/htonll are presumably made visible by utils.h - confirm. */
#define BYTESWAP(x) htonl(x)
|
||||
#define BYTESWAP64(x) htonll(x)
|
||||
|
||||
#endif /* WORDS_BIGENDIAN */
|
||||
/*
 * Signature shared by the block-compression cores (sha256_avx, sha256_sse4):
 * process num_blks consecutive 64-byte blocks from input_data, updating digest.
 */
typedef void (*update_func_ptr)(void *input_data, uint32_t digest[8], uint64_t num_blks);
|
||||
|
||||
/*
 * Source of the SHA256 padding bytes appended by _final(): a single 0x80
 * marker followed by zeros. The table is only ever read, so it is const;
 * elements after the first are zero-initialized by the language.
 */
static const uint8_t padding[64] = {
	0x80
};
|
||||
|
||||
/* Initial hash state loaded into a context by SHA256_Init(). */
static const uint32_t iv256[SHA256_HASH_WORDS] = {
	UINT32_C(0x6a09e667), UINT32_C(0xbb67ae85),
	UINT32_C(0x3c6ef372), UINT32_C(0xa54ff53a),
	UINT32_C(0x510e527f), UINT32_C(0x9b05688c),
	UINT32_C(0x1f83d9ab), UINT32_C(0x5be0cd19)
};
|
||||
|
||||
/*
 * Block-compression routine selected by Init_SHA(). It stays NULL until
 * Init_SHA() has returned 0, so the SHA256/HMAC entry points must not be
 * used before that.
 */
static update_func_ptr sha_update_func;
|
||||
|
||||
int
|
||||
APS_NAMESPACE(Init_SHA) (processor_info_t *pc)
|
||||
{
|
||||
if (pc->proc_type == PROC_X64_INTEL || pc->proc_type == PROC_X64_AMD) {
|
||||
if (pc->avx_level > 0) {
|
||||
sha_update_func = sha256_avx;
|
||||
|
||||
} else if (pc->sse_level >= 4) {
|
||||
sha_update_func = sha256_sse4;
|
||||
|
||||
} else {
|
||||
return (1);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
return (1);
|
||||
}
|
||||
|
||||
static void
|
||||
_init (SHA256_Context *sc, const uint32_t iv[SHA256_HASH_WORDS])
|
||||
{
|
||||
/*
|
||||
* SHA256_HASH_WORDS is 8, must be 8, cannot be anything but 8!
|
||||
* So we unroll a loop here.
|
||||
*/
|
||||
sc->hash[0] = iv[0];
|
||||
sc->hash[1] = iv[1];
|
||||
sc->hash[2] = iv[2];
|
||||
sc->hash[3] = iv[3];
|
||||
sc->hash[4] = iv[4];
|
||||
sc->hash[5] = iv[5];
|
||||
sc->hash[6] = iv[6];
|
||||
sc->hash[7] = iv[7];
|
||||
|
||||
sc->totalLength = 0LL;
|
||||
sc->bufferLength = 0L;
|
||||
}
|
||||
|
||||
/*
 * Prepare sc for a new SHA256 computation: loads the iv256 initial state
 * and clears the length counters via _init().
 */
void
|
||||
APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc)
|
||||
{
|
||||
_init (sc, iv256);
|
||||
}
|
||||
|
||||
void
|
||||
APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, const void *vdata, size_t len)
|
||||
{
|
||||
const uint8_t *data = (const uint8_t *)vdata;
|
||||
uint32_t bufferBytesLeft;
|
||||
size_t bytesToCopy;
|
||||
int rem;
|
||||
|
||||
if (sc->bufferLength) {
|
||||
do {
|
||||
bufferBytesLeft = 64L - sc->bufferLength;
|
||||
bytesToCopy = bufferBytesLeft;
|
||||
if (bytesToCopy > len)
|
||||
bytesToCopy = len;
|
||||
|
||||
memcpy (&sc->buffer.bytes[sc->bufferLength], data, bytesToCopy);
|
||||
sc->totalLength += bytesToCopy * 8L;
|
||||
sc->bufferLength += bytesToCopy;
|
||||
data += bytesToCopy;
|
||||
len -= bytesToCopy;
|
||||
|
||||
if (sc->bufferLength == 64L) {
|
||||
sc->blocks = 1;
|
||||
sha_update_func(sc->buffer.words, sc->hash, sc->blocks);
|
||||
sc->bufferLength = 0L;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
} while (len > 0 && len <= 64L);
|
||||
if (!len) return;
|
||||
}
|
||||
|
||||
sc->blocks = len >> 6;
|
||||
rem = len - (sc->blocks << 6);
|
||||
len = sc->blocks << 6;
|
||||
sc->totalLength += rem * 8L;
|
||||
|
||||
if (len) {
|
||||
sc->totalLength += len * 8L;
|
||||
sha_update_func((uint32_t *)data, sc->hash, sc->blocks);
|
||||
}
|
||||
if (rem) {
|
||||
memcpy (&sc->buffer.bytes[0], data + len, rem);
|
||||
sc->bufferLength = rem;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
_final (SHA256_Context *sc, uint8_t *hash, int hashWords)
|
||||
{
|
||||
uint32_t bytesToPad;
|
||||
uint64_t lengthPad;
|
||||
int i;
|
||||
|
||||
bytesToPad = 120L - sc->bufferLength;
|
||||
if (bytesToPad > 64L)
|
||||
bytesToPad -= 64L;
|
||||
|
||||
lengthPad = BYTESWAP64(sc->totalLength);
|
||||
|
||||
APS_NAMESPACE(SHA256_Update) (sc, padding, bytesToPad);
|
||||
APS_NAMESPACE(SHA256_Update) (sc, &lengthPad, 8L);
|
||||
|
||||
if (hash) {
|
||||
for (i = 0; i < hashWords; i++) {
|
||||
hash[0] = (uint8_t) (sc->hash[i] >> 24);
|
||||
hash[1] = (uint8_t) (sc->hash[i] >> 16);
|
||||
hash[2] = (uint8_t) (sc->hash[i] >> 8);
|
||||
hash[3] = (uint8_t) sc->hash[i];
|
||||
hash += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Finish the computation in sc and store the SHA256_HASH_SIZE-byte digest
 * in hash. A NULL hash still pads and finalizes the context but writes no
 * output (see _final()).
 */
void
|
||||
APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE])
|
||||
{
|
||||
_final (sc, hash, SHA256_HASH_WORDS);
|
||||
}
|
||||
|
||||
/* Initialize an HMAC-SHA256 operation with the given key. */
|
||||
void
|
||||
APS_NAMESPACE(HMAC_SHA256_Init) (HMAC_SHA256_Context * ctx, const void * _K, size_t Klen)
|
||||
{
|
||||
unsigned char pad[64];
|
||||
unsigned char khash[32];
|
||||
const unsigned char * K = (const unsigned char *)_K;
|
||||
size_t i;
|
||||
|
||||
/* If Klen > 64, the key is really SHA256(K). */
|
||||
if (Klen > 64) {
|
||||
APS_NAMESPACE(SHA256_Init)(&ctx->ictx);
|
||||
APS_NAMESPACE(SHA256_Update)(&ctx->ictx, K, Klen);
|
||||
APS_NAMESPACE(SHA256_Final)(&ctx->ictx, khash);
|
||||
K = khash;
|
||||
Klen = 32;
|
||||
}
|
||||
|
||||
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
||||
APS_NAMESPACE(SHA256_Init)(&ctx->ictx);
|
||||
memset(pad, 0x36, 64);
|
||||
for (i = 0; i < Klen; i++)
|
||||
pad[i] ^= K[i];
|
||||
APS_NAMESPACE(SHA256_Update)(&ctx->ictx, pad, 64);
|
||||
|
||||
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
||||
APS_NAMESPACE(SHA256_Init)(&ctx->octx);
|
||||
memset(pad, 0x5c, 64);
|
||||
for (i = 0; i < Klen; i++)
|
||||
pad[i] ^= K[i];
|
||||
APS_NAMESPACE(SHA256_Update)(&ctx->octx, pad, 64);
|
||||
|
||||
/* Clean the stack. */
|
||||
memset(khash, 0, 32);
|
||||
}
|
||||
|
||||
/*
 * Add len bytes starting at in to the HMAC-SHA256 operation.
 * Message data only ever enters the inner hash; the outer hash is touched
 * again only by HMAC_SHA256_Final().
 */
|
||||
void
|
||||
APS_NAMESPACE(HMAC_SHA256_Update) (HMAC_SHA256_Context * ctx, const void *in, size_t len)
|
||||
{
|
||||
/* Feed data to the inner SHA256 operation. */
|
||||
APS_NAMESPACE(SHA256_Update)(&ctx->ictx, in, len);
|
||||
}
|
||||
|
||||
/* Finish an HMAC-SHA256 operation. */
|
||||
void
|
||||
APS_NAMESPACE(HMAC_SHA256_Final) (HMAC_SHA256_Context * ctx, unsigned char digest[32])
|
||||
{
|
||||
unsigned char ihash[32];
|
||||
|
||||
/* Finish the inner SHA256 operation. */
|
||||
APS_NAMESPACE(SHA256_Final)(&ctx->ictx, ihash);
|
||||
|
||||
/* Feed the inner hash to the outer SHA256 operation. */
|
||||
APS_NAMESPACE(SHA256_Update)(&ctx->octx, ihash, 32);
|
||||
|
||||
/* Finish the outer SHA256 operation. */
|
||||
APS_NAMESPACE(SHA256_Final)(&ctx->octx, digest);
|
||||
|
||||
/* Clean the stack. */
|
||||
memset(ihash, 0, 32);
|
||||
}
|
|
@ -1,90 +0,0 @@
|
|||
/*-
|
||||
* Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
|
||||
* Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _APS_SHA256_H
|
||||
#define _APS_SHA256_H
|
||||
|
||||
#if HAVE_INTTYPES_H
|
||||
# include <inttypes.h>
|
||||
#else
|
||||
# if HAVE_STDINT_H
|
||||
# include <stdint.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include <utils.h>
|
||||
|
||||
#define SHA256_HASH_SIZE 32
|
||||
|
||||
/* Hash size in 32-bit words */
|
||||
#define SHA256_HASH_WORDS 8
|
||||
|
||||
/* State of one in-progress SHA256 computation. */
typedef struct _SHA256_Context {
|
||||
/*
 * totalLength: message length absorbed so far, in bits.
 * blocks: scratch; number of 64-byte blocks handed to the compression
 * routine by the most recent update.
 */
uint64_t totalLength, blocks;
|
||||
/* Current chaining value. */
uint32_t hash[SHA256_HASH_WORDS];
|
||||
/* Number of bytes currently held in buffer (kept below 64 between calls). */
uint32_t bufferLength;
|
||||
/* Holds a partial input block until 64 bytes have accumulated. */
union {
|
||||
uint32_t words[16];
|
||||
uint8_t bytes[64];
|
||||
} buffer;
|
||||
} SHA256_Context;
|
||||
|
||||
/*
 * State of one in-progress HMAC-SHA256 computation: the inner hash absorbs
 * the 0x36-padded key followed by the message, the outer hash absorbs the
 * 0x5c-padded key followed by the inner digest.
 */
typedef struct HMAC_SHA256Context {
|
||||
SHA256_Context ictx;
|
||||
SHA256_Context octx;
|
||||
} HMAC_SHA256_Context;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef APS_NAMESPACE
|
||||
#define APS_NAMESPACE(name) opt_##name
|
||||
#endif /* !APS_NAMESPACE */
|
||||
|
||||
/*
 * Plain SHA256: Init resets the context, Update absorbs len bytes, Final
 * pads and writes the 32-byte digest.
 * Init_SHA must be called once and return 0 (a supported AVX or SSE4 core
 * was selected) before any Update/Final call; it returns 1 otherwise.
 */
void APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc);
|
||||
void APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, const void *data, size_t len);
|
||||
void APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE]);
|
||||
int APS_NAMESPACE(Init_SHA) (processor_info_t *pc);
|
||||
|
||||
/* HMAC-SHA256 with a key of Klen bytes at _K; the MAC is 32 bytes. */
void APS_NAMESPACE(HMAC_SHA256_Init) (HMAC_SHA256_Context * ctx, const void * _K, size_t Klen);
|
||||
void APS_NAMESPACE(HMAC_SHA256_Update) (HMAC_SHA256_Context * ctx, const void *in, size_t len);
|
||||
void APS_NAMESPACE(HMAC_SHA256_Final) (HMAC_SHA256_Context * ctx, unsigned char digest[32]);
|
||||
|
||||
/*
|
||||
* Intel's optimized SHA256 core routines. These routines are described in an
|
||||
* Intel White-Paper:
|
||||
* "Fast SHA-256 Implementations on Intel Architecture Processors"
 *
 * Each call processes num_blks consecutive 64-byte blocks from input_data
 * and updates digest in place.
|
||||
*/
|
||||
extern void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);
|
||||
extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* !_APS_SHA256_H */
|
294
crypto/sha2/sha512.c
Normal file
294
crypto/sha2/sha512.c
Normal file
|
@ -0,0 +1,294 @@
|
|||
/*-
|
||||
* Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
|
||||
* Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include <config.h>
|
||||
#endif /* HAVE_CONFIG_H */
|
||||
|
||||
#if HAVE_INTTYPES_H
|
||||
# include <inttypes.h>
|
||||
#else
|
||||
# if HAVE_STDINT_H
|
||||
# include <stdint.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
#include <utils.h>
|
||||
#include "sha512.h"
|
||||
|
||||
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
|
||||
#define BYTESWAP(x) (x)
|
||||
#define BYTESWAP64(x) (x)
|
||||
|
||||
#else /* WORDS_BIGENDIAN */
|
||||
|
||||
#define BYTESWAP(x) htonl(x)
|
||||
#define BYTESWAP64(x) htonll(x)
|
||||
|
||||
#endif /* WORDS_BIGENDIAN */
|
||||
|
||||
/* Signature shared by the block-compression back ends (sha512_avx, sha512_sse4). */
typedef void (*update_func_ptr)(const void *input_data, void *digest, uint64_t num_blks);
|
||||
|
||||
/*
 * Padding source for _final(): a single 0x80 marker byte followed by zeros.
 * Only the first element is spelled out; the remaining 127 bytes are
 * zero-initialized by the language.
 */
static const uint8_t padding[128] = { 0x80 };
|
||||
|
||||
/* Initial hash state loaded into a context by SHA512_Init(). */
static const uint64_t iv512[SHA512_HASH_WORDS] = {
	0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
	0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
	0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
	0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
|
||||
|
||||
/* Initial hash state loaded into a context by SHA512t256_Init(). */
static const uint64_t iv256[SHA512_HASH_WORDS] = {
	0x22312194fc2bf72cULL, 0x9f555fa3c84c64c2ULL,
	0x2393b86b6f53b151ULL, 0x963877195940eabdULL,
	0x96283ee2a88effe3ULL, 0xbe5e1e2553863992ULL,
	0x2b0199fc2c85b8aaULL, 0x0eb72ddc81c52ca2ULL
};
|
||||
|
||||
/* Back end selected by Init_SHA512(); remains NULL (static default) until that call succeeds. */
static update_func_ptr sha512_update_func;
|
||||
|
||||
int
|
||||
APS_NAMESPACE(Init_SHA512) (processor_info_t *pc)
|
||||
{
|
||||
if (pc->proc_type == PROC_X64_INTEL || pc->proc_type == PROC_X64_AMD) {
|
||||
if (pc->avx_level > 0) {
|
||||
sha512_update_func = sha512_avx;
|
||||
|
||||
} else if (pc->sse_level >= 4) {
|
||||
sha512_update_func = sha512_sse4;
|
||||
|
||||
} else {
|
||||
return (1);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
return (1);
|
||||
}
|
||||
|
||||
static void
|
||||
_init (SHA512_Context *sc, const uint64_t iv[SHA512_HASH_WORDS])
|
||||
{
|
||||
int i;
|
||||
|
||||
sc->totalLength[0] = 0LL;
|
||||
sc->totalLength[1] = 0LL;
|
||||
for (i = 0; i < SHA512_HASH_WORDS; i++)
|
||||
sc->hash[i] = iv[i];
|
||||
sc->bufferLength = 0L;
|
||||
}
|
||||
|
||||
/* Prepare sc for a new SHA-512 computation by loading the iv512 state. */
void
|
||||
APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc)
|
||||
{
|
||||
_init (sc, iv512);
|
||||
}
|
||||
|
||||
/* Prepare sc for a new SHA-512/256 computation by loading the iv256 state. */
void
|
||||
APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc)
|
||||
{
|
||||
_init (sc, iv256);
|
||||
}
|
||||
|
||||
void
|
||||
APS_NAMESPACE(SHA512_Update) (SHA512_Context *sc, const void *vdata, size_t len)
|
||||
{
|
||||
const uint8_t *data = (const uint8_t *)vdata;
|
||||
uint32_t bufferBytesLeft;
|
||||
size_t bytesToCopy;
|
||||
int rem;
|
||||
uint64_t carryCheck;
|
||||
|
||||
if (sc->bufferLength) {
|
||||
do {
|
||||
bufferBytesLeft = 128L - sc->bufferLength;
|
||||
bytesToCopy = bufferBytesLeft;
|
||||
if (bytesToCopy > len)
|
||||
bytesToCopy = len;
|
||||
|
||||
memcpy (&sc->buffer.bytes[sc->bufferLength], data, bytesToCopy);
|
||||
carryCheck = sc->totalLength[1];
|
||||
sc->totalLength[1] += bytesToCopy * 8L;
|
||||
if (sc->totalLength[1] < carryCheck)
|
||||
sc->totalLength[0]++;
|
||||
|
||||
sc->bufferLength += bytesToCopy;
|
||||
data += bytesToCopy;
|
||||
len -= bytesToCopy;
|
||||
|
||||
if (sc->bufferLength == 128L) {
|
||||
sc->blocks = 1;
|
||||
sha512_update_func(sc->buffer.words, sc->hash, sc->blocks);
|
||||
sc->bufferLength = 0L;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
} while (len > 0 && len <= 128L);
|
||||
if (!len) return;
|
||||
}
|
||||
|
||||
sc->blocks = len >> 7;
|
||||
rem = len - (sc->blocks << 7);
|
||||
len = sc->blocks << 7;
|
||||
carryCheck = sc->totalLength[1];
|
||||
sc->totalLength[1] += rem * 8L;
|
||||
if (sc->totalLength[1] < carryCheck)
|
||||
sc->totalLength[0]++;
|
||||
|
||||
if (len) {
|
||||
carryCheck = sc->totalLength[1];
|
||||
sc->totalLength[1] += len * 8L;
|
||||
if (sc->totalLength[1] < carryCheck)
|
||||
sc->totalLength[0]++;
|
||||
sha512_update_func((uint32_t *)data, sc->hash, sc->blocks);
|
||||
}
|
||||
if (rem) {
|
||||
memcpy (&sc->buffer.bytes[0], data + len, rem);
|
||||
sc->bufferLength = rem;
|
||||
}
|
||||
}
|
||||
|
||||
/* SHA-512/256 data feed: forwards unchanged to SHA512_Update. */
void
|
||||
APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len)
|
||||
{
|
||||
APS_NAMESPACE(SHA512_Update) (sc, data, len);
|
||||
}
|
||||
|
||||
static void
|
||||
_final (SHA512_Context *sc, uint8_t *hash, int hashWords, int halfWord)
|
||||
{
|
||||
uint32_t bytesToPad;
|
||||
uint64_t lengthPad[2];
|
||||
int i;
|
||||
|
||||
bytesToPad = 240L - sc->bufferLength;
|
||||
if (bytesToPad > 128L)
|
||||
bytesToPad -= 128L;
|
||||
|
||||
lengthPad[0] = BYTESWAP64(sc->totalLength[0]);
|
||||
lengthPad[1] = BYTESWAP64(sc->totalLength[1]);
|
||||
|
||||
APS_NAMESPACE(SHA512_Update) (sc, padding, bytesToPad);
|
||||
APS_NAMESPACE(SHA512_Update) (sc, lengthPad, 16L);
|
||||
|
||||
if (hash) {
|
||||
for (i = 0; i < hashWords; i++) {
|
||||
*((uint64_t *) hash) = BYTESWAP64(sc->hash[i]);
|
||||
hash += 8;
|
||||
}
|
||||
if (halfWord) {
|
||||
hash[0] = (uint8_t) (sc->hash[i] >> 56);
|
||||
hash[1] = (uint8_t) (sc->hash[i] >> 48);
|
||||
hash[2] = (uint8_t) (sc->hash[i] >> 40);
|
||||
hash[3] = (uint8_t) (sc->hash[i] >> 32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Finish a SHA-512 computation and emit all SHA512_HASH_WORDS state words into hash. */
void
|
||||
APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE])
|
||||
{
|
||||
_final (sc, hash, SHA512_HASH_WORDS, 0);
|
||||
}
|
||||
|
||||
/* Finish a SHA-512/256 computation; only the first SHA512t256_HASH_WORDS state words are emitted. */
void
|
||||
APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE])
|
||||
{
|
||||
_final (sc, hash, SHA512t256_HASH_WORDS, 0);
|
||||
}
|
||||
|
||||
#define HASH_CONTEXT SHA512_Context
|
||||
#define HASH_INIT APS_NAMESPACE(SHA512_Init)
|
||||
#define HASH_UPDATE APS_NAMESPACE(SHA512_Update)
|
||||
#define HASH_FINAL APS_NAMESPACE(SHA512_Final)
|
||||
#define HASH_SIZE SHA512_HASH_SIZE
|
||||
#define HASH_BLOCK_SIZE 128
|
||||
|
||||
#define HMAC_CONTEXT HMAC_SHA512_Context
|
||||
#define HMAC_INIT APS_NAMESPACE(HMAC_SHA512_Init)
|
||||
#define HMAC_UPDATE APS_NAMESPACE(HMAC_SHA512_Update)
|
||||
#define HMAC_FINAL APS_NAMESPACE(HMAC_SHA512_Final)
|
||||
|
||||
#include "_hmac.c"
|
||||
|
||||
#undef HASH_CONTEXT
|
||||
#undef HASH_INIT
|
||||
#undef HASH_UPDATE
|
||||
#undef HASH_FINAL
|
||||
#undef HASH_SIZE
|
||||
#undef HASH_BLOCK_SIZE
|
||||
#undef HMAC_CONTEXT
|
||||
#undef HMAC_INIT
|
||||
#undef HMAC_UPDATE
|
||||
#undef HMAC_FINAL
|
||||
|
||||
#define HASH_CONTEXT SHA512_Context
|
||||
#define HASH_INIT APS_NAMESPACE(SHA512t256_Init)
|
||||
#define HASH_UPDATE APS_NAMESPACE(SHA512t256_Update)
|
||||
#define HASH_FINAL APS_NAMESPACE(SHA512t256_Final)
|
||||
#define HASH_SIZE SHA512t256_HASH_SIZE
|
||||
#define HASH_BLOCK_SIZE 128
|
||||
|
||||
#define HMAC_CONTEXT HMAC_SHA512_Context
|
||||
#define HMAC_INIT APS_NAMESPACE(HMAC_SHA512t256_Init)
|
||||
#define HMAC_UPDATE APS_NAMESPACE(HMAC_SHA512t256_Update)
|
||||
#define HMAC_FINAL APS_NAMESPACE(HMAC_SHA512t256_Final)
|
||||
|
||||
#include "_hmac.c"
|
||||
|
103
crypto/sha2/sha512.h
Normal file
103
crypto/sha2/sha512.h
Normal file
|
@ -0,0 +1,103 @@
|
|||
/*-
|
||||
* Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
|
||||
* Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _APS_SHA512_H
|
||||
#define _APS_SHA512_H
|
||||
|
||||
#if HAVE_INTTYPES_H
|
||||
# include <inttypes.h>
|
||||
#else
|
||||
# if HAVE_STDINT_H
|
||||
# include <stdint.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include <utils.h>
|
||||
|
||||
#define SHA512_HASH_SIZE 64
|
||||
#define SHA512t256_HASH_SIZE 32
|
||||
|
||||
/* Hash size in 64-bit words */
|
||||
#define SHA512_HASH_WORDS 8
|
||||
#define SHA512t256_HASH_WORDS 4
|
||||
|
||||
/* Working state for one SHA-512 or SHA-512/256 computation. */
typedef struct _SHA512_Context {
|
||||
uint64_t totalLength[2], blocks; /* totalLength: message length in bits, [0] high word, [1] low word; blocks: block count handed to the compression routine */
|
||||
uint64_t hash[SHA512_HASH_WORDS]; /* current hash state */
|
||||
uint32_t bufferLength; /* number of bytes currently held in buffer */
|
||||
union {
|
||||
uint64_t words[16]; /* the pending 128-byte block viewed as 64-bit words */
|
||||
uint8_t bytes[128]; /* ... and as raw bytes */
|
||||
} buffer;
|
||||
} SHA512_Context;
|
||||
|
||||
/*
 * Pair of hash contexts carried through an HMAC computation; shared by the
 * HMAC_SHA512 and HMAC_SHA512t256 function families.
 * NOTE(review): presumably the outer and inner hashes of HMAC; they are
 * driven by _hmac.c, which is not visible here - confirm.
 */
typedef struct {
|
||||
SHA512_Context outer;
|
||||
SHA512_Context inner;
|
||||
} HMAC_SHA512_Context;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef APS_NAMESPACE
|
||||
#define APS_NAMESPACE(name) opt_##name
|
||||
#endif /* !APS_NAMESPACE */
|
||||
|
||||
void APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc);
|
||||
void APS_NAMESPACE(SHA512_Update) (SHA512_Context *sc, const void *data, size_t len);
|
||||
void APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE]);
|
||||
int APS_NAMESPACE(Init_SHA512) (processor_info_t *pc);
|
||||
|
||||
/* SHA-512/256 uses the same context structure as SHA-512. */
|
||||
#define SHA512t256_Context SHA512_Context
|
||||
void APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc);
|
||||
void APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len);
|
||||
void APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE]);
|
||||
|
||||
void APS_NAMESPACE(HMAC_SHA512_Init) (HMAC_SHA512_Context *ctxt, const void *key, size_t keyLen);
|
||||
void APS_NAMESPACE(HMAC_SHA512_Update) (HMAC_SHA512_Context *ctxt, const void *data, size_t len);
|
||||
void APS_NAMESPACE(HMAC_SHA512_Final) (HMAC_SHA512_Context *ctxt, uint8_t hmac[SHA512_HASH_SIZE]);
|
||||
|
||||
void APS_NAMESPACE(HMAC_SHA512t256_Init) (HMAC_SHA512_Context *ctxt, const void *key, size_t keyLen);
|
||||
void APS_NAMESPACE(HMAC_SHA512t256_Update) (HMAC_SHA512_Context *ctxt, const void *data, size_t len);
|
||||
void APS_NAMESPACE(HMAC_SHA512t256_Final) (HMAC_SHA512_Context *ctxt, uint8_t hmac[SHA512t256_HASH_SIZE]);
|
||||
|
||||
/*
|
||||
* Intel's optimized SHA512 core routines. These routines are described in an
|
||||
* Intel White-Paper:
|
||||
* "Fast SHA-512 Implementations on Intel Architecture Processors"
|
||||
* Note: Works on AMD Bulldozer and later as well.
|
||||
*/
|
||||
extern void sha512_sse4(const void *input_data, void *digest, uint64_t num_blks);
|
||||
extern void sha512_avx(const void *input_data, void *digest, uint64_t num_blks);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* !_APS_SHA512_H */
|
2
main.c
2
main.c
|
@ -2149,6 +2149,7 @@ main(int argc, char *argv[])
|
|||
level = 6;
|
||||
err = 0;
|
||||
slab_init();
|
||||
init_pcompress();
|
||||
|
||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEew:rLPS:B:F")) != -1) {
|
||||
int ovr;
|
||||
|
@ -2341,7 +2342,6 @@ main(int argc, char *argv[])
|
|||
exit(1);
|
||||
}
|
||||
main_cancel = 0;
|
||||
init_pcompress();
|
||||
|
||||
if (cksum == 0)
|
||||
get_checksum_props(DEFAULT_CKSUM, &cksum, &cksum_bytes, &mac_bytes);
|
||||
|
|
|
@ -20,7 +20,6 @@ void * (*xxh32_init)(unsigned int seed) = NULL;
|
|||
int (*xxh32_feed)(void* state, const void* input, int len) = NULL;
|
||||
unsigned int (*xxh32_result)(void* state) = NULL;
|
||||
unsigned int (*xxh32_getIntermediateResult)(void* state) = NULL;
|
||||
#include <stdio.h>
|
||||
|
||||
void
|
||||
XXH32_module_init() {
|
||||
|
|
Loading…
Reference in a new issue