diff --git a/Makefile.in b/Makefile.in index de63368..0db9404 100644 --- a/Makefile.in +++ b/Makefile.in @@ -102,12 +102,11 @@ SKEINHDRS = crypto/skein/brg_endian.h crypto/skein/SHA3api_ref.h \ crypto/skein/skein_debug.h crypto/skein/skein_iv.h SKEINOBJS = $(SKEINSRCS:.c=.o) -SHA256_SRCS = crypto/sha2/sha256.c -SHA256_HDRS = crypto/sha2/sha256.h -SHA256ASM_SRCS = crypto/sha2/intel/sha256_avx1.asm \ - crypto/sha2/intel/sha256_sse4.asm -SHA256ASM_OBJS = $(SHA256ASM_SRCS:.asm=.o) -SHA256_OBJS = $(SHA256_SRCS:.c=.o) +SHA2_SRCS = crypto/sha2/sha512.c +SHA2_HDRS = crypto/sha2/sha512.h +SHA2ASM_SRCS = crypto/sha2/intel/sha512_avx.asm crypto/sha2/intel/sha512_sse4.asm +SHA2ASM_OBJS = $(SHA2ASM_SRCS:.asm=.o) +SHA2_OBJS = $(SHA2_SRCS:.c=.o) YASM = @YASM@ -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX LIBBSCWRAP = libbsc_compress.c @@ -161,7 +160,7 @@ LDLIBS = -ldl -L./buildtmp -Wl,-R@LIBBZ2_DIR@ -lbz2 -L./buildtmp -Wl,-R@LIBZ_DIR -L./buildtmp -Wl,-R@OPENSSL_LIBDIR@ -lcrypto -lrt $(EXTRA_LDFLAGS) OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \ -$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \ +$(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \ $(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) DEBUG_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ @@ -199,7 +198,7 @@ SSE3_OPT_FLAG = -mssse3 SSE2_OPT_FLAG = -msse2 SKEIN_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@ -SHA256_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@ +SHA2_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@ KECCAK_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@ all: $(PROG) @@ -237,10 +236,10 @@ $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC) $(SKEINOBJS): $(SKEINSRCS) $(SKEINHDRS) $(COMPILE) $(SKEIN_FLAGS) $(@:.o=.c) -o $@ -$(SHA256_OBJS): $(SHA256_SRCS) $(SHA256_HDRS) - $(COMPILE) $(SHA256_FLAGS) $(@:.o=.c) -o $@ +$(SHA2_OBJS): $(SHA2_SRCS) $(SHA2_HDRS) + $(COMPILE) $(SHA2_FLAGS) $(@:.o=.c) -o $@ -$(SHA256ASM_OBJS): $(SHA256ASM_SRCS) +$(SHA2ASM_OBJS): $(SHA2ASM_SRCS) $(YASM) -o $@ $(@:.o=.asm) $(KECCAK_OBJS): $(KECCAK_SRCS) $(KECCAK_HDRS) diff --git a/config b/config index 12139e2..a3550e5 100755 --- a/config +++ b/config @@ -236,8 +236,8 @@ then # Minimum yasm version 1.1 [ $major -lt 1 -o $minor -lt 1 ] && continue yasm=${bindir}/yasm - sha256asmobjs='\$\(SHA256ASM_OBJS\)' - sha256objs='\$\(SHA256_OBJS\)' + sha256asmobjs='\$\(SHA2ASM_OBJS\)' + sha256objs='\$\(SHA2_OBJS\)' fi done if [ "x${yasm}" = "x" ] @@ -492,8 +492,8 @@ libbsclflagsvar="LIBBSCLFLAGS" libbscwrapobjvar="LIBBSCWRAPOBJ" libbscgenoptvar="LIBBSCGEN_OPT" libbsccppflagsvar="LIBBSCCPPFLAGS" -sha256asmobjsvar="SHA256ASM_OBJS" -sha256objsvar="SHA256_OBJS" +sha256asmobjsvar="SHA2ASM_OBJS" +sha256objsvar="SHA2_OBJS" yasmvar="YASM" fptr_flag_var="FPTR_FLAG" extra_opt_flags_var="EXTRA_OPT_FLAGS" diff --git a/crypto/crypto_utils.c b/crypto/crypto_utils.c index a300103..b3131b3 100644 --- a/crypto/crypto_utils.c +++ b/crypto/crypto_utils.c @@ -36,7 +36,8 @@ #include #include #include -#include +//#include +#include #include #include #include @@ -46,7 +47,7 @@ #define PROVIDER_OPENSSL 0 #define PROVIDER_X64_OPT 1 -static void init_sha256(void); +static void init_sha512(void); static int geturandom_bytes(uchar_t rbytes[32]); /* * Checksum properties @@ -66,9 +67,9 @@ static struct { {"SKEIN512", "512-bit SKEIN", CKSUM_SKEIN512, 64, 64, NULL}, {"SHA256", 
"Intel's optimized (SSE,AVX) 256-bit SHA2 implementation for x86.", - CKSUM_SHA256, 32, 32, init_sha256}, + CKSUM_SHA256, 32, 32, init_sha512}, {"SHA512", "512-bit SHA2 from OpenSSL's crypto library.", - CKSUM_SHA512, 64, 64, NULL}, + CKSUM_SHA512, 64, 64, init_sha512}, {"KECCAK256", "Official 256-bit NIST SHA3 optimized implementation.", CKSUM_KECCAK256, 32, 32, NULL}, {"KECCAK512", "Official 512-bit NIST SHA3 optimized implementation.", @@ -190,18 +191,26 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes) SHA256_Update(&ctx, buf, bytes); SHA256_Final(cksum_buf, &ctx); } else { - SHA256_Context ctx; + SHA512_Context ctx; - opt_SHA256_Init(&ctx); - opt_SHA256_Update(&ctx, buf, bytes); - opt_SHA256_Final(&ctx, cksum_buf); + opt_SHA512t256_Init(&ctx); + opt_SHA512t256_Update(&ctx, buf, bytes); + opt_SHA512t256_Final(&ctx, cksum_buf); } } else if (cksum == CKSUM_SHA512) { - SHA512_CTX ctx; + if (cksum_provider == PROVIDER_OPENSSL) { + SHA512_CTX ctx; - SHA512_Init(&ctx); - SHA512_Update(&ctx, buf, bytes); - SHA512_Final(cksum_buf, &ctx); + SHA512_Init(&ctx); + SHA512_Update(&ctx, buf, bytes); + SHA512_Final(cksum_buf, &ctx); + } else { + SHA512_Context ctx; + + opt_SHA512_Init(&ctx); + opt_SHA512_Update(&ctx, buf, bytes); + opt_SHA512_Final(&ctx, cksum_buf); + } } else if (cksum == CKSUM_KECCAK256) { if (Keccak_Hash(256, buf, bytes * 8, cksum_buf) != 0) @@ -219,7 +228,7 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes) } static void -init_sha256(void) +init_sha512(void) { #ifdef WORDS_BIGENDIAN cksum_provider = PROVIDER_OPENSSL; @@ -227,7 +236,7 @@ init_sha256(void) #ifdef __x86_64__ cksum_provider = PROVIDER_OPENSSL; if (proc_info.proc_type == PROC_X64_INTEL || proc_info.proc_type == PROC_X64_AMD) { - if (opt_Init_SHA(&proc_info) == 0) { + if (opt_Init_SHA512(&proc_info) == 0) { cksum_provider = PROVIDER_X64_OPT; } } @@ -355,7 +364,7 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx) } mctx->mac_ctx_reinit = ctx; } else { - HMAC_SHA256_Context *ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context)); +/* HMAC_SHA256_Context *ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context)); if (!ctx) return (-1); opt_HMAC_SHA256_Init(ctx, actx->pkey, KEYLEN); mctx->mac_ctx = ctx; @@ -366,26 +375,54 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx) return (-1); } memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA256_Context)); + mctx->mac_ctx_reinit = ctx;*/ + + HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context)); + if (!ctx) return (-1); + opt_HMAC_SHA512t256_Init(ctx, actx->pkey, KEYLEN); + mctx->mac_ctx = ctx; + + ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context)); + if (!ctx) { + free(mctx->mac_ctx); + return (-1); + } + memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA512_Context)); mctx->mac_ctx_reinit = ctx; } } else if (cksum == CKSUM_SHA512) { - HMAC_CTX *ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX)); - if (!ctx) return (-1); - HMAC_CTX_init(ctx); - HMAC_Init_ex(ctx, actx->pkey, KEYLEN, EVP_sha512(), NULL); - mctx->mac_ctx = ctx; + if (cksum_provider == PROVIDER_OPENSSL) { + HMAC_CTX *ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX)); + if (!ctx) return (-1); + HMAC_CTX_init(ctx); + HMAC_Init_ex(ctx, actx->pkey, KEYLEN, EVP_sha512(), NULL); + mctx->mac_ctx = ctx; - ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX)); - if (!ctx) { - free(mctx->mac_ctx); - return (-1); + ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX)); + if (!ctx) { + free(mctx->mac_ctx); + return (-1); + } + 
if (!HMAC_CTX_copy(ctx, (HMAC_CTX *)(mctx->mac_ctx))) { + free(ctx); + free(mctx->mac_ctx); + return (-1); + } + mctx->mac_ctx_reinit = ctx; + } else { + HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context)); + if (!ctx) return (-1); + opt_HMAC_SHA512_Init(ctx, actx->pkey, KEYLEN); + mctx->mac_ctx = ctx; + + ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context)); + if (!ctx) { + free(mctx->mac_ctx); + return (-1); + } + memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA512_Context)); + mctx->mac_ctx_reinit = ctx; } - if (!HMAC_CTX_copy(ctx, (HMAC_CTX *)(mctx->mac_ctx))) { - free(ctx); - free(mctx->mac_ctx); - return (-1); - } - mctx->mac_ctx_reinit = ctx; } else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) { hashState *ctx = (hashState *)malloc(sizeof (hashState)); @@ -423,16 +460,13 @@ hmac_reinit(mac_ctx_t *mctx) if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) { memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (Skein_512_Ctxt_t)); - } else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) { + } else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) { if (cksum_provider == PROVIDER_OPENSSL) { HMAC_CTX_copy((HMAC_CTX *)(mctx->mac_ctx), (HMAC_CTX *)(mctx->mac_ctx_reinit)); } else { - memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (HMAC_SHA256_Context)); + memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (HMAC_SHA512_Context)); } - } else if (cksum == CKSUM_SHA512) { - HMAC_CTX_copy((HMAC_CTX *)(mctx->mac_ctx), (HMAC_CTX *)(mctx->mac_ctx_reinit)); - } else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) { memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (hashState)); } else { @@ -458,15 +492,19 @@ hmac_update(mac_ctx_t *mctx, uchar_t *data, uint64_t len) HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len); #endif } else { - opt_HMAC_SHA256_Update((HMAC_SHA256_Context *)(mctx->mac_ctx), data, len); + opt_HMAC_SHA512t256_Update((HMAC_SHA512_Context *)(mctx->mac_ctx), data, len); } } else if (cksum == CKSUM_SHA512) { + if (cksum_provider == PROVIDER_OPENSSL) { #ifndef __OSSL_OLD__ - if (HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len) == 0) - return (-1); + if (HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len) == 0) + return (-1); #else - HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len); + HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len); #endif + } else { + opt_HMAC_SHA512_Update((HMAC_SHA512_Context *)(mctx->mac_ctx), data, len); + } } else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) { // Keccak takes data length in bits so we have to scale @@ -503,12 +541,16 @@ hmac_final(mac_ctx_t *mctx, uchar_t *hash, unsigned int *len) if (cksum_provider == PROVIDER_OPENSSL) { HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len); } else { - opt_HMAC_SHA256_Final((HMAC_SHA256_Context *)(mctx->mac_ctx), hash); + opt_HMAC_SHA512t256_Final((HMAC_SHA512_Context *)(mctx->mac_ctx), hash); *len = 32; } } else if (cksum == CKSUM_SHA512) { - HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len); - + if (cksum_provider == PROVIDER_OPENSSL) { + HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len); + } else { + opt_HMAC_SHA512_Final((HMAC_SHA512_Context *)(mctx->mac_ctx), hash); + *len = 64; + } } else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) { if (Keccak_Final((hashState *)(mctx->mac_ctx), hash) != 0) return (-1); @@ -531,18 +573,14 @@ hmac_cleanup(mac_ctx_t *mctx) memset(mctx->mac_ctx, 0, sizeof (Skein_512_Ctxt_t)); memset(mctx->mac_ctx_reinit, 0, sizeof (Skein_512_Ctxt_t)); - } 
else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) { + } else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) { if (cksum_provider == PROVIDER_OPENSSL) { HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx)); HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx_reinit)); } else { - memset(mctx->mac_ctx, 0, sizeof (HMAC_SHA256_Context)); - memset(mctx->mac_ctx_reinit, 0, sizeof (HMAC_SHA256_Context)); + memset(mctx->mac_ctx, 0, sizeof (HMAC_SHA512_Context)); + memset(mctx->mac_ctx_reinit, 0, sizeof (HMAC_SHA512_Context)); } - } else if (cksum == CKSUM_SHA512) { - HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx)); - HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx_reinit)); - } else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) { memset(mctx->mac_ctx, 0, sizeof (hashState)); memset(mctx->mac_ctx_reinit, 0, sizeof (hashState)); diff --git a/crypto/crypto_utils.h b/crypto/crypto_utils.h index 018012c..debb8eb 100644 --- a/crypto/crypto_utils.h +++ b/crypto/crypto_utils.h @@ -33,7 +33,7 @@ extern "C" { #endif #define MAX_PW_LEN 16 -#define CKSUM_MASK 0x800 +#define CKSUM_MASK 0x700 #define CKSUM_MAX_BYTES 64 #define DEFAULT_CKSUM "SKEIN256" diff --git a/crypto/sha2/_hmac.c b/crypto/sha2/_hmac.c new file mode 100644 index 0000000..9f38d22 --- /dev/null +++ b/crypto/sha2/_hmac.c @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2010, 2011 Allan Saddi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +void +HMAC_INIT(HMAC_CONTEXT *ctxt, const void *key, size_t keyLen) +{ + HASH_CONTEXT keyCtxt; + unsigned int i; + uint8_t pkey[HASH_BLOCK_SIZE], okey[HASH_BLOCK_SIZE], ikey[HASH_BLOCK_SIZE]; + + /* Ensure key is zero-padded */ + memset(pkey, 0, sizeof(pkey)); + + if (keyLen > sizeof(pkey)) { + /* Hash key if > HASH_BLOCK_SIZE */ + HASH_INIT(&keyCtxt); + HASH_UPDATE(&keyCtxt, key, keyLen); + HASH_FINAL(&keyCtxt, pkey); + } + else { + memcpy(pkey, key, keyLen); + } + + /* XOR with opad, ipad */ + for (i = 0; i < sizeof(okey); i++) { + okey[i] = pkey[i] ^ 0x5c; + } + for (i = 0; i < sizeof(ikey); i++) { + ikey[i] = pkey[i] ^ 0x36; + } + + /* Initialize hash contexts */ + HASH_INIT(&ctxt->outer); + HASH_UPDATE(&ctxt->outer, okey, sizeof(okey)); + HASH_INIT(&ctxt->inner); + HASH_UPDATE(&ctxt->inner, ikey, sizeof(ikey)); + + /* Burn the stack */ + memset(ikey, 0, sizeof(ikey)); + memset(okey, 0, sizeof(okey)); + memset(pkey, 0, sizeof(pkey)); + memset(&keyCtxt, 0, sizeof(keyCtxt)); +} + +void +HMAC_UPDATE(HMAC_CONTEXT *ctxt, const void *data, size_t len) +{ + HASH_UPDATE(&ctxt->inner, data, len); +} + +void +HMAC_FINAL(HMAC_CONTEXT *ctxt, uint8_t hmac[HASH_SIZE]) +{ + uint8_t ihash[HASH_SIZE]; + + HASH_FINAL(&ctxt->inner, ihash); + HASH_UPDATE(&ctxt->outer, ihash, sizeof(ihash)); + HASH_FINAL(&ctxt->outer, hmac); + + memset(ihash, 0, sizeof(ihash)); +} diff --git a/crypto/sha2/intel/sha256_avx1.asm b/crypto/sha2/intel/sha256_avx1.asm deleted file mode 100644 index 694dc6a..0000000 --- a/crypto/sha2/intel/sha256_avx1.asm +++ /dev/null @@ -1,577 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright 2012 Intel Corporation All Rights Reserved. -; -; The source code contained or described herein and all documents -; related to the source code ("Material") are owned by Intel Corporation -; or its suppliers or licensors. Title to the Material remains with -; Intel Corporation or its suppliers and licensors. The Material may -; contain trade secrets and proprietary and confidential information of -; Intel Corporation and its suppliers and licensors, and is protected by -; worldwide copyright and trade secret laws and treaty provisions. No -; part of the Material may be used, copied, reproduced, modified, -; published, uploaded, posted, transmitted, distributed, or disclosed in -; any way without Intel's prior express written permission. -; -; No license under any patent, copyright, trade secret or other -; intellectual property right is granted to or conferred upon you by -; disclosure or delivery of the Materials, either expressly, by -; implication, inducement, estoppel or otherwise. Any license under such -; intellectual property rights must be express and approved by Intel in -; writing. -; -; Unless otherwise agreed by Intel in writing, you may not remove or -; alter this notice or any other notice embedded in Materials by Intel -; or Intel's suppliers or licensors in any way. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; Example YASM command lines: -; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm -; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm -; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; This code is described in an Intel White-Paper: -; "Fast SHA-256 Implementations on Intel Architecture Processors" -; -; To find it, surf to http://www.intel.com/p/en_US/embedded -; and search for that title. 
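Editorial note: the new crypto/sha2/_hmac.c above is a hash-agnostic HMAC (RFC 2104) template. A key longer than the hash block size is first hashed down, the zero-padded key is XORed with the 0x5c/0x36 opad/ipad bytes, and two running contexts (outer and inner) are kept so the MAC can be finalized without re-keying. The template expects the including file to define the HMAC_*/HASH_* names. A minimal sketch of how it might be instantiated for SHA-512 follows; only HMAC_SHA512_Context and the opt_HMAC_SHA512_*/opt_SHA512_* entry points actually appear in this patch, so the #define wiring below is an assumption, not the real file.

/* Hypothetical instantiation of the _hmac.c template for SHA-512.
 * The macro mapping is illustrative; crypto/sha2/sha512.h is assumed to
 * declare SHA512_Context, HMAC_SHA512_Context and the opt_* functions. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "sha512.h"

#define HMAC_CONTEXT    HMAC_SHA512_Context
#define HASH_CONTEXT    SHA512_Context
#define HASH_SIZE       64            /* SHA-512 digest bytes */
#define HASH_BLOCK_SIZE 128           /* SHA-512 block bytes  */
#define HMAC_INIT       opt_HMAC_SHA512_Init
#define HMAC_UPDATE     opt_HMAC_SHA512_Update
#define HMAC_FINAL      opt_HMAC_SHA512_Final
#define HASH_INIT       opt_SHA512_Init
#define HASH_UPDATE     opt_SHA512_Update
#define HASH_FINAL      opt_SHA512_Final

#include "_hmac.c"

Note how hmac_init() in crypto_utils.c keeps a second, pristine copy of each context so hmac_reinit() can restart the MAC with a plain memcpy instead of re-deriving the key.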
-; The paper is expected to be released roughly at the end of April, 2012 -; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; This code schedules 1 blocks at a time, with 4 lanes per block -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define VMOVDQ vmovdqu ;; assume buffers not aligned - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros - -; addm [mem], reg -; Add reg to mem using reg-mem add and store -%macro addm 2 - add %2, %1 - mov %1, %2 -%endm - -%macro MY_ROR 2 - shld %1,%1,(32-(%2)) -%endm - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -; Load xmm with mem and byte swap each dword -%macro COPY_XMM_AND_BSWAP 3 - VMOVDQ %1, %2 - vpshufb %1, %1, %3 -%endmacro - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define X0 xmm4 -%define X1 xmm5 -%define X2 xmm6 -%define X3 xmm7 - -%define XTMP0 xmm0 -%define XTMP1 xmm1 -%define XTMP2 xmm2 -%define XTMP3 xmm3 -%define XTMP4 xmm8 -%define XFER xmm9 -%define XTMP5 xmm11 - -%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA -%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00 -%define BYTE_FLIP_MASK xmm13 - -%ifdef LINUX -%define NUM_BLKS rdx ; 3rd arg -%define CTX rsi ; 2nd arg -%define INP rdi ; 1st arg - -%define SRND rdi ; clobbers INP -%define c ecx -%define d r8d -%define e edx -%else -%define NUM_BLKS r8 ; 3rd arg -%define CTX rdx ; 2nd arg -%define INP rcx ; 1st arg - -%define SRND rcx ; clobbers INP -%define c edi -%define d esi -%define e r8d - -%endif -%define TBL rbp -%define a eax -%define b ebx - -%define f r9d -%define g r10d -%define h r11d - -%define y0 r13d -%define y1 r14d -%define y2 r15d - - -_INP_END_SIZE equ 8 -_INP_SIZE equ 8 -_XFER_SIZE equ 8 -%ifdef LINUX -_XMM_SAVE_SIZE equ 0 -%else -_XMM_SAVE_SIZE equ 8*16 -%endif -; STACK_SIZE plus pushes must be an odd multiple of 8 -_ALIGN_SIZE equ 8 - -_INP_END equ 0 -_INP equ _INP_END + _INP_END_SIZE -_XFER equ _INP + _INP_SIZE -_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE -STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE - -; rotate_Xs -; Rotate values of symbols X0...X3 -%macro rotate_Xs 0 -%xdefine X_ X0 -%xdefine X0 X1 -%xdefine X1 X2 -%xdefine X2 X3 -%xdefine X3 X_ -%endm - -; ROTATE_ARGS -; Rotate values of symbols a...h -%macro ROTATE_ARGS 0 -%xdefine TMP_ h -%xdefine h g -%xdefine g f -%xdefine f e -%xdefine e d -%xdefine d c -%xdefine c b -%xdefine b a -%xdefine a TMP_ -%endm - -%macro FOUR_ROUNDS_AND_SCHED 0 - ;; compute s0 four at a time and s1 two at a time - ;; compute W[-16] + W[-7] 4 at a time - ;vmovdqa XTMP0, X3 - mov y0, e ; y0 = e - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7] - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - ;vmovdqa XTMP1, X1 - xor y1, a ; y1 = a ^ (a >> (22-13) - xor y2, g ; y2 = f^g - vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16] - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - ;; compute s0 - vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15] - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - - - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH - - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - - vpsrld XTMP2, 
XTMP1, 7 - - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - - vpslld XTMP3, XTMP1, (32-7) - - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - - vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 - - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - - mov y0, e ; y0 = e - mov y1, a ; y1 = a - - - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - - vpsrld XTMP2, XTMP1,18 - - xor y1, a ; y1 = a ^ (a >> (22-13) - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor y2, g ; y2 = f^g - - vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3 - - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - - vpslld XTMP1, XTMP1, (32-18) - - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor y2, g ; y2 = CH = ((f^g)&e)^g - - vpxor XTMP3, XTMP3, XTMP1 - - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - - vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 - - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - - vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0 - - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - ;; compute low s1 - vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} - - mov y0, e ; y0 = e - mov y1, a ; y1 = a - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - - ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} - - xor y0, e ; y0 = e ^ (e >> (25-11)) - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - mov y2, f ; y2 = f - xor y1, a ; y1 = a ^ (a >> (22-13) - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - - vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA} - - xor y2, g ; y2 = f^g - - vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA} - - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - - vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA} - - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor y2, g ; y2 = CH = ((f^g)&e)^g - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpxor XTMP2, XTMP2, XTMP3 - add y2, y0 ; y2 = S1 + CH - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH - vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - ;; compute high s1 - vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} - mov y0, e ; y0 = e - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - mov y1, 
a ; y1 = a - ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC} - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - - vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC} - - xor y1, a ; y1 = a ^ (a >> (22-13) - xor y2, g ; y2 = f^g - - vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC} - - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - - vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC} - - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - - vpxor XTMP2, XTMP2, XTMP3 - - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH - vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC} - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00} - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS -rotate_Xs -%endm - -;; input is [rsp + _XFER + %1 * 4] -%macro DO_ROUND 1 - mov y0, e ; y0 = e - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - xor y0, e ; y0 = e ^ (e >> (25-11)) - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - mov y2, f ; y2 = f - xor y1, a ; y1 = a ^ (a >> (22-13) - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor y2, g ; y2 = f^g - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - and y2, e ; y2 = (f^g)&e - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - add y2, y0 ; y2 = S1 + CH - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -%endm - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) -;; arg 1 : pointer to input data -;; arg 2 : pointer to digest -;; arg 3 : Num blocks -section .text -global sha256_avx -align 32 -sha256_avx: - push rbx -%ifndef LINUX - push rsi - push rdi -%endif - push rbp - push r13 - push r14 - push r15 - - sub rsp,STACK_SIZE -%ifndef LINUX - vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 - vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 - vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 - vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 - vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 - vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 - vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 - vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 -%endif - - shl NUM_BLKS, 6 ; convert to bytes - jz done_hash - add NUM_BLKS, INP ; pointer to end of data - mov [rsp + _INP_END], NUM_BLKS - - ;; load initial digest - mov a,[4*0 
+ CTX] - mov b,[4*1 + CTX] - mov c,[4*2 + CTX] - mov d,[4*3 + CTX] - mov e,[4*4 + CTX] - mov f,[4*5 + CTX] - mov g,[4*6 + CTX] - mov h,[4*7 + CTX] - - vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] - vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip] - vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip] - -loop0: - lea TBL,[K256 wrt rip] - - ;; byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK - - mov [rsp + _INP], INP - - ;; schedule 48 input dwords, by doing 3 rounds of 16 each - mov SRND, 3 -align 16 -loop1: - vpaddd XFER, X0, [TBL + 0*16] - vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - vpaddd XFER, X0, [TBL + 1*16] - vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - vpaddd XFER, X0, [TBL + 2*16] - vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - vpaddd XFER, X0, [TBL + 3*16] - vmovdqa [rsp + _XFER], XFER - add TBL, 4*16 - FOUR_ROUNDS_AND_SCHED - - sub SRND, 1 - jne loop1 - - mov SRND, 2 -loop2: - vpaddd XFER, X0, [TBL + 0*16] - vmovdqa [rsp + _XFER], XFER - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vpaddd XFER, X1, [TBL + 1*16] - vmovdqa [rsp + _XFER], XFER - add TBL, 2*16 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vmovdqa X0, X2 - vmovdqa X1, X3 - - sub SRND, 1 - jne loop2 - - - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h - - mov INP, [rsp + _INP] - add INP, 64 - cmp INP, [rsp + _INP_END] - jne loop0 - -done_hash: -%ifndef LINUX - vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] - vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] - vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] - vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] - vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] - vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] - vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] - vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] -%endif - - - add rsp, STACK_SIZE - - pop r15 - pop r14 - pop r13 - pop rbp -%ifndef LINUX - pop rdi - pop rsi -%endif - pop rbx - - ret - - -section .data -align 64 -K256: - dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203 - -; shuffle xBxA -> 00BA -_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -; shuffle xDxC -> DC00 -_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/crypto/sha2/intel/sha256_sse4.asm b/crypto/sha2/intel/sha256_sse4.asm deleted file mode 100644 index aa570e0..0000000 --- a/crypto/sha2/intel/sha256_sse4.asm +++ /dev/null @@ -1,535 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright 2012 Intel Corporation All Rights Reserved. 
-; -; The source code contained or described herein and all documents -; related to the source code ("Material") are owned by Intel Corporation -; or its suppliers or licensors. Title to the Material remains with -; Intel Corporation or its suppliers and licensors. The Material may -; contain trade secrets and proprietary and confidential information of -; Intel Corporation and its suppliers and licensors, and is protected by -; worldwide copyright and trade secret laws and treaty provisions. No -; part of the Material may be used, copied, reproduced, modified, -; published, uploaded, posted, transmitted, distributed, or disclosed in -; any way without Intel's prior express written permission. -; -; No license under any patent, copyright, trade secret or other -; intellectual property right is granted to or conferred upon you by -; disclosure or delivery of the Materials, either expressly, by -; implication, inducement, estoppel or otherwise. Any license under such -; intellectual property rights must be express and approved by Intel in -; writing. -; -; Unless otherwise agreed by Intel in writing, you may not remove or -; alter this notice or any other notice embedded in Materials by Intel -; or Intel's suppliers or licensors in any way. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; Example YASM command lines: -; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm -; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm -; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; This code is described in an Intel White-Paper: -; "Fast SHA-256 Implementations on Intel Architecture Processors" -; -; To find it, surf to http://www.intel.com/p/en_US/embedded -; and search for that title. 
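Editorial note: both deleted files (sha256_avx1.asm above, sha256_sse4.asm here) implement the standard FIPS 180-4 SHA-256 round that their DO_ROUND comments spell out. For reference, one round in portable C; rotr32() and sha256_round() are local helpers for illustration, not taken from these sources.

/* One SHA-256 round, mirroring the DO_ROUND comments. */
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n) {
    return (x >> n) | (x << (32 - n));
}

static inline void sha256_round(uint32_t s[8], uint32_t k_plus_w) {
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
    uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    uint32_t CH  = ((f ^ g) & e) ^ g;        /* == (e & f) ^ (~e & g) */
    uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    uint32_t MAJ = ((a | c) & b) | (a & c);  /* majority(a, b, c) */
    uint32_t T1  = h + S1 + CH + k_plus_w;   /* k_plus_w = K[t] + W[t] */

    s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + S0 + MAJ;
}

The assembly interleaves four such rounds with the vectorized message schedule (FOUR_ROUNDS_AND_SCHED) to hide instruction latency.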
-; The paper is expected to be released roughly at the end of April, 2012 -; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; This code schedules 1 blocks at a time, with 4 lanes per block -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define MOVDQ movdqu ;; assume buffers not aligned - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros - -; addm [mem], reg -; Add reg to mem using reg-mem add and store -%macro addm 2 - add %2, %1 - mov %1, %2 -%endm - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -; Load xmm with mem and byte swap each dword -%macro COPY_XMM_AND_BSWAP 3 - MOVDQ %1, %2 - pshufb %1, %3 -%endmacro - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define X0 xmm4 -%define X1 xmm5 -%define X2 xmm6 -%define X3 xmm7 - -%define XTMP0 xmm0 -%define XTMP1 xmm1 -%define XTMP2 xmm2 -%define XTMP3 xmm3 -%define XTMP4 xmm8 -%define XFER xmm9 - -%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA -%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00 -%define BYTE_FLIP_MASK xmm12 - -%ifdef LINUX -%define NUM_BLKS rdx ; 3rd arg -%define CTX rsi ; 2nd arg -%define INP rdi ; 1st arg - -%define SRND rdi ; clobbers INP -%define c ecx -%define d r8d -%define e edx -%else -%define NUM_BLKS r8 ; 3rd arg -%define CTX rdx ; 2nd arg -%define INP rcx ; 1st arg - -%define SRND rcx ; clobbers INP -%define c edi -%define d esi -%define e r8d - -%endif -%define TBL rbp -%define a eax -%define b ebx - -%define f r9d -%define g r10d -%define h r11d - -%define y0 r13d -%define y1 r14d -%define y2 r15d - - - -_INP_END_SIZE equ 8 -_INP_SIZE equ 8 -_XFER_SIZE equ 8 -%ifdef LINUX -_XMM_SAVE_SIZE equ 0 -%else -_XMM_SAVE_SIZE equ 7*16 -%endif -; STACK_SIZE plus pushes must be an odd multiple of 8 -_ALIGN_SIZE equ 8 - -_INP_END equ 0 -_INP equ _INP_END + _INP_END_SIZE -_XFER equ _INP + _INP_SIZE -_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE -STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE - -; rotate_Xs -; Rotate values of symbols X0...X3 -%macro rotate_Xs 0 -%xdefine X_ X0 -%xdefine X0 X1 -%xdefine X1 X2 -%xdefine X2 X3 -%xdefine X3 X_ -%endm - -; ROTATE_ARGS -; Rotate values of symbols a...h -%macro ROTATE_ARGS 0 -%xdefine TMP_ h -%xdefine h g -%xdefine g f -%xdefine f e -%xdefine e d -%xdefine d c -%xdefine c b -%xdefine b a -%xdefine a TMP_ -%endm - -%macro FOUR_ROUNDS_AND_SCHED 0 - ;; compute s0 four at a time and s1 two at a time - ;; compute W[-16] + W[-7] 4 at a time - movdqa XTMP0, X3 - mov y0, e ; y0 = e - ror y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - palignr XTMP0, X2, 4 ; XTMP0 = W[-7] - ror y1, (22-13) ; y1 = a >> (22-13) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - movdqa XTMP1, X1 - xor y1, a ; y1 = a ^ (a >> (22-13) - xor y2, g ; y2 = f^g - paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16] - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - ;; compute s0 - palignr XTMP1, X0, 4 ; XTMP1 = W[-15] - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH - movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - pslld XTMP1, (32-7) - or y0, c ; y0 = a|c - add d, h ; d = 
d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - psrld XTMP2, 7 - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] - mov y0, e ; y0 = e - mov y1, a ; y1 = a - movdqa XTMP4, XTMP3 ; XTMP4 = W[-15] - ror y0, (25-11) ; y0 = e >> (25-11) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - ror y1, (22-13) ; y1 = a >> (22-13) - pslld XTMP3, (32-18) - xor y1, a ; y1 = a ^ (a >> (22-13) - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor y2, g ; y2 = f^g - psrld XTMP2, 18 - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP1, XTMP3 - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor y2, g ; y2 = CH = ((f^g)&e)^g - psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3 - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - pxor XTMP1, XTMP4 ; XTMP1 = s0 - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - ;; compute low s1 - pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} - mov y0, e ; y0 = e - mov y1, a ; y1 = a - ror y0, (25-11) ; y0 = e >> (25-11) - movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} - xor y0, e ; y0 = e ^ (e >> (25-11)) - ror y1, (22-13) ; y1 = a >> (22-13) - mov y2, f ; y2 = f - xor y1, a ; y1 = a ^ (a >> (22-13) - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} - xor y2, g ; y2 = f^g - psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA} - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA} - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor y2, g ; y2 = CH = ((f^g)&e)^g - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP2, XTMP3 - add y2, y0 ; y2 = S1 + CH - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH - pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - ;; compute high s1 - pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} - mov y0, e ; y0 = e - ror y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - movdqa X0, XTMP2 ; X0 = W[-2] {DDCC} - ror y1, (22-13) ; y1 = a >> (22-13) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq XTMP2, 17 ; 
XTMP2 = W[-2] ror 17 {xDxC} - xor y1, a ; y1 = a ^ (a >> (22-13) - xor y2, g ; y2 = f^g - psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC} - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC} - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - pxor XTMP2, XTMP3 - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH - pxor X0, XTMP2 ; X0 = s1 {xDxC} - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - pshufb X0, SHUF_DC00 ; X0 = s1 {DC00} - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS -rotate_Xs -%endm - -;; input is [rsp + _XFER + %1 * 4] -%macro DO_ROUND 1 - mov y0, e ; y0 = e - ror y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - xor y0, e ; y0 = e ^ (e >> (25-11)) - ror y1, (22-13) ; y1 = a >> (22-13) - mov y2, f ; y2 = f - xor y1, a ; y1 = a ^ (a >> (22-13) - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor y2, g ; y2 = f^g - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - and y2, e ; y2 = (f^g)&e - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - add y2, y0 ; y2 = S1 + CH - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -%endm - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks) -;; arg 1 : pointer to input data -;; arg 2 : pointer to digest -;; arg 3 : Num blocks -section .text -global sha256_sse4 -align 32 -sha256_sse4: - push rbx -%ifndef LINUX - push rsi - push rdi -%endif - push rbp - push r13 - push r14 - push r15 - - sub rsp,STACK_SIZE -%ifndef LINUX - movdqa [rsp + _XMM_SAVE + 0*16],xmm6 - movdqa [rsp + _XMM_SAVE + 1*16],xmm7 - movdqa [rsp + _XMM_SAVE + 2*16],xmm8 - movdqa [rsp + _XMM_SAVE + 3*16],xmm9 - movdqa [rsp + _XMM_SAVE + 4*16],xmm10 - movdqa [rsp + _XMM_SAVE + 5*16],xmm11 - movdqa [rsp + _XMM_SAVE + 6*16],xmm12 -%endif - - shl NUM_BLKS, 6 ; convert to bytes - jz done_hash - add NUM_BLKS, INP ; pointer to end of data - mov [rsp + _INP_END], NUM_BLKS - - ;; load initial digest - mov a,[4*0 + CTX] - mov b,[4*1 + CTX] - mov c,[4*2 + CTX] - mov d,[4*3 + CTX] - mov e,[4*4 + CTX] - mov f,[4*5 + CTX] - mov g,[4*6 + CTX] - mov h,[4*7 + CTX] - - movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] - movdqa SHUF_00BA, [_SHUF_00BA wrt rip] - movdqa SHUF_DC00, [_SHUF_DC00 wrt rip] - -loop0: - lea TBL,[K256 wrt rip] - - ;; byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, [INP + 0*16], 
BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK - - mov [rsp + _INP], INP - - ;; schedule 48 input dwords, by doing 3 rounds of 16 each - mov SRND, 3 -align 16 -loop1: - movdqa XFER, [TBL + 0*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - movdqa XFER, [TBL + 1*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - movdqa XFER, [TBL + 2*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - movdqa XFER, [TBL + 3*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - add TBL, 4*16 - FOUR_ROUNDS_AND_SCHED - - sub SRND, 1 - jne loop1 - - mov SRND, 2 -loop2: - paddd X0, [TBL + 0*16] - movdqa [rsp + _XFER], X0 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - paddd X1, [TBL + 1*16] - movdqa [rsp + _XFER], X1 - add TBL, 2*16 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - movdqa X0, X2 - movdqa X1, X3 - - sub SRND, 1 - jne loop2 - - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h - - mov INP, [rsp + _INP] - add INP, 64 - cmp INP, [rsp + _INP_END] - jne loop0 - -done_hash: -%ifndef LINUX - movdqa xmm6,[rsp + _XMM_SAVE + 0*16] - movdqa xmm7,[rsp + _XMM_SAVE + 1*16] - movdqa xmm8,[rsp + _XMM_SAVE + 2*16] - movdqa xmm9,[rsp + _XMM_SAVE + 3*16] - movdqa xmm10,[rsp + _XMM_SAVE + 4*16] - movdqa xmm11,[rsp + _XMM_SAVE + 5*16] - movdqa xmm12,[rsp + _XMM_SAVE + 6*16] -%endif - - add rsp, STACK_SIZE - - pop r15 - pop r14 - pop r13 - pop rbp -%ifndef LINUX - pop rdi - pop rsi -%endif - pop rbx - - ret - - -section .data -align 64 -K256: - dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203 - -; shuffle xBxA -> 00BA -_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -; shuffle xDxC -> DC00 -_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/crypto/sha2/intel/sha512_avx.asm b/crypto/sha2/intel/sha512_avx.asm new file mode 100644 index 0000000..2344844 --- /dev/null +++ b/crypto/sha2/intel/sha512_avx.asm @@ -0,0 +1,409 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright 2012 Intel Corporation All Rights Reserved. +; +; The source code contained or described herein and all documents +; related to the source code ("Material") are owned by Intel Corporation +; or its suppliers or licensors. Title to the Material remains with +; Intel Corporation or its suppliers and licensors. 
The Material may +; contain trade secrets and proprietary and confidential information of +; Intel Corporation and its suppliers and licensors, and is protected by +; worldwide copyright and trade secret laws and treaty provisions. No +; part of the Material may be used, copied, reproduced, modified, +; published, uploaded, posted, transmitted, distributed, or disclosed in +; any way without Intel's prior express written permission. +; +; No license under any patent, copyright, trade secret or other +; intellectual property right is granted to or conferred upon you by +; disclosure or delivery of the Materials, either expressly, by +; implication, inducement, estoppel or otherwise. Any license under such +; intellectual property rights must be express and approved by Intel in +; writing. +; +; Unless otherwise agreed by Intel in writing, you may not remove or +; alter this notice or any other notice embedded in Materials by Intel +; or Intel's suppliers or licensors in any way. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: yasm -f x64 -D WINABI sha512_avx.asm +; Linux: yasm -f elf64 sha512_avx.asm +; + +BITS 64 +section .text + +; Virtual Registers +%ifdef WINABI + %define msg rcx ; ARG1 + %define digest rdx ; ARG2 + %define msglen r8 ; ARG3 + %define T1 rsi + %define T2 rdi +%else + %define msg rdi ; ARG1 + %define digest rsi ; ARG2 + %define msglen rdx ; ARG3 + %define T1 rcx + %define T2 r8 +%endif +%define a_64 r9 +%define b_64 r10 +%define c_64 r11 +%define d_64 r12 +%define e_64 r13 +%define f_64 r14 +%define g_64 r15 +%define h_64 rbx +%define tmp0 rax + +; Local variables (stack frame) +; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP +struc frame + .W: resq 80 ; Message Schedule + .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1] + +%ifdef WINABI + .XMMSAVE: resdq 4 + .GPRSAVE: resq 7 +%else + .GPRSAVE: resq 5 +%endif +endstruc + +; Useful QWORD "arrays" for simpler memory references +%define MSG(i) msg + 8*(i) ; Input message (arg1) +%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2) +%define K_t(i) K512 + 8*(i) wrt rip ; SHA Constants (static mem) +%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame) +%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame) +; MSG, DIGEST, K_t, W_t are arrays +; WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even + +%macro RotateState 0 + ; Rotate symbles a..h right + %xdefine %%TMP h_64 + %xdefine h_64 g_64 + %xdefine g_64 f_64 + %xdefine f_64 e_64 + %xdefine e_64 d_64 + %xdefine d_64 c_64 + %xdefine c_64 b_64 + %xdefine b_64 a_64 + %xdefine a_64 %%TMP +%endmacro + +%macro RORQ 2 + ; shld is faster than ror on Sandybridge + shld %1, %1, (64 - %2) +%endmacro + +%macro SHA512_Round 1 +%assign %%t (%1) + + ; Compute Round %%t + mov T1, f_64 ; T1 = f + mov tmp0, e_64 ; tmp = e + xor T1, g_64 ; T1 = f ^ g + RORQ tmp0, 23 ; 41 ; tmp = e ror 23 + and T1, e_64 ; T1 = (f ^ g) & e + xor tmp0, e_64 ; tmp = (e ror 23) ^ e + xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler + RORQ tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4 + xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e + mov T2, a_64 ; T2 = a + add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h + RORQ tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov tmp0, a_64 ; tmp = a + xor T2, c_64 ; T2 = a ^ c + and tmp0, c_64 ; tmp = a & c + 
and T2, b_64 ; T2 = (a ^ c) & b + xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov tmp0, a_64 ; tmp = a + RORQ tmp0, 5 ; 39 ; tmp = a ror 5 + xor tmp0, a_64 ; tmp = (a ror 5) ^ a + add d_64, T1 ; e(next_state) = d + T1 + RORQ tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6 + xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a + lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c) + RORQ tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +%endmacro + +%macro SHA512_2Sched_2Round_avx 1 +%assign %%t %1 + ; Compute rounds %%t-2 and %%t-1 + ; Compute message schedule QWORDS %%t and %%t+1 + + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message + ; scheduler. + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + ; They are then added to their respective SHA512 constants at + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + ; For brievity, the comments following vectored instructions only refer to + ; the first of a pair of QWORDS. + ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} + ; The computation of the message schedule and the rounds are tightly + ; stitched to take advantage of instruction-level parallelism. + ; For clarity, integer instructions (for the rounds calculation) are indented + ; by one tab. Vectored instructions (for the message scheduler) are indented + ; by two tabs. + + vmovdqa xmm4, [W_t(%%t-2)] ; XMM4 = W[t-2] + vmovdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15] + mov T1, f_64 + vpsrlq xmm0, xmm4, 61 ; XMM0 = W[t-2]>>61 + mov tmp0, e_64 + vpsrlq xmm6, xmm5, 1 ; XMM6 = W[t-15]>>1 + xor T1, g_64 + RORQ tmp0, 23 ; 41 + vpsrlq xmm1, xmm4, 19 ; XMM1 = W[t-2]>>19 + and T1, e_64 + xor tmp0, e_64 + vpxor xmm0, xmm1 ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19 + xor T1, g_64 + add T1, [WK_2(%%t)]; + vpsrlq xmm7, xmm5, 8 ; XMM7 = W[t-15]>>8 + RORQ tmp0, 4 ; 18 + vpsrlq xmm2, xmm4, 6 ; XMM2 = W[t-2]>>6 + xor tmp0, e_64 + mov T2, a_64 + add T1, h_64 + vpxor xmm6, xmm7 ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8 + RORQ tmp0, 14 ; 14 + add T1, tmp0 + vpsrlq xmm8, xmm5, 7 ; XMM8 = W[t-15]>>7 + mov tmp0, a_64 + xor T2, c_64 + vpsllq xmm3, xmm4, (64-61) ; XMM3 = W[t-2]<<3 + and tmp0, c_64 + and T2, b_64 + vpxor xmm2, xmm3 ; XMM2 = W[t-2]>>6 ^ W[t-2]<<3 + xor T2, tmp0 + mov tmp0, a_64 + vpsllq xmm9, xmm5, (64-1) ; XMM9 = W[t-15]<<63 + RORQ tmp0, 5 ; 39 + vpxor xmm8, xmm9 ; XMM8 = W[t-15]>>7 ^ W[t-15]<<63 + xor tmp0, a_64 + add d_64, T1 + RORQ tmp0, 6 ; 34 + xor tmp0, a_64 + vpxor xmm6, xmm8 ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 + lea h_64, [T1 + T2] + RORQ tmp0, 28 ; 28 + vpsllq xmm4, (64-19) ; XMM4 = W[t-2]<<25 + add h_64, tmp0 + RotateState + vpxor xmm0, xmm4 ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 + mov T1, f_64 + vpxor xmm0, xmm2 ; XMM0 = s1(W[t-2]) + mov tmp0, e_64 + xor T1, g_64 + vpaddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + W[t-16] + vmovdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7] + RORQ tmp0, 23 ; 41 + and T1, e_64 + xor tmp0, e_64 + xor T1, g_64 + vpsllq xmm5, (64-8) ; XMM5 = W[t-15]<<56 + add T1, [WK_2(%%t+1)] + vpxor xmm6, xmm5 ; XMM6 = s0(W[t-15]) + RORQ tmp0, 4 ; 18 + vpaddq xmm0, xmm6 ; XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) + xor tmp0, e_64 + vpaddq xmm0, xmm1 ; XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + mov T2, a_64 + add T1, h_64 + RORQ tmp0, 14 ; 14 + add T1, tmp0 + vmovdqa [W_t(%%t)], xmm0 ; Store W[t] + vpaddq xmm0, [K_t(t)] ; Compute 
W[t]+K[t] + vmovdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] for next rounds + mov tmp0, a_64 + xor T2, c_64 + and tmp0, c_64 + and T2, b_64 + xor T2, tmp0 + mov tmp0, a_64 + RORQ tmp0, 5 ; 39 + xor tmp0, a_64 + add d_64, T1 + RORQ tmp0, 6 ; 34 + xor tmp0, a_64 + lea h_64, [T1 + T2] + RORQ tmp0, 28 ; 28 + add h_64, tmp0 + RotateState +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_avx(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. +; L is the message length in SHA512 blocks +global sha512_avx:function +sha512_avx: + cmp msglen, 0 + je .nowork + + ; Allocate Stack Space + sub rsp, frame_size + + ; Save GPRs + mov [rsp + frame.GPRSAVE + 8 * 0], rbx + mov [rsp + frame.GPRSAVE + 8 * 1], r12 + mov [rsp + frame.GPRSAVE + 8 * 2], r13 + mov [rsp + frame.GPRSAVE + 8 * 3], r14 + mov [rsp + frame.GPRSAVE + 8 * 4], r15 +%ifdef WINABI + mov [rsp + frame.GPRSAVE + 8 * 5], rsi + mov [rsp + frame.GPRSAVE + 8 * 6], rdi +%endif + ; Save XMMs +%ifdef WINABI + vmovdqa [rsp + frame.XMMSAVE + 16 * 0], xmm6 + vmovdqa [rsp + frame.XMMSAVE + 16 * 1], xmm7 + vmovdqa [rsp + frame.XMMSAVE + 16 * 2], xmm8 + vmovdqa [rsp + frame.XMMSAVE + 16 * 3], xmm9 +%endif + +.updateblock: + + ; Load state variables + mov a_64, [DIGEST(0)] + mov b_64, [DIGEST(1)] + mov c_64, [DIGEST(2)] + mov d_64, [DIGEST(3)] + mov e_64, [DIGEST(4)] + mov f_64, [DIGEST(5)] + mov g_64, [DIGEST(6)] + mov h_64, [DIGEST(7)] + + %assign t 0 + %rep 80/2 + 1 + ; (80 rounds) / (2 rounds/iteration) + (1 iteration) + ; +1 iteration because the scheduler leads hashing by 1 iteration + %if t < 2 + ; BSWAP 2 QWORDS + vmovdqa xmm1, [XMM_QWORD_BSWAP wrt rip] + vmovdqu xmm0, [MSG(t)] + vpshufb xmm0, xmm0, xmm1 ; BSWAP + vmovdqa [W_t(t)], xmm0 ; Store Scheduled Pair + vpaddq xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t] + vmovdqa [WK_2(t)], xmm0 ; Store into WK for rounds + %elif t < 16 + ; BSWAP 2 QWORDS, Compute 2 Rounds + vmovdqu xmm0, [MSG(t)] + vpshufb xmm0, xmm0, xmm1 ; BSWAP + SHA512_Round t - 2 ; Round t-2 + vmovdqa [W_t(t)], xmm0 ; Store Scheduled Pair + vpaddq xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t] + SHA512_Round t - 1 ; Round t-1 + vmovdqa [WK_2(t)], xmm0 ; W[t]+K[t] into WK + %elif t < 79 + ; Schedule 2 QWORDS; Compute 2 Rounds + SHA512_2Sched_2Round_avx t + %else + ; Compute 2 Rounds + SHA512_Round t - 2 + SHA512_Round t - 1 + %endif + %assign t t+2 + %endrep + + ; Update digest + add [DIGEST(0)], a_64 + add [DIGEST(1)], b_64 + add [DIGEST(2)], c_64 + add [DIGEST(3)], d_64 + add [DIGEST(4)], e_64 + add [DIGEST(5)], f_64 + add [DIGEST(6)], g_64 + add [DIGEST(7)], h_64 + + ; Advance to next message block + add msg, 16*8 + dec msglen + jnz .updateblock + + ; Restore XMMs +%ifdef WINABI + vmovdqa xmm6, [rsp + frame.XMMSAVE + 16 * 0] + vmovdqa xmm7, [rsp + frame.XMMSAVE + 16 * 1] + vmovdqa xmm8, [rsp + frame.XMMSAVE + 16 * 2] + vmovdqa xmm9, [rsp + frame.XMMSAVE + 16 * 3] +%endif + ; Restore GPRs + mov rbx, [rsp + frame.GPRSAVE + 8 * 0] + mov r12, [rsp + frame.GPRSAVE + 8 * 1] + mov r13, [rsp + frame.GPRSAVE + 8 * 2] + mov r14, [rsp + frame.GPRSAVE + 8 * 3] + mov r15, [rsp + frame.GPRSAVE + 8 * 4] +%ifdef WINABI + mov rsi, [rsp + frame.GPRSAVE + 8 * 5] + mov rdi, [rsp + frame.GPRSAVE + 8 * 6] +%endif + ; Restore Stack Pointer + add rsp, frame_size + +.nowork: + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	ddq 0x08090a0b0c0d0e0f0001020304050607
+
+; K[t] used in SHA512 hashing
+K512:
+	dq 0x428a2f98d728ae22,0x7137449123ef65cd
+	dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	dq 0x3956c25bf348b538,0x59f111f1b605d019
+	dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	dq 0xd807aa98a3030242,0x12835b0145706fbe
+	dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	dq 0x9bdc06a725c71235,0xc19bf174cf692694
+	dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	dq 0x983e5152ee66dfab,0xa831c66d2db43210
+	dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	dq 0x06ca6351e003826f,0x142929670a0e6e70
+	dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+	dq 0x81c2c92e47edaee6,0x92722c851482353b
+	dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+	dq 0xd192e819d6ef5218,0xd69906245565a910
+	dq 0xf40e35855771202a,0x106aa07032bbd1b8
+	dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+	dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	dq 0x90befffa23631e28,0xa4506cebde82bde9
+	dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	dq 0xca273eceea26619c,0xd186b8c721c0c207
+	dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	dq 0x113f9804bef90dae,0x1b710b35131c471b
+	dq 0x28db77f523047d84,0x32caab7b40c72493
+	dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
diff --git a/crypto/sha2/intel/sha512_sse4.asm b/crypto/sha2/intel/sha512_sse4.asm
new file mode 100644
index 0000000..85a154c
--- /dev/null
+++ b/crypto/sha2/intel/sha512_sse4.asm
@@ -0,0 +1,398 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright 2012 Intel Corporation All Rights Reserved.
+;
+; The source code contained or described herein and all documents
+; related to the source code ("Material") are owned by Intel Corporation
+; or its suppliers or licensors. Title to the Material remains with
+; Intel Corporation or its suppliers and licensors. The Material may
+; contain trade secrets and proprietary and confidential information of
+; Intel Corporation and its suppliers and licensors, and is protected by
+; worldwide copyright and trade secret laws and treaty provisions. No
+; part of the Material may be used, copied, reproduced, modified,
+; published, uploaded, posted, transmitted, distributed, or disclosed in
+; any way without Intel's prior express written permission.
+;
+; No license under any patent, copyright, trade secret or other
+; intellectual property right is granted to or conferred upon you by
+; disclosure or delivery of the Materials, either expressly, by
+; implication, inducement, estoppel or otherwise. Any license under such
+; intellectual property rights must be express and approved by Intel in
+; writing.
+;
+; Unless otherwise agreed by Intel in writing, you may not remove or
+; alter this notice or any other notice embedded in Materials by Intel
+; or Intel's suppliers or licensors in any way.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows:  yasm -f x64 -D WINABI sha512_sse4.asm
+; Linux:    yasm -f elf64 sha512_sse4.asm
+;
+; Alternative Example YASM command lines:
+; Windows:  yasm -Xvc -f x64 -D WINABI -rnasm -pnasm -o sha512_sse4.obj -g cv8 sha512_sse4.asm
+; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha512_sse4.o sha512_sse4.asm
+;
+
+BITS 64
+section .text
+
+; Virtual Registers
+%ifdef WINABI
+	%define msg	rcx ; ARG1
+	%define digest	rdx ; ARG2
+	%define msglen	r8  ; ARG3
+	%define T1	rsi
+	%define T2	rdi
+%else
+	%define msg	rdi ; ARG1
+	%define digest	rsi ; ARG2
+	%define msglen	rdx ; ARG3
+	%define T1	rcx
+	%define T2	r8
+%endif
+%define a_64	r9
+%define b_64	r10
+%define c_64	r11
+%define d_64	r12
+%define e_64	r13
+%define f_64	r14
+%define g_64	r15
+%define h_64	rbx
+%define tmp0	rax
+
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
+struc frame
+	.W:		resq 80	; Message Schedule
+	.WK:		resq 2	; W[t] + K[t] | W[t+1] + K[t+1]
+
+%ifdef WINABI
+	.GPRSAVE:	resq 7
+%else
+	.GPRSAVE:	resq 5
+%endif
+endstruc
+
+; Useful QWORD "arrays" for simpler memory references
+%define MSG(i)		msg + 8*(i)		; Input message (arg1)
+%define DIGEST(i)	digest + 8*(i)		; Output Digest (arg2)
+%define K_t(i)		K512 + 8*(i) wrt rip	; SHA Constants (static mem)
+%define W_t(i)		rsp + frame.W + 8*(i)	; Message Schedule (stack frame)
+%define WK_2(i)		rsp + frame.WK + 8*((i) % 2)	; W[t]+K[t] (stack frame)
+; MSG, DIGEST, K_t, W_t are arrays
+; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+%macro RotateState 0
+	; Rotate symbols a..h right
+	%xdefine %%TMP h_64
+	%xdefine h_64 g_64
+	%xdefine g_64 f_64
+	%xdefine f_64 e_64
+	%xdefine e_64 d_64
+	%xdefine d_64 c_64
+	%xdefine c_64 b_64
+	%xdefine b_64 a_64
+	%xdefine a_64 %%TMP
+%endmacro
+
+%macro SHA512_Round 1
+%assign %%t (%1)
+
+	; Compute Round %%t
+	mov	T1, f_64	; T1 = f
+	mov	tmp0, e_64	; tmp = e
+	xor	T1, g_64	; T1 = f ^ g
+	ror	tmp0, 23 ; 41	; tmp = e ror 23
+	and	T1, e_64	; T1 = (f ^ g) & e
+	xor	tmp0, e_64	; tmp = (e ror 23) ^ e
+	xor	T1, g_64	; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	add	T1, [WK_2(%%t)]	; W[t] + K[t] from message scheduler
+	ror	tmp0, 4 ; 18	; tmp = ((e ror 23) ^ e) ror 4
+	xor	tmp0, e_64	; tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov	T2, a_64	; T2 = a
+	add	T1, h_64	; T1 = CH(e,f,g) + W[t] + K[t] + h
+	ror	tmp0, 14 ; 14	; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add	T1, tmp0	; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+	mov	tmp0, a_64	; tmp = a
+	xor	T2, c_64	; T2 = a ^ c
+	and	tmp0, c_64	; tmp = a & c
+	and	T2, b_64	; T2 = (a ^ c) & b
+	xor	T2, tmp0	; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov	tmp0, a_64	; tmp = a
+	ror	tmp0, 5 ; 39	; tmp = a ror 5
+	xor	tmp0, a_64	; tmp = (a ror 5) ^ a
+	add	d_64, T1	; e(next_state) = d + T1
+	ror	tmp0, 6 ; 34	; tmp = ((a ror 5) ^ a) ror 6
+	xor	tmp0, a_64	; tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea	h_64, [T1 + T2]	; a(next_state) = T1 + Maj(a,b,c)
+	ror	tmp0, 28 ; 28	; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	h_64, tmp0	; a(next_state) = T1 + Maj(a,b,c) + S0(a)
+	RotateState
+%endmacro
+
+%macro SHA512_2Sched_2Round_sse 1
+%assign %%t (%1)
+
+	; Compute rounds %%t-2 and %%t-1
+	; Compute message schedule QWORDS %%t and %%t+1
+
+	; Two rounds are computed based on the values for K[t-2]+W[t-2] and
+	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	; scheduler.
+	; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+	; They are then added to their respective SHA512 constants at
+	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	; For brevity, the comments following vectored instructions only refer to
+	; the first of a pair of QWORDS.
+	; E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+	; The computation of the message schedule and the rounds are tightly
+	; stitched to take advantage of instruction-level parallelism.
+	; For clarity, integer instructions (for the rounds calculation) are indented
+	; by one tab. Vectored instructions (for the message scheduler) are indented
+	; by two tabs.
+
+	mov	T1, f_64
+		movdqa	xmm2, [W_t(%%t-2)]	; XMM2 = W[t-2]
+	xor	T1, g_64
+	and	T1, e_64
+		movdqa	xmm0, xmm2	; XMM0 = W[t-2]
+	xor	T1, g_64
+	add	T1, [WK_2(%%t)]
+		movdqu	xmm5, [W_t(%%t-15)]	; XMM5 = W[t-15]
+	mov	tmp0, e_64
+	ror	tmp0, 23 ; 41
+		movdqa	xmm3, xmm5	; XMM3 = W[t-15]
+	xor	tmp0, e_64
+	ror	tmp0, 4 ; 18
+		psrlq	xmm0, 61 - 19	; XMM0 = W[t-2] >> 42
+	xor	tmp0, e_64
+	ror	tmp0, 14 ; 14
+		psrlq	xmm3, (8 - 7)	; XMM3 = W[t-15] >> 1
+	add	T1, tmp0
+	add	T1, h_64
+		pxor	xmm0, xmm2	; XMM0 = (W[t-2] >> 42) ^ W[t-2]
+	mov	T2, a_64
+	xor	T2, c_64
+		pxor	xmm3, xmm5	; XMM3 = (W[t-15] >> 1) ^ W[t-15]
+	and	T2, b_64
+	mov	tmp0, a_64
+		psrlq	xmm0, 19 - 6	; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+	and	tmp0, c_64
+	xor	T2, tmp0
+		psrlq	xmm3, (7 - 1)	; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+	mov	tmp0, a_64
+	ror	tmp0, 5 ; 39
+		pxor	xmm0, xmm2	; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+	xor	tmp0, a_64
+	ror	tmp0, 6 ; 34
+		pxor	xmm3, xmm5	; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+	xor	tmp0, a_64
+	ror	tmp0, 28 ; 28
+		psrlq	xmm0, 6		; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+	add	T2, tmp0
+	add	d_64, T1
+		psrlq	xmm3, 1		; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+	lea	h_64, [T1 + T2]
+	RotateState
+		movdqa	xmm1, xmm2	; XMM1 = W[t-2]
+	mov	T1, f_64
+	xor	T1, g_64
+		movdqa	xmm4, xmm5	; XMM4 = W[t-15]
+	and	T1, e_64
+	xor	T1, g_64
+		psllq	xmm1, (64 - 19) - (64 - 61)	; XMM1 = W[t-2] << 42
+	add	T1, [WK_2(%%t+1)]
+	mov	tmp0, e_64
+		psllq	xmm4, (64 - 1) - (64 - 8)	; XMM4 = W[t-15] << 7
+	ror	tmp0, 23 ; 41
+	xor	tmp0, e_64
+		pxor	xmm1, xmm2	; XMM1 = (W[t-2] << 42)^W[t-2]
+	ror	tmp0, 4 ; 18
+	xor	tmp0, e_64
+		pxor	xmm4, xmm5	; XMM4 = (W[t-15]<<7)^W[t-15]
+	ror	tmp0, 14 ; 14
+	add	T1, tmp0
+		psllq	xmm1, (64 - 61)	; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+	add	T1, h_64
+	mov	T2, a_64
+		psllq	xmm4, (64 - 8)	; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+	xor	T2, c_64
+	and	T2, b_64
+		pxor	xmm0, xmm1	; XMM0 = s1(W[t-2])
+	mov	tmp0, a_64
+	and	tmp0, c_64
+		movdqu	xmm1, [W_t(%%t- 7)]	; XMM1 = W[t-7]
+	xor	T2, tmp0
+		pxor	xmm3, xmm4	; XMM3 = s0(W[t-15])
+	mov	tmp0, a_64
+		paddq	xmm0, xmm3	; XMM0 = s1(W[t-2]) + s0(W[t-15])
+	ror	tmp0, 5 ; 39
+		paddq	xmm0, [W_t(%%t-16)]	; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+	xor	tmp0, a_64
+		paddq	xmm0, xmm1	; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+	ror	tmp0, 6 ; 34
+		movdqa	[W_t(%%t)], xmm0	; Store scheduled qwords
+	xor	tmp0, a_64
+		paddq	xmm0, [K_t(%%t)]	; Compute W[t]+K[t]
+	ror	tmp0, 28 ; 28
+		movdqa	[WK_2(%%t)], xmm0	; Store W[t]+K[t] for next rounds
+	add	T2, tmp0
+	add	d_64, T1
+	lea	h_64, [T1 + T2]
+	RotateState
+%endmacro
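The macro above vectorizes, two qwords at a time, the standard FIPS 180-4 SHA-512 message-schedule recurrence; the shift pairs (61-19, 19-6, then 6) are just a way to share work between the two rotations of sigma1. A plain-C sketch of the same recurrence, for reference only (rotr64 is a hypothetical helper, not part of this patch):

#include <stdint.h>

static inline uint64_t rotr64(uint64_t x, unsigned n) {
	return (x >> n) | (x << (64 - n));
}

static inline uint64_t sigma0(uint64_t x) { /* s0 in the asm comments */
	return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7);
}

static inline uint64_t sigma1(uint64_t x) { /* s1 in the asm comments */
	return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6);
}

/* Extend the 16 message qwords to the full 80-entry schedule. */
static void sha512_schedule(uint64_t W[80]) {
	for (int t = 16; t < 80; t++)
		W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
}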
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_sse4(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of SHA512
+; message blocks.
+; L is the message length in SHA512 blocks.
+global sha512_sse4:function
+sha512_sse4:
+	cmp	msglen, 0
+	je	.nowork
+
+	; Allocate Stack Space
+	sub	rsp, frame_size
+
+	; Save GPRs
+	mov	[rsp + frame.GPRSAVE + 8 * 0], rbx
+	mov	[rsp + frame.GPRSAVE + 8 * 1], r12
+	mov	[rsp + frame.GPRSAVE + 8 * 2], r13
+	mov	[rsp + frame.GPRSAVE + 8 * 3], r14
+	mov	[rsp + frame.GPRSAVE + 8 * 4], r15
+%ifdef WINABI
+	mov	[rsp + frame.GPRSAVE + 8 * 5], rsi
+	mov	[rsp + frame.GPRSAVE + 8 * 6], rdi
+%endif
+
+.updateblock:
+
+	; Load state variables
+	mov	a_64, [DIGEST(0)]
+	mov	b_64, [DIGEST(1)]
+	mov	c_64, [DIGEST(2)]
+	mov	d_64, [DIGEST(3)]
+	mov	e_64, [DIGEST(4)]
+	mov	f_64, [DIGEST(5)]
+	mov	g_64, [DIGEST(6)]
+	mov	h_64, [DIGEST(7)]
+
+	%assign t 0
+	%rep 80/2 + 1
+	; (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	; +1 iteration because the scheduler leads hashing by 1 iteration
+		%if t < 2
+			; BSWAP 2 QWORDS
+			movdqa	xmm1, [XMM_QWORD_BSWAP wrt rip]
+			movdqu	xmm0, [MSG(t)]
+			pshufb	xmm0, xmm1	; BSWAP
+			movdqa	[W_t(t)], xmm0	; Store Scheduled Pair
+			paddq	xmm0, [K_t(t)]	; Compute W[t]+K[t]
+			movdqa	[WK_2(t)], xmm0	; Store into WK for rounds
+		%elif t < 16
+			; BSWAP 2 QWORDS; Compute 2 Rounds
+			movdqu	xmm0, [MSG(t)]
+			pshufb	xmm0, xmm1	; BSWAP
+			SHA512_Round t - 2	; Round t-2
+			movdqa	[W_t(t)], xmm0	; Store Scheduled Pair
+			paddq	xmm0, [K_t(t)]	; Compute W[t]+K[t]
+			SHA512_Round t - 1	; Round t-1
+			movdqa	[WK_2(t)], xmm0	; Store W[t]+K[t] into WK
+		%elif t < 79
+			; Schedule 2 QWORDS; Compute 2 Rounds
+			SHA512_2Sched_2Round_sse t
+		%else
+			; Compute 2 Rounds
+			SHA512_Round t - 2
+			SHA512_Round t - 1
+		%endif
+	%assign t t+2
+	%endrep
+
+	; Update digest
+	add	[DIGEST(0)], a_64
+	add	[DIGEST(1)], b_64
+	add	[DIGEST(2)], c_64
+	add	[DIGEST(3)], d_64
+	add	[DIGEST(4)], e_64
+	add	[DIGEST(5)], f_64
+	add	[DIGEST(6)], g_64
+	add	[DIGEST(7)], h_64
+
+	; Advance to next message block
+	add	msg, 16*8
+	dec	msglen
+	jnz	.updateblock
+
+	; Restore GPRs
+	mov	rbx, [rsp + frame.GPRSAVE + 8 * 0]
+	mov	r12, [rsp + frame.GPRSAVE + 8 * 1]
+	mov	r13, [rsp + frame.GPRSAVE + 8 * 2]
+	mov	r14, [rsp + frame.GPRSAVE + 8 * 3]
+	mov	r15, [rsp + frame.GPRSAVE + 8 * 4]
+%ifdef WINABI
+	mov	rsi, [rsp + frame.GPRSAVE + 8 * 5]
+	mov	rdi, [rsp + frame.GPRSAVE + 8 * 6]
+%endif
+	; Restore Stack Pointer
+	add	rsp, frame_size
+
+.nowork:
+	ret
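Both block routines share the same three-argument contract: they only consume whole 128-byte blocks and leave all buffering of partial input to the caller (which is what sha512.c below does). A hedged usage sketch, where hash_blocks and the state handling are illustrative only:

#include <stdint.h>
#include <stddef.h>

typedef void (*sha512_block_fn)(const void *msg, void *digest, uint64_t nblocks);

extern void sha512_sse4(const void *msg, void *digest, uint64_t nblocks);
extern void sha512_avx(const void *msg, void *digest, uint64_t nblocks);

/* Hash an exact multiple of 128-byte blocks with whichever variant the
 * CPU supports (CPU-feature dispatch elided; pcompress does it in
 * Init_SHA512). */
static void hash_blocks(sha512_block_fn fn, uint64_t state[8],
    const uint8_t *data, size_t len)
{
	uint64_t nblocks = len / 128;   /* SHA-512 block = 16 qwords */

	if (nblocks)
		fn(data, state, nblocks);
	/* Any trailing (len % 128) bytes must be buffered by the caller
	 * until a full block or final padding is available. */
}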
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	ddq 0x08090a0b0c0d0e0f0001020304050607
+
+; K[t] used in SHA512 hashing
+K512:
+	dq 0x428a2f98d728ae22,0x7137449123ef65cd
+	dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	dq 0x3956c25bf348b538,0x59f111f1b605d019
+	dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	dq 0xd807aa98a3030242,0x12835b0145706fbe
+	dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	dq 0x9bdc06a725c71235,0xc19bf174cf692694
+	dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	dq 0x983e5152ee66dfab,0xa831c66d2db43210
+	dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	dq 0x06ca6351e003826f,0x142929670a0e6e70
+	dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+	dq 0x81c2c92e47edaee6,0x92722c851482353b
+	dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+	dq 0xd192e819d6ef5218,0xd69906245565a910
+	dq 0xf40e35855771202a,0x106aa07032bbd1b8
+	dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+	dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	dq 0x90befffa23631e28,0xa4506cebde82bde9
+	dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	dq 0xca273eceea26619c,0xd186b8c721c0c207
+	dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	dq 0x113f9804bef90dae,0x1b710b35131c471b
+	dq 0x28db77f523047d84,0x32caab7b40c72493
+	dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
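The XMM_QWORD_BSWAP constant deserves a word: (v)pshufb picks destination byte i from source byte mask[i], so this mask reverses the bytes within each of the two qwords of an XMM register, converting the message's big-endian words to host order. An illustrative C equivalent (not part of the patch):

#include <stdint.h>
#include <string.h>

/* Byte-level equivalent of pshufb with the XMM_QWORD_BSWAP mask:
 * the little-endian bytes of 0x08090a0b0c0d0e0f0001020304050607
 * are {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8}. */
static void bswap_two_qwords(uint8_t block[16]) {
	static const uint8_t mask[16] = {
		7, 6, 5, 4, 3, 2, 1, 0,       /* reverse low qword  */
		15, 14, 13, 12, 11, 10, 9, 8  /* reverse high qword */
	};
	uint8_t out[16];

	for (int i = 0; i < 16; i++)
		out[i] = block[mask[i]];
	memcpy(block, out, 16);
}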
diff --git a/crypto/sha2/sha256.c b/crypto/sha2/sha256.c
deleted file mode 100644
index f8960ea..0000000
--- a/crypto/sha2/sha256.c
+++ /dev/null
@@ -1,271 +0,0 @@
-/*-
- * Copyright (c) 2001-2003 Allan Saddi
- * Copyright (c) 2012 Moinak Ghosh moinakg gm0il com
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif /* HAVE_CONFIG_H */
-
-#if HAVE_INTTYPES_H
-# include <inttypes.h>
-#else
-# if HAVE_STDINT_H
-#  include <stdint.h>
-# endif
-#endif
-
-#include <string.h>
-#include <arpa/inet.h>
-#include <utils.h>
-#include <sha256.h>
-
-#ifdef WORDS_BIGENDIAN
-
-#define BYTESWAP(x) (x)
-#define BYTESWAP64(x) (x)
-
-#else /* WORDS_BIGENDIAN */
-
-#define BYTESWAP(x) htonl(x)
-#define BYTESWAP64(x) htonll(x)
-
-#endif /* WORDS_BIGENDIAN */
-typedef void (*update_func_ptr)(void *input_data, uint32_t digest[8], uint64_t num_blks);
-
-static uint8_t padding[64] = {
-	0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
-static const uint32_t iv256[SHA256_HASH_WORDS] = {
-	0x6a09e667L,
-	0xbb67ae85L,
-	0x3c6ef372L,
-	0xa54ff53aL,
-	0x510e527fL,
-	0x9b05688cL,
-	0x1f83d9abL,
-	0x5be0cd19L
-};
-
-static update_func_ptr sha_update_func;
-
-int
-APS_NAMESPACE(Init_SHA) (processor_info_t *pc)
-{
-	if (pc->proc_type == PROC_X64_INTEL || pc->proc_type == PROC_X64_AMD) {
-		if (pc->avx_level > 0) {
-			sha_update_func = sha256_avx;
-
-		} else if (pc->sse_level >= 4) {
-			sha_update_func = sha256_sse4;
-
-		} else {
-			return (1);
-		}
-		return (0);
-	}
-	return (1);
-}
-
-static void
-_init (SHA256_Context *sc, const uint32_t iv[SHA256_HASH_WORDS])
-{
-	/*
-	 * SHA256_HASH_WORDS is 8, must be 8, cannot be anything but 8!
-	 * So we unroll a loop here.
-	 */
-	sc->hash[0] = iv[0];
-	sc->hash[1] = iv[1];
-	sc->hash[2] = iv[2];
-	sc->hash[3] = iv[3];
-	sc->hash[4] = iv[4];
-	sc->hash[5] = iv[5];
-	sc->hash[6] = iv[6];
-	sc->hash[7] = iv[7];
-
-	sc->totalLength = 0LL;
-	sc->bufferLength = 0L;
-}
-
-void
-APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc)
-{
-	_init (sc, iv256);
-}
-
-void
-APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, const void *vdata, size_t len)
-{
-	const uint8_t *data = (const uint8_t *)vdata;
-	uint32_t bufferBytesLeft;
-	size_t bytesToCopy;
-	int rem;
-
-	if (sc->bufferLength) {
-		do {
-			bufferBytesLeft = 64L - sc->bufferLength;
-			bytesToCopy = bufferBytesLeft;
-			if (bytesToCopy > len)
-				bytesToCopy = len;
-
-			memcpy (&sc->buffer.bytes[sc->bufferLength], data, bytesToCopy);
-			sc->totalLength += bytesToCopy * 8L;
-			sc->bufferLength += bytesToCopy;
-			data += bytesToCopy;
-			len -= bytesToCopy;
-
-			if (sc->bufferLength == 64L) {
-				sc->blocks = 1;
-				sha_update_func(sc->buffer.words, sc->hash, sc->blocks);
-				sc->bufferLength = 0L;
-			} else {
-				return;
-			}
-		} while (len > 0 && len <= 64L);
-		if (!len) return;
-	}
-
-	sc->blocks = len >> 6;
-	rem = len - (sc->blocks << 6);
-	len = sc->blocks << 6;
-	sc->totalLength += rem * 8L;
-
-	if (len) {
-		sc->totalLength += len * 8L;
-		sha_update_func((uint32_t *)data, sc->hash, sc->blocks);
-	}
-	if (rem) {
-		memcpy (&sc->buffer.bytes[0], data + len, rem);
-		sc->bufferLength = rem;
-	}
-}
-
-static void
-_final (SHA256_Context *sc, uint8_t *hash, int hashWords)
-{
-	uint32_t bytesToPad;
-	uint64_t lengthPad;
-	int i;
-
-	bytesToPad = 120L - sc->bufferLength;
-	if (bytesToPad > 64L)
-		bytesToPad -= 64L;
-
-	lengthPad = BYTESWAP64(sc->totalLength);
-
-	APS_NAMESPACE(SHA256_Update) (sc, padding, bytesToPad);
-	APS_NAMESPACE(SHA256_Update) (sc, &lengthPad, 8L);
-
-	if (hash) {
-		for (i = 0; i < hashWords; i++) {
-			hash[0] = (uint8_t) (sc->hash[i] >> 24);
-			hash[1] = (uint8_t) (sc->hash[i] >> 16);
-			hash[2] = (uint8_t) (sc->hash[i] >> 8);
-			hash[3] = (uint8_t) sc->hash[i];
-			hash += 4;
-		}
-	}
-}
-
-void
-APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE])
-{
-	_final (sc, hash, SHA256_HASH_WORDS);
-}
-
-/* Initialize an HMAC-SHA256 operation with the given key. */
-void
-APS_NAMESPACE(HMAC_SHA256_Init) (HMAC_SHA256_Context * ctx, const void * _K, size_t Klen)
-{
-	unsigned char pad[64];
-	unsigned char khash[32];
-	const unsigned char * K = (const unsigned char *)_K;
-	size_t i;
-
-	/* If Klen > 64, the key is really SHA256(K). */
-	if (Klen > 64) {
-		APS_NAMESPACE(SHA256_Init)(&ctx->ictx);
-		APS_NAMESPACE(SHA256_Update)(&ctx->ictx, K, Klen);
-		APS_NAMESPACE(SHA256_Final)(&ctx->ictx, khash);
-		K = khash;
-		Klen = 32;
-	}
-
-	/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-	APS_NAMESPACE(SHA256_Init)(&ctx->ictx);
-	memset(pad, 0x36, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-	APS_NAMESPACE(SHA256_Update)(&ctx->ictx, pad, 64);
-
-	/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-	APS_NAMESPACE(SHA256_Init)(&ctx->octx);
-	memset(pad, 0x5c, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-	APS_NAMESPACE(SHA256_Update)(&ctx->octx, pad, 64);
-
-	/* Clean the stack. */
-	memset(khash, 0, 32);
-}
-
-/* Add bytes to the HMAC-SHA256 operation. */
-void
-APS_NAMESPACE(HMAC_SHA256_Update) (HMAC_SHA256_Context * ctx, const void *in, size_t len)
-{
-	/* Feed data to the inner SHA256 operation. */
-	APS_NAMESPACE(SHA256_Update)(&ctx->ictx, in, len);
-}
-
-/* Finish an HMAC-SHA256 operation. */
-void
-APS_NAMESPACE(HMAC_SHA256_Final) (HMAC_SHA256_Context * ctx, unsigned char digest[32])
-{
-	unsigned char ihash[32];
-
-	/* Finish the inner SHA256 operation. */
-	APS_NAMESPACE(SHA256_Final)(&ctx->ictx, ihash);
-
-	/* Feed the inner hash to the outer SHA256 operation. */
-	APS_NAMESPACE(SHA256_Update)(&ctx->octx, ihash, 32);
-
-	/* Finish the outer SHA256 operation. */
-	APS_NAMESPACE(SHA256_Final)(&ctx->octx, digest);
-
-	/* Clean the stack. */
-	memset(ihash, 0, 32);
-}
diff --git a/crypto/sha2/sha256.h b/crypto/sha2/sha256.h
deleted file mode 100644
index 53ca362..0000000
--- a/crypto/sha2/sha256.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*-
- * Copyright (c) 2001-2003 Allan Saddi
- * Copyright (c) 2012 Moinak Ghosh moinakg gm0il com
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _APS_SHA256_H
-#define _APS_SHA256_H
-
-#if HAVE_INTTYPES_H
-# include <inttypes.h>
-#else
-# if HAVE_STDINT_H
-#  include <stdint.h>
-# endif
-#endif
-
-#include <utils.h>
-
-#define SHA256_HASH_SIZE 32
-
-/* Hash size in 32-bit words */
-#define SHA256_HASH_WORDS 8
-
-typedef struct _SHA256_Context {
-	uint64_t totalLength, blocks;
-	uint32_t hash[SHA256_HASH_WORDS];
-	uint32_t bufferLength;
-	union {
-		uint32_t words[16];
-		uint8_t bytes[64];
-	} buffer;
-} SHA256_Context;
-
-typedef struct HMAC_SHA256Context {
-	SHA256_Context ictx;
-	SHA256_Context octx;
-} HMAC_SHA256_Context;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef APS_NAMESPACE
-#define APS_NAMESPACE(name) opt_##name
-#endif /* !APS_NAMESPACE */
-
-void APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc);
-void APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, const void *data, size_t len);
-void APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE]);
-int APS_NAMESPACE(Init_SHA) (processor_info_t *pc);
-
-void APS_NAMESPACE(HMAC_SHA256_Init) (HMAC_SHA256_Context * ctx, const void * _K, size_t Klen);
-void APS_NAMESPACE(HMAC_SHA256_Update) (HMAC_SHA256_Context * ctx, const void *in, size_t len);
-void APS_NAMESPACE(HMAC_SHA256_Final) (HMAC_SHA256_Context * ctx, unsigned char digest[32]);
-
-/*
- * Intel's optimized SHA256 core routines. These routines are described in an
- * Intel White-Paper:
- * "Fast SHA-256 Implementations on Intel Architecture Processors"
- */
-extern void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);
-extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* !_APS_SHA256_H */
diff --git a/crypto/sha2/sha512.c b/crypto/sha2/sha512.c
new file mode 100644
index 0000000..55ca23d
--- /dev/null
+++ b/crypto/sha2/sha512.c
@@ -0,0 +1,294 @@
+/*-
+ * Copyright (c) 2001-2003 Allan Saddi
+ * Copyright (c) 2012 Moinak Ghosh moinakg gm0il com
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#else
+# if HAVE_STDINT_H
+#  include <stdint.h>
+# endif
+#endif
+
+#include <string.h>
+#include <arpa/inet.h>
+#include <utils.h>
+#include "sha512.h"
+
+
+#ifdef WORDS_BIGENDIAN
+
+#define BYTESWAP(x) (x)
+#define BYTESWAP64(x) (x)
+
+#else /* WORDS_BIGENDIAN */
+
+#define BYTESWAP(x) htonl(x)
+#define BYTESWAP64(x) htonll(x)
+
+#endif /* WORDS_BIGENDIAN */
+
+typedef void (*update_func_ptr)(const void *input_data, void *digest, uint64_t num_blks);
+
+static const uint8_t padding[128] = {
+	0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const uint64_t iv512[SHA512_HASH_WORDS] = {
+	0x6a09e667f3bcc908LL,
+	0xbb67ae8584caa73bLL,
+	0x3c6ef372fe94f82bLL,
+	0xa54ff53a5f1d36f1LL,
+	0x510e527fade682d1LL,
+	0x9b05688c2b3e6c1fLL,
+	0x1f83d9abfb41bd6bLL,
+	0x5be0cd19137e2179LL
+};
+
+static const uint64_t iv256[SHA512_HASH_WORDS] = {
+	0x22312194fc2bf72cLL,
+	0x9f555fa3c84c64c2LL,
+	0x2393b86b6f53b151LL,
+	0x963877195940eabdLL,
+	0x96283ee2a88effe3LL,
+	0xbe5e1e2553863992LL,
+	0x2b0199fc2c85b8aaLL,
+	0x0eb72ddc81c52ca2LL
+};
+
+static update_func_ptr sha512_update_func;
+
+int
+APS_NAMESPACE(Init_SHA512) (processor_info_t *pc)
+{
+	if (pc->proc_type == PROC_X64_INTEL || pc->proc_type == PROC_X64_AMD) {
+		if (pc->avx_level > 0) {
+			sha512_update_func = sha512_avx;
+
+		} else if (pc->sse_level >= 4) {
+			sha512_update_func = sha512_sse4;
+
+		} else {
+			return (1);
+		}
+		return (0);
+	}
+	return (1);
+}
+
+static void
+_init (SHA512_Context *sc, const uint64_t iv[SHA512_HASH_WORDS])
+{
+	int i;
+
+	sc->totalLength[0] = 0LL;
+	sc->totalLength[1] = 0LL;
+	for (i = 0; i < SHA512_HASH_WORDS; i++)
+		sc->hash[i] = iv[i];
+	sc->bufferLength = 0L;
+}
+
+void
+APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc)
+{
+	_init (sc, iv512);
+}
+
+void
+APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc)
+{
+	_init (sc, iv256);
+}
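SHA512_Update below tracks the message length in bits as a 128-bit counter spread over totalLength[2] (high word in [0], low word in [1]), detecting wrap of the low word by comparing against its previous value. A minimal sketch of just that bookkeeping, pulled out for clarity (illustrative only):

#include <stdint.h>

/* Add nbytes of input to a 128-bit running bit count. */
static void add_bits(uint64_t totalLength[2], uint64_t nbytes)
{
	uint64_t carryCheck = totalLength[1];

	totalLength[1] += nbytes * 8;     /* count bits, not bytes */
	if (totalLength[1] < carryCheck)  /* low word wrapped around */
		totalLength[0]++;
}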
+
+void
+APS_NAMESPACE(SHA512_Update) (SHA512_Context *sc, const void *vdata, size_t len)
+{
+	const uint8_t *data = (const uint8_t *)vdata;
+	uint32_t bufferBytesLeft;
+	size_t bytesToCopy;
+	int rem;
+	uint64_t carryCheck;
+
+	if (sc->bufferLength) {
+		do {
+			bufferBytesLeft = 128L - sc->bufferLength;
+			bytesToCopy = bufferBytesLeft;
+			if (bytesToCopy > len)
+				bytesToCopy = len;
+
+			memcpy (&sc->buffer.bytes[sc->bufferLength], data, bytesToCopy);
+			carryCheck = sc->totalLength[1];
+			sc->totalLength[1] += bytesToCopy * 8L;
+			if (sc->totalLength[1] < carryCheck)
+				sc->totalLength[0]++;
+
+			sc->bufferLength += bytesToCopy;
+			data += bytesToCopy;
+			len -= bytesToCopy;
+
+			if (sc->bufferLength == 128L) {
+				sc->blocks = 1;
+				sha512_update_func(sc->buffer.words, sc->hash, sc->blocks);
+				sc->bufferLength = 0L;
+			} else {
+				return;
+			}
+		} while (len > 0 && len <= 128L);
+		if (!len) return;
+	}
+
+	sc->blocks = len >> 7;
+	rem = len - (sc->blocks << 7);
+	len = sc->blocks << 7;
+	carryCheck = sc->totalLength[1];
+	sc->totalLength[1] += rem * 8L;
+	if (sc->totalLength[1] < carryCheck)
+		sc->totalLength[0]++;
+
+	if (len) {
+		carryCheck = sc->totalLength[1];
+		sc->totalLength[1] += len * 8L;
+		if (sc->totalLength[1] < carryCheck)
+			sc->totalLength[0]++;
+		sha512_update_func(data, sc->hash, sc->blocks);
+	}
+	if (rem) {
+		memcpy (&sc->buffer.bytes[0], data + len, rem);
+		sc->bufferLength = rem;
+	}
+}
+
+void
+APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len)
+{
+	APS_NAMESPACE(SHA512_Update) (sc, data, len);
+}
+
+static void
+_final (SHA512_Context *sc, uint8_t *hash, int hashWords, int halfWord)
+{
+	uint32_t bytesToPad;
+	uint64_t lengthPad[2];
+	int i;
+
+	bytesToPad = 240L - sc->bufferLength;
+	if (bytesToPad > 128L)
+		bytesToPad -= 128L;
+
+	lengthPad[0] = BYTESWAP64(sc->totalLength[0]);
+	lengthPad[1] = BYTESWAP64(sc->totalLength[1]);
+
+	APS_NAMESPACE(SHA512_Update) (sc, padding, bytesToPad);
+	APS_NAMESPACE(SHA512_Update) (sc, lengthPad, 16L);
+
+	if (hash) {
+		for (i = 0; i < hashWords; i++) {
+			*((uint64_t *) hash) = BYTESWAP64(sc->hash[i]);
+			hash += 8;
+		}
+		if (halfWord) {
+			hash[0] = (uint8_t) (sc->hash[i] >> 56);
+			hash[1] = (uint8_t) (sc->hash[i] >> 48);
+			hash[2] = (uint8_t) (sc->hash[i] >> 40);
+			hash[3] = (uint8_t) (sc->hash[i] >> 32);
+		}
+	}
+}
+
+void
+APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE])
+{
+	_final (sc, hash, SHA512_HASH_WORDS, 0);
+}
+
+void
+APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE])
+{
+	_final (sc, hash, SHA512t256_HASH_WORDS, 0);
+}
+
+#define HASH_CONTEXT SHA512_Context
+#define HASH_INIT APS_NAMESPACE(SHA512_Init)
+#define HASH_UPDATE APS_NAMESPACE(SHA512_Update)
+#define HASH_FINAL APS_NAMESPACE(SHA512_Final)
+#define HASH_SIZE SHA512_HASH_SIZE
+#define HASH_BLOCK_SIZE 128
+
+#define HMAC_CONTEXT HMAC_SHA512_Context
+#define HMAC_INIT APS_NAMESPACE(HMAC_SHA512_Init)
+#define HMAC_UPDATE APS_NAMESPACE(HMAC_SHA512_Update)
+#define HMAC_FINAL APS_NAMESPACE(HMAC_SHA512_Final)
+
+#include "_hmac.c"
+
+#undef HASH_CONTEXT
+#undef HASH_INIT
+#undef HASH_UPDATE
+#undef HASH_FINAL
+#undef HASH_SIZE
+#undef HASH_BLOCK_SIZE
+#undef HMAC_CONTEXT
+#undef HMAC_INIT
+#undef HMAC_UPDATE
+#undef HMAC_FINAL
+
+#define HASH_CONTEXT SHA512_Context
+#define HASH_INIT APS_NAMESPACE(SHA512t256_Init)
+#define HASH_UPDATE APS_NAMESPACE(SHA512t256_Update)
+#define HASH_FINAL APS_NAMESPACE(SHA512t256_Final)
+#define HASH_SIZE SHA512t256_HASH_SIZE
+#define HASH_BLOCK_SIZE 128
+
+#define HMAC_CONTEXT HMAC_SHA512_Context
+#define HMAC_INIT APS_NAMESPACE(HMAC_SHA512t256_Init)
+#define HMAC_UPDATE APS_NAMESPACE(HMAC_SHA512t256_Update)
+#define HMAC_FINAL APS_NAMESPACE(HMAC_SHA512t256_Final)
+
+#include "_hmac.c"
+
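The new file exposes SHA-512/256 (SHA-512 rounds with a distinct IV, truncated to 32 bytes) alongside full SHA-512, and stamps out both HMAC variants by including the _hmac.c template twice with different macro bindings. A hedged usage sketch of the truncated path, assuming only what sha512.h declares (APS_NAMESPACE maps names to the opt_ prefix, and opt_Init_SHA512 has already selected an SSE4/AVX block routine):

#include <stdint.h>
#include <stddef.h>
#include "sha512.h"

/* One-shot SHA-512/256 digest of buf[0..len) into a 32-byte output. */
static void digest_sha512t256(const void *buf, size_t len,
    uint8_t out[SHA512t256_HASH_SIZE])
{
	SHA512_Context ctx;

	opt_SHA512t256_Init(&ctx);        /* SHA-512 rounds, SHA-512/256 IV */
	opt_SHA512t256_Update(&ctx, buf, len);
	opt_SHA512t256_Final(&ctx, out);  /* truncated 256-bit digest */
}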
diff --git a/crypto/sha2/sha512.h b/crypto/sha2/sha512.h
new file mode 100644
index 0000000..501c0b0
--- /dev/null
+++ b/crypto/sha2/sha512.h
@@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2001-2003 Allan Saddi
+ * Copyright (c) 2012 Moinak Ghosh moinakg gm0il com
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _APS_SHA512_H
+#define _APS_SHA512_H
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#else
+# if HAVE_STDINT_H
+#  include <stdint.h>
+# endif
+#endif
+
+#include <utils.h>
+
+#define SHA512_HASH_SIZE 64
+#define SHA512t256_HASH_SIZE 32
+
+/* Hash size in 64-bit words */
+#define SHA512_HASH_WORDS 8
+#define SHA512t256_HASH_WORDS 4
+
+typedef struct _SHA512_Context {
+	uint64_t totalLength[2], blocks;
+	uint64_t hash[SHA512_HASH_WORDS];
+	uint32_t bufferLength;
+	union {
+		uint64_t words[16];
+		uint8_t bytes[128];
+	} buffer;
+} SHA512_Context;
+
+typedef struct {
+	SHA512_Context outer;
+	SHA512_Context inner;
+} HMAC_SHA512_Context;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef APS_NAMESPACE
+#define APS_NAMESPACE(name) opt_##name
+#endif /* !APS_NAMESPACE */
+
+void APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc);
+void APS_NAMESPACE(SHA512_Update) (SHA512_Context *sc, const void *data, size_t len);
+void APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE]);
+int APS_NAMESPACE(Init_SHA512) (processor_info_t *pc);
+
+/* SHA-512/256 (and SHA-512/224) use the same context as SHA-512. */
+#define SHA512t256_Context SHA512_Context
+void APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc);
+void APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len);
+void APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE]);
+
+void APS_NAMESPACE(HMAC_SHA512_Init) (HMAC_SHA512_Context *ctxt, const void *key, size_t keyLen);
+void APS_NAMESPACE(HMAC_SHA512_Update) (HMAC_SHA512_Context *ctxt, const void *data, size_t len);
+void APS_NAMESPACE(HMAC_SHA512_Final) (HMAC_SHA512_Context *ctxt, uint8_t hmac[SHA512_HASH_SIZE]);
+
+void APS_NAMESPACE(HMAC_SHA512t256_Init) (HMAC_SHA512_Context *ctxt, const void *key, size_t keyLen);
+void APS_NAMESPACE(HMAC_SHA512t256_Update) (HMAC_SHA512_Context *ctxt, const void *data, size_t len);
+void APS_NAMESPACE(HMAC_SHA512t256_Final) (HMAC_SHA512_Context *ctxt, uint8_t hmac[SHA512t256_HASH_SIZE]);
+
+/*
+ * Intel's optimized SHA512 core routines. These routines are described in an
+ * Intel White-Paper:
+ * "Fast SHA-512 Implementations on Intel Architecture Processors"
+ * Note: Works on AMD Bulldozer and later as well.
+ */
+extern void sha512_sse4(const void *input_data, void *digest, uint64_t num_blks);
+extern void sha512_avx(const void *input_data, void *digest, uint64_t num_blks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !_APS_SHA512_H */
diff --git a/main.c b/main.c
index fb54d0c..0c3e2a4 100644
--- a/main.c
+++ b/main.c
@@ -2149,6 +2149,7 @@ main(int argc, char *argv[])
 	level = 6;
 	err = 0;
 	slab_init();
+	init_pcompress();
 
 	while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEew:rLPS:B:F")) != -1) {
 		int ovr;
@@ -2341,7 +2342,6 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 	main_cancel = 0;
-	init_pcompress();
 
 	if (cksum == 0)
 		get_checksum_props(DEFAULT_CKSUM, &cksum, &cksum_bytes, &mac_bytes);
diff --git a/utils/xxhash_base.c b/utils/xxhash_base.c
index 868994d..5c02abf 100644
--- a/utils/xxhash_base.c
+++ b/utils/xxhash_base.c
@@ -20,7 +20,6 @@ void * (*xxh32_init)(unsigned int seed) = NULL;
 int (*xxh32_feed)(void* state, const void* input, int len) = NULL;
 unsigned int (*xxh32_result)(void* state) = NULL;
 unsigned int (*xxh32_getIntermediateResult)(void* state) = NULL;
-#include
 
 void
 XXH32_module_init() {