Major changes to use Intel's optimized SHA512 code for SHA512 and SHA512/256.

Remove earlier SHA256 code which is slower than SHA512/256 (on 64-bit CPU). Use HMAC from Allan Saddi's implementation for cleaner, faster code.
2013-01-25 22:55:55 +05:30 · 2013-01-25 22:55:55 +05:30 · 43af97042a
commit 43af97042a
parent 26bb137257
15 changed files with 1391 additions and 1540 deletions
--- a/Makefile.in
+++ b/Makefile.in
@ -102,12 +102,11 @@ SKEINHDRS = crypto/skein/brg_endian.h crypto/skein/SHA3api_ref.h \
 	crypto/skein/skein_debug.h crypto/skein/skein_iv.h
 SKEINOBJS = $(SKEINSRCS:.c=.o)
-SHA256_SRCS = crypto/sha2/sha256.c
+SHA2_SRCS = crypto/sha2/sha512.c
-SHA256_HDRS = crypto/sha2/sha256.h
+SHA2_HDRS = crypto/sha2/sha512.h
-SHA256ASM_SRCS = crypto/sha2/intel/sha256_avx1.asm \
+SHA2ASM_SRCS = crypto/sha2/intel/sha512_avx.asm crypto/sha2/intel/sha512_sse4.asm
-	crypto/sha2/intel/sha256_sse4.asm
+SHA2ASM_OBJS = $(SHA2ASM_SRCS:.asm=.o)
-SHA256ASM_OBJS = $(SHA256ASM_SRCS:.asm=.o)
+SHA2_OBJS = $(SHA2_SRCS:.c=.o)
 SHA256_OBJS = $(SHA256_SRCS:.c=.o)
 YASM = @YASM@ -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX
 LIBBSCWRAP = libbsc_compress.c
@ -161,7 +160,7 @@ LDLIBS = -ldl -L./buildtmp -Wl,-R@LIBBZ2_DIR@ -lbz2 -L./buildtmp -Wl,-R@LIBZ_DIR
 	-L./buildtmp -Wl,-R@OPENSSL_LIBDIR@ -lcrypto -lrt $(EXTRA_LDFLAGS)
 OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
 $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
-$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
+$(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
 $(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS)
 DEBUG_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@
@ -199,7 +198,7 @@ SSE3_OPT_FLAG = -mssse3
 SSE2_OPT_FLAG = -msse2
 SKEIN_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
-SHA256_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
+SHA2_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
 KECCAK_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
 all: $(PROG)
@ -237,10 +236,10 @@ $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
 $(SKEINOBJS): $(SKEINSRCS) $(SKEINHDRS)
 	$(COMPILE) $(SKEIN_FLAGS) $(@:.o=.c) -o $@
-$(SHA256_OBJS): $(SHA256_SRCS) $(SHA256_HDRS)
+$(SHA2_OBJS): $(SHA2_SRCS) $(SHA2_HDRS)
-	$(COMPILE) $(SHA256_FLAGS) $(@:.o=.c) -o $@
+	$(COMPILE) $(SHA2_FLAGS) $(@:.o=.c) -o $@
-$(SHA256ASM_OBJS): $(SHA256ASM_SRCS)
+$(SHA2ASM_OBJS): $(SHA2ASM_SRCS)
 	$(YASM)	-o $@ $(@:.o=.asm)
 $(KECCAK_OBJS): $(KECCAK_SRCS) $(KECCAK_HDRS)
--- a/8
+++ b/8
@ -236,8 +236,8 @@ then
 			# Minimum yasm version 1.1
 			[ $major -lt 1 -o $minor -lt 1 ] && continue
 			yasm=${bindir}/yasm
-			sha256asmobjs='\$\(SHA256ASM_OBJS\)'
+			sha256asmobjs='\$\(SHA2ASM_OBJS\)'
-			sha256objs='\$\(SHA256_OBJS\)'
+			sha256objs='\$\(SHA2_OBJS\)'
 		fi
 	done
 	if [ "x${yasm}" = "x" ]
@ -492,8 +492,8 @@ libbsclflagsvar="LIBBSCLFLAGS"
 libbscwrapobjvar="LIBBSCWRAPOBJ"
 libbscgenoptvar="LIBBSCGEN_OPT"
 libbsccppflagsvar="LIBBSCCPPFLAGS"
-sha256asmobjsvar="SHA256ASM_OBJS"
+sha256asmobjsvar="SHA2ASM_OBJS"
-sha256objsvar="SHA256_OBJS"
+sha256objsvar="SHA2_OBJS"
 yasmvar="YASM"
 fptr_flag_var="FPTR_FLAG"
 extra_opt_flags_var="EXTRA_OPT_FLAGS"
--- a/crypto/crypto_utils.c
+++ b/crypto/crypto_utils.c
@ -36,7 +36,8 @@
 #include <openssl/rand.h>
 #include <openssl/evp.h>
 #include <openssl/hmac.h>
-#include <sha256.h>
+//#include <sha256.h>
 #include <sha512.h>
 #include <crypto_aes.h>
 #include <KeccakNISTInterface.h>
 #include <utils.h>
@ -46,7 +47,7 @@
 #define	PROVIDER_OPENSSL	0
 #define	PROVIDER_X64_OPT	1
-static void init_sha256(void);
+static void init_sha512(void);
 static int geturandom_bytes(uchar_t rbytes[32]);
 /*
 * Checksum properties
@ -66,9 +67,9 @@ static struct {
 	{"SKEIN512",	"512-bit SKEIN",
 			CKSUM_SKEIN512,		64,	64,	NULL},
 	{"SHA256",	"Intel's optimized (SSE,AVX) 256-bit SHA2 implementation for x86.",
-			CKSUM_SHA256,		32,	32,	init_sha256},
+			CKSUM_SHA256,		32,	32,	init_sha512},
 	{"SHA512",	"512-bit SHA2 from OpenSSL's crypto library.",
-			CKSUM_SHA512,		64,	64,	NULL},
+			CKSUM_SHA512,		64,	64,	init_sha512},
 	{"KECCAK256",	"Official 256-bit NIST SHA3 optimized implementation.",
 			CKSUM_KECCAK256,		32,	32,	NULL},
 	{"KECCAK512",	"Official 512-bit NIST SHA3 optimized implementation.",
@ -190,18 +191,26 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes)
 			SHA256_Update(&ctx, buf, bytes);
 			SHA256_Final(cksum_buf, &ctx);
 		} else {
-			SHA256_Context ctx;
+			SHA512_Context ctx;
-			opt_SHA256_Init(&ctx);
+			opt_SHA512t256_Init(&ctx);
-			opt_SHA256_Update(&ctx, buf, bytes);
+			opt_SHA512t256_Update(&ctx, buf, bytes);
-			opt_SHA256_Final(&ctx, cksum_buf);
+			opt_SHA512t256_Final(&ctx, cksum_buf);
 		}
 	} else if (cksum == CKSUM_SHA512) {
 		if (cksum_provider == PROVIDER_OPENSSL) {
 			SHA512_CTX ctx;
 			SHA512_Init(&ctx);
 			SHA512_Update(&ctx, buf, bytes);
 			SHA512_Final(cksum_buf, &ctx);
 		} else {
 			SHA512_Context ctx;
 			opt_SHA512_Init(&ctx);
 			opt_SHA512_Update(&ctx, buf, bytes);
 			opt_SHA512_Final(&ctx, cksum_buf);
 		}
 	} else if (cksum == CKSUM_KECCAK256) {
 		if (Keccak_Hash(256, buf, bytes * 8, cksum_buf) != 0)
@ -219,7 +228,7 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes)
 }
 static void
-init_sha256(void)
+init_sha512(void)
 {
 #ifdef	WORDS_BIGENDIAN
 	cksum_provider = PROVIDER_OPENSSL;
@ -227,7 +236,7 @@ init_sha256(void)
 #ifdef	__x86_64__
 	cksum_provider = PROVIDER_OPENSSL;
 	if (proc_info.proc_type == PROC_X64_INTEL || proc_info.proc_type == PROC_X64_AMD) {
-		if (opt_Init_SHA(&proc_info) == 0) {
+		if (opt_Init_SHA512(&proc_info) == 0) {
 			cksum_provider = PROVIDER_X64_OPT;
 		}
 	}
@ -355,7 +364,7 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx)
 			}
 			mctx->mac_ctx_reinit = ctx;
 		} else {
-			HMAC_SHA256_Context *ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context));
+/*			HMAC_SHA256_Context *ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context));
 			if (!ctx) return (-1);
 			opt_HMAC_SHA256_Init(ctx, actx->pkey, KEYLEN);
 			mctx->mac_ctx = ctx;
@ -366,9 +375,23 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx)
 				return (-1);
 			}
 			memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA256_Context));
 			mctx->mac_ctx_reinit = ctx;*/
 			HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
 			if (!ctx) return (-1);
 			opt_HMAC_SHA512t256_Init(ctx, actx->pkey, KEYLEN);
 			mctx->mac_ctx = ctx;
 			ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
 			if (!ctx) {
 				free(mctx->mac_ctx);
 				return (-1);
 			}
 			memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA512_Context));
 			mctx->mac_ctx_reinit = ctx;
 		}
 	} else if (cksum == CKSUM_SHA512) {
 		if (cksum_provider == PROVIDER_OPENSSL) {
 			HMAC_CTX *ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX));
 			if (!ctx) return (-1);
 			HMAC_CTX_init(ctx);
@ -386,6 +409,20 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx)
 				return (-1);
 			}
 			mctx->mac_ctx_reinit = ctx;
 		} else {
 			HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
 			if (!ctx) return (-1);
 			opt_HMAC_SHA512_Init(ctx, actx->pkey, KEYLEN);
 			mctx->mac_ctx = ctx;
 			ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
 			if (!ctx) {
 				free(mctx->mac_ctx);
 				return (-1);
 			}
 			memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA512_Context));
 			mctx->mac_ctx_reinit = ctx;
 		}
 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		hashState *ctx = (hashState *)malloc(sizeof (hashState));
@ -423,16 +460,13 @@ hmac_reinit(mac_ctx_t *mctx)
 	if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) {
 		memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (Skein_512_Ctxt_t));
-	} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) {
+	} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) {
 		if (cksum_provider == PROVIDER_OPENSSL) {
 			HMAC_CTX_copy((HMAC_CTX *)(mctx->mac_ctx),
 				      (HMAC_CTX *)(mctx->mac_ctx_reinit));
 		} else {
-			memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (HMAC_SHA256_Context));
+			memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (HMAC_SHA512_Context));
 		}
 	} else if (cksum == CKSUM_SHA512) {
 		HMAC_CTX_copy((HMAC_CTX *)(mctx->mac_ctx), (HMAC_CTX *)(mctx->mac_ctx_reinit));
 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (hashState));
 	} else {
@ -458,15 +492,19 @@ hmac_update(mac_ctx_t *mctx, uchar_t *data, uint64_t len)
 			HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len);
 #endif
 		} else {
-			opt_HMAC_SHA256_Update((HMAC_SHA256_Context *)(mctx->mac_ctx), data, len);
+			opt_HMAC_SHA512t256_Update((HMAC_SHA512_Context *)(mctx->mac_ctx), data, len);
 		}
 	} else if (cksum == CKSUM_SHA512) {
 		if (cksum_provider == PROVIDER_OPENSSL) {
 #ifndef __OSSL_OLD__
 			if (HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len) == 0)
 				return (-1);
 #else
 			HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len);
 #endif
 		} else {
 			opt_HMAC_SHA512_Update((HMAC_SHA512_Context *)(mctx->mac_ctx), data, len);
 		}
 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		// Keccak takes data length in bits so we have to scale
@ -503,12 +541,16 @@ hmac_final(mac_ctx_t *mctx, uchar_t *hash, unsigned int *len)
 		if (cksum_provider == PROVIDER_OPENSSL) {
 			HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len);
 		} else {
-			opt_HMAC_SHA256_Final((HMAC_SHA256_Context *)(mctx->mac_ctx), hash);
+			opt_HMAC_SHA512t256_Final((HMAC_SHA512_Context *)(mctx->mac_ctx), hash);
 			*len = 32;
 		}
 	} else if (cksum == CKSUM_SHA512) {
 		if (cksum_provider == PROVIDER_OPENSSL) {
 			HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len);
-
+		} else {
 			opt_HMAC_SHA512_Final((HMAC_SHA512_Context *)(mctx->mac_ctx), hash);
 			*len = 64;
 		}
 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		if (Keccak_Final((hashState *)(mctx->mac_ctx), hash) != 0)
 			return (-1);
@ -531,18 +573,14 @@ hmac_cleanup(mac_ctx_t *mctx)
 		memset(mctx->mac_ctx, 0, sizeof (Skein_512_Ctxt_t));
 		memset(mctx->mac_ctx_reinit, 0, sizeof (Skein_512_Ctxt_t));
-	} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) {
+	} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) {
 		if (cksum_provider == PROVIDER_OPENSSL) {
 			HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx));
 			HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx_reinit));
 		} else {
-			memset(mctx->mac_ctx, 0, sizeof (HMAC_SHA256_Context));
+			memset(mctx->mac_ctx, 0, sizeof (HMAC_SHA512_Context));
-			memset(mctx->mac_ctx_reinit, 0, sizeof (HMAC_SHA256_Context));
+			memset(mctx->mac_ctx_reinit, 0, sizeof (HMAC_SHA512_Context));
 		}
 	} else if (cksum == CKSUM_SHA512) {
 		HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx));
 		HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx_reinit));
 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		memset(mctx->mac_ctx, 0, sizeof (hashState));
 		memset(mctx->mac_ctx_reinit, 0, sizeof (hashState));
--- a/crypto/crypto_utils.h
+++ b/crypto/crypto_utils.h
@ -33,7 +33,7 @@ extern "C" {
 #endif
 #define	MAX_PW_LEN	16
-#define	CKSUM_MASK		0x800
+#define	CKSUM_MASK		0x700
 #define	CKSUM_MAX_BYTES		64
 #define	DEFAULT_CKSUM		"SKEIN256"
--- a/crypto/sha2/_hmac.c
+++ b/crypto/sha2/_hmac.c
@ -0,0 +1,84 @@
 /*-
 * Copyright (c) 2010, 2011 Allan Saddi <allan@saddi.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
 /*
  * HMAC_INIT: key an HMAC context.
  *
  * ctxt   - context to initialise; its inner and outer hash states are
  *          reset and primed with the ipad / opad key blocks.
  * key    - key bytes; may be NULL only when keyLen is 0.
  * keyLen - key length in bytes. A key longer than HASH_BLOCK_SIZE is
  *          replaced by its digest; a shorter key is zero-padded to
  *          HASH_BLOCK_SIZE.
  *
  * Every key-derived temporary is wiped before returning.
  */
 void
 HMAC_INIT(HMAC_CONTEXT *ctxt, const void *key, size_t keyLen)
 {
   /*
    * Wipe through a volatile function pointer: a plain memset() on locals
    * that are never read again is a dead store the optimiser is allowed
    * to delete, which would leave key material on the stack. The pointer
    * is local (not a file-scope helper) so this template can be included
    * more than once in the same translation unit.
    */
   void *(*volatile wipe)(void *, int, size_t) = memset;
   HASH_CONTEXT keyCtxt;
   unsigned int i;
   uint8_t pkey[HASH_BLOCK_SIZE], okey[HASH_BLOCK_SIZE], ikey[HASH_BLOCK_SIZE];
   /* Ensure key is zero-padded */
   memset(pkey, 0, sizeof(pkey));
   if (keyLen > sizeof(pkey)) {
     /* Hash key if > HASH_BLOCK_SIZE */
     HASH_INIT(&keyCtxt);
     HASH_UPDATE(&keyCtxt, key, keyLen);
     HASH_FINAL(&keyCtxt, pkey);
   }
   else if (keyLen > 0) {
     /* Skip the copy for an empty key: memcpy() from NULL is undefined
      * even when the length is 0. */
     memcpy(pkey, key, keyLen);
   }
   /* XOR with opad, ipad */
   for (i = 0; i < sizeof(okey); i++) {
     okey[i] = pkey[i] ^ 0x5c;
   }
   for (i = 0; i < sizeof(ikey); i++) {
     ikey[i] = pkey[i] ^ 0x36;
   }
   /* Initialize hash contexts */
   HASH_INIT(&ctxt->outer);
   HASH_UPDATE(&ctxt->outer, okey, sizeof(okey));
   HASH_INIT(&ctxt->inner);
   HASH_UPDATE(&ctxt->inner, ikey, sizeof(ikey));
   /* Burn the stack */
   wipe(ikey, 0, sizeof(ikey));
   wipe(okey, 0, sizeof(okey));
   wipe(pkey, 0, sizeof(pkey));
   wipe(&keyCtxt, 0, sizeof(keyCtxt));
 }
 /*
  * HMAC_UPDATE: feed message bytes into the MAC computation.
  *
  * ctxt - context to update.
  * data - message bytes to absorb.
  * len  - number of bytes at data.
  *
  * Only the inner hash state is advanced here.
  */
 void
 HMAC_UPDATE(HMAC_CONTEXT *ctxt, const void *data, size_t len)
 {
  HASH_UPDATE(&ctxt->inner, data, len);
 }
 /*
  * HMAC_FINAL: finish the MAC and write it out.
  *
  * ctxt - context holding the keyed inner and outer hash states.
  * hmac - receives the HASH_SIZE-byte MAC.
  *
  * The inner digest is fed into ctxt->outer, so the context no longer holds
  * the key-only state afterwards; re-key it or restore it from a saved copy
  * before computing another MAC.
  */
 void
 HMAC_FINAL(HMAC_CONTEXT *ctxt, uint8_t hmac[HASH_SIZE])
 {
   /*
    * Wipe through a volatile function pointer so the compiler cannot drop
    * the clearing of ihash as a dead store.
    */
   void *(*volatile wipe)(void *, int, size_t) = memset;
   uint8_t ihash[HASH_SIZE];
   HASH_FINAL(&ctxt->inner, ihash);
   HASH_UPDATE(&ctxt->outer, ihash, sizeof(ihash));
   HASH_FINAL(&ctxt->outer, hmac);
   wipe(ihash, 0, sizeof(ihash));
 }
--- a/crypto/sha2/intel/sha256_avx1.asm
+++ b/crypto/sha2/intel/sha256_avx1.asm
@ -1,577 +0,0 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright 2012 Intel Corporation All Rights Reserved.
 ; 
 ; The source code contained or described herein and all documents
 ; related to the source code ("Material") are owned by Intel Corporation
 ; or its suppliers or licensors. Title to the Material remains with
 ; Intel Corporation or its suppliers and licensors. The Material may
 ; contain trade secrets and proprietary and confidential information of
 ; Intel Corporation and its suppliers and licensors, and is protected by
 ; worldwide copyright and trade secret laws and treaty provisions. No
 ; part of the Material may be used, copied, reproduced, modified,
 ; published, uploaded, posted, transmitted, distributed, or disclosed in
 ; any way without Intel's prior express written permission.
 ; 
 ; No license under any patent, copyright, trade secret or other
 ; intellectual property right is granted to or conferred upon you by
 ; disclosure or delivery of the Materials, either expressly, by
 ; implication, inducement, estoppel or otherwise. Any license under such
 ; intellectual property rights must be express and approved by Intel in
 ; writing.
 ; 
 ; Unless otherwise agreed by Intel in writing, you may not remove or
 ; alter this notice or any other notice embedded in Materials by Intel
 ; or Intel's suppliers or licensors in any way.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; Example YASM command lines:
 ; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
 ; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
 ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; This code is described in an Intel White-Paper:
 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
 ;
 ; To find it, surf to http://www.intel.com/p/en_US/embedded 
 ; and search for that title.
 ; The paper is expected to be released roughly at the end of April, 2012
 ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This code schedules 1 block at a time, with 4 lanes per block
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 %define	VMOVDQ vmovdqu ;; assume buffers not aligned 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
 ; addm [mem], reg
 ; Add the memory operand into reg, then store the sum back to memory.
 ; Net effect: [mem] += reg, and reg is left holding the updated value.
 %macro addm 2
 	add	%2, %1		; reg = reg + [mem]
 	mov	%1, %2		; [mem] = reg
 %endm
 ; MY_ROR reg, n
 ; Rotate reg right by n bits. Implemented as SHLD of reg with itself by
 ; (32 - n), i.e. a rotate left by (32 - n); the count assumes a 32-bit
 ; register operand.
 %macro MY_ROR 2
 	shld	%1,%1,(32-(%2))
 %endm
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
 ; Load xmm from mem with VMOVDQ, then shuffle its bytes with
 ; byte_flip_mask so that each dword ends up byte-swapped.
 %macro COPY_XMM_AND_BSWAP 3
 	VMOVDQ %1, %2		; load 16 bytes
 	vpshufb %1, %1, %3	; reorder bytes according to the mask
 %endmacro
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Symbolic names for the registers and stack-frame slots used below.
 ; X0..X3: the four vectors of message-schedule dwords currently in flight.
 %define X0 xmm4
 %define X1 xmm5
 %define X2 xmm6
 %define X3 xmm7
 ; Vector scratch registers.
 %define XTMP0 xmm0
 %define XTMP1 xmm1
 %define XTMP2 xmm2
 %define XTMP3 xmm3
 %define XTMP4 xmm8
 %define XFER  xmm9
 %define XTMP5 xmm11
 ; Registers holding the shuffle constants loaded from the data section.
 %define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
 %define SHUF_DC00	xmm12 ; shuffle xDxC -> DC00
 %define BYTE_FLIP_MASK	xmm13
 ; Argument registers differ between the two supported calling conventions.
 ; In both variants SRND shares a register with INP, and e is the low half
 ; of the register that carries NUM_BLKS on entry.
 %ifdef LINUX
 %define NUM_BLKS rdx	; 3rd arg
 %define CTX	rsi	; 2nd arg
 %define INP	rdi	; 1st arg
 %define SRND	rdi	; clobbers INP
 %define c	ecx
 %define d 	r8d
 %define e 	edx
 %else
 %define NUM_BLKS r8	; 3rd arg
 %define CTX	rdx 	; 2nd arg
 %define INP	rcx 	; 1st arg
 %define SRND	rcx	; clobbers INP
 %define c 	edi 
 %define d	esi 
 %define e 	r8d
 %endif
 ; Pointer into the round-constant table.
 %define TBL	rbp
 ; Remaining working variables a..h (c, d, e are defined per-ABI above).
 %define a eax
 %define b ebx
 %define f r9d
 %define g r10d
 %define h r11d
 ; Scalar scratch registers for the round computation.
 %define y0 r13d
 %define y1 r14d
 %define y2 r15d
 ; Sizes of the individual stack-frame slots.
 _INP_END_SIZE	equ 8
 _INP_SIZE	equ 8
 _XFER_SIZE	equ 8
 %ifdef LINUX
 _XMM_SAVE_SIZE	equ 0
 %else
 _XMM_SAVE_SIZE	equ 8*16	; room for the eight xmm registers saved in sha256_avx
 %endif
 ; STACK_SIZE plus pushes must be an odd multiple of 8
 _ALIGN_SIZE	equ 8
 ; Offsets of the slots relative to rsp after the frame is allocated.
 _INP_END	equ 0
 _INP		equ _INP_END  + _INP_END_SIZE
 _XFER		equ _INP      + _INP_SIZE
 _XMM_SAVE	equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
 STACK_SIZE	equ _XMM_SAVE + _XMM_SAVE_SIZE
 ; rotate_Xs
 ; Rotate values of symbols X0...X3
 ; Assembly-time renaming only (no instructions are emitted): afterwards
 ; X0 names the old X1, X1 the old X2, X2 the old X3, and X3 the old X0.
 %macro rotate_Xs 0
 %xdefine X_ X0
 %xdefine X0 X1
 %xdefine X1 X2
 %xdefine X2 X3
 %xdefine X3 X_
 %endm
 ; ROTATE_ARGS
 ; Rotate values of symbols a...h
 ; Assembly-time renaming only (no instructions are emitted): afterwards
 ; b names the old a, c the old b, ... h the old g, and a names the old h,
 ; so the working variables advance one position without register moves.
 %macro ROTATE_ARGS 0
 %xdefine TMP_ h
 %xdefine h g
 %xdefine g f
 %xdefine f e
 %xdefine e d
 %xdefine d c
 %xdefine c b
 %xdefine b a
 %xdefine a TMP_
 %endm
 ; FOUR_ROUNDS_AND_SCHED
 ; Performs four rounds on the working variables a..h, reading the four
 ; (k + w) dwords from [rsp + _XFER], while the interleaved vector
 ; instructions (extra indent) compute the next four schedule words from
 ; X0..X3. The new words replace X0 and rotate_Xs then renames X0..X3.
 ; In the scalar comments ">>" denotes a rotate right (see MY_ROR).
 %macro FOUR_ROUNDS_AND_SCHED 0
 		;; compute s0 four at a time and s1 two at a time
 		;; compute W[-16] + W[-7] 4 at a time
 		;vmovdqa	XTMP0, X3
 	mov	y0, e		; y0 = e
 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
 	mov	y1, a		; y1 = a
 		vpalignr	XTMP0, X3, X2, 4	; XTMP0 = W[-7]
 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	mov	y2, f		; y2 = f
 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 		;vmovdqa	XTMP1, X1
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	xor	y2, g		; y2 = f^g
 		vpaddd	XTMP0, XTMP0, X0	; XTMP0 = W[-7] + W[-16]
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	and	y2, e		; y2 = (f^g)&e
 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 		;; compute s0
 		vpalignr	XTMP1, X1, X0, 4	; XTMP1 = W[-15]
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 	add	y2, y0		; y2 = S1 + CH
 	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 		vpsrld	XTMP2, XTMP1, 7	; XTMP2 = W[-15] >> 7 (logical shift)
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 		vpslld	XTMP3, XTMP1, (32-7)	; XTMP3 = W[-15] << (32-7)
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 		vpor	XTMP3, XTMP3, XTMP2	; XTMP3 = W[-15] MY_ROR 7
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 ROTATE_ARGS
 	mov	y0, e		; y0 = e
 	mov	y1, a		; y1 = a
 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	mov	y2, f		; y2 = f
 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
 		vpsrld	XTMP2, XTMP1,18	; XTMP2 = W[-15] >> 18 (logical shift)
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 	xor	y2, g		; y2 = f^g
 		vpsrld	XTMP4, XTMP1, 3	; XTMP4 = W[-15] >> 3
 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	and	y2, e		; y2 = (f^g)&e
 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 		vpslld	XTMP1, XTMP1, (32-18)	; XTMP1 = W[-15] << (32-18)
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 		vpxor	XTMP3, XTMP3, XTMP1
 	add	y2, y0		; y2 = S1 + CH
 	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 		vpxor	XTMP3, XTMP3, XTMP2	; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 		vpxor	XTMP1, XTMP3, XTMP4	; XTMP1 = s0
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 		;; compute low s1
 		vpshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 		vpaddd	XTMP0, XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 ROTATE_ARGS
 		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
 	mov	y0, e		; y0 = e
 	mov	y1, a		; y1 = a
 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
 		;vmovdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
 	mov	y2, f		; y2 = f
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 		vpsrld	XTMP4, XTMP2, 10	; XTMP4 = W[-2] >> 10 {BBAA}
 	xor	y2, g		; y2 = f^g
 		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xBxA}
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	and	y2, e		; y2 = (f^g)&e
 		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xBxA}
 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 		vpxor	XTMP2, XTMP2, XTMP3
 	add	y2, y0		; y2 = S1 + CH
 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
 		vpxor	XTMP4, XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 		vpshufb	XTMP4, XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 		vpaddd	XTMP0, XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 		;; compute high s1
 		vpshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 ROTATE_ARGS
 		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
 	mov	y0, e		; y0 = e
 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
 	mov	y1, a		; y1 = a
 		;vmovdqa	XTMP5,    XTMP2	; XTMP5    = W[-2] {DDCC}
 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	mov	y2, f		; y2 = f
 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 		vpsrld	XTMP5, XTMP2,   10	; XTMP5 = W[-2] >> 10 {DDCC}
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	xor	y2, g		; y2 = f^g
 		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xDxC}
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	and	y2, e		; y2 = (f^g)&e
 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xDxC}
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 		vpxor	XTMP2, XTMP2, XTMP3
 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 	add	y2, y0		; y2 = S1 + CH
 	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
 		vpxor	XTMP5, XTMP5, XTMP2	; XTMP5 = s1 {xDxC}
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 		vpshufb	XTMP5, XTMP5, SHUF_DC00	; XTMP5 = s1 {DC00}
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 		vpaddd	X0, XTMP5, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 ROTATE_ARGS
 rotate_Xs
 %endm
 ;; DO_ROUND n
 ;; Performs one round on the working variables a..h with no message
 ;; scheduling, then renames a..h via ROTATE_ARGS.
 ;; input is [rsp + _XFER + %1 * 4]
 ;; In the comments ">>" denotes a rotate right (see MY_ROR).
 %macro DO_ROUND 1
 	mov	y0, e		; y0 = e
 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
 	mov	y1, a		; y1 = a
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
 	mov	y2, f		; y2 = f
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 	xor	y2, g		; y2 = f^g
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 	and	y2, e		; y2 = (f^g)&e
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 	add	y2, y0		; y2 = S1 + CH
 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 	ROTATE_ARGS
 %endm
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
 ;; arg 1 : pointer to input data
 ;; arg 2 : pointer to digest
 ;; arg 3 : Num blocks
 ;;
 ;; Processes num_blks consecutive 64-byte blocks starting at input_data and
 ;; accumulates the result into digest[0..7] in place. Returns immediately,
 ;; leaving digest untouched, when num_blks is 0.
 section .text
 global sha256_avx
 align 32
 sha256_avx:
 	; Save the registers this routine overwrites and restores on exit.
 	push	rbx
 %ifndef LINUX
 	push	rsi
 	push	rdi
 %endif
 	push	rbp
 	push	r13
 	push	r14
 	push	r15
 	sub	rsp,STACK_SIZE
 %ifndef LINUX
 	; Non-LINUX build: also preserve xmm6..xmm13 in the frame.
 	vmovdqa	[rsp + _XMM_SAVE + 0*16],xmm6	
 	vmovdqa	[rsp + _XMM_SAVE + 1*16],xmm7
 	vmovdqa	[rsp + _XMM_SAVE + 2*16],xmm8	
 	vmovdqa	[rsp + _XMM_SAVE + 3*16],xmm9	
 	vmovdqa	[rsp + _XMM_SAVE + 4*16],xmm10
 	vmovdqa	[rsp + _XMM_SAVE + 5*16],xmm11
 	vmovdqa	[rsp + _XMM_SAVE + 6*16],xmm12
 	vmovdqa	[rsp + _XMM_SAVE + 7*16],xmm13
 %endif
 	shl	NUM_BLKS, 6	; convert to bytes
 	jz	done_hash	; nothing to do for zero blocks
 	add	NUM_BLKS, INP	; pointer to end of data
 	; Spill the end pointer: the register behind NUM_BLKS is reused as
 	; working variable e below.
 	mov	[rsp + _INP_END], NUM_BLKS
 	;; load initial digest
 	mov	a,[4*0 + CTX]
 	mov	b,[4*1 + CTX]
 	mov	c,[4*2 + CTX]
 	mov	d,[4*3 + CTX]
 	mov	e,[4*4 + CTX]
 	mov	f,[4*5 + CTX]
 	mov	g,[4*6 + CTX]
 	mov	h,[4*7 + CTX]
 	; Load the shuffle constants once; they stay in registers for all blocks.
 	vmovdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
 	vmovdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
 	vmovdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
 loop0:
 	; Per-block loop: TBL restarts at the beginning of the constant table.
 	lea	TBL,[K256 wrt rip]
 	;; byte swap first 16 dwords
 	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
 	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
 	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
 	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
 	; Spill INP: SRND shares its register and is used as the loop counter.
 	mov	[rsp + _INP], INP
 	;; schedule 48 input dwords, by doing 3 rounds of 16 each
 	mov	SRND, 3
 align 16
 loop1:
 	; Each pass: four groups of (constants + schedule words) -> _XFER slot,
 	; followed by four rounds with scheduling; 16 rounds per pass.
 	vpaddd	XFER, X0, [TBL + 0*16]
 	vmovdqa	[rsp + _XFER], XFER
 	FOUR_ROUNDS_AND_SCHED
 	vpaddd	XFER, X0, [TBL + 1*16]
 	vmovdqa	[rsp + _XFER], XFER
 	FOUR_ROUNDS_AND_SCHED
 	vpaddd	XFER, X0, [TBL + 2*16]
 	vmovdqa	[rsp + _XFER], XFER
 	FOUR_ROUNDS_AND_SCHED
 	vpaddd	XFER, X0, [TBL + 3*16]
 	vmovdqa	[rsp + _XFER], XFER
 	add	TBL, 4*16
 	FOUR_ROUNDS_AND_SCHED
 	sub	SRND, 1
 	jne	loop1
 	; Final 16 rounds: two passes of eight rounds, no further scheduling.
 	mov	SRND, 2
 loop2:
 	vpaddd	XFER, X0, [TBL + 0*16]
 	vmovdqa	[rsp + _XFER], XFER
 	DO_ROUND	0
 	DO_ROUND	1
 	DO_ROUND	2
 	DO_ROUND	3
 	vpaddd	XFER, X1, [TBL + 1*16]
 	vmovdqa	[rsp + _XFER], XFER
 	add	TBL, 2*16
 	DO_ROUND	0
 	DO_ROUND	1
 	DO_ROUND	2
 	DO_ROUND	3
 	; Move the remaining schedule words into X0/X1 for the second pass.
 	vmovdqa	X0, X2
 	vmovdqa	X1, X3
 	sub	SRND, 1
 	jne	loop2
 	; Fold the working variables back into the digest in memory.
 	addm	[4*0 + CTX],a
 	addm	[4*1 + CTX],b
 	addm	[4*2 + CTX],c
 	addm	[4*3 + CTX],d
 	addm	[4*4 + CTX],e
 	addm	[4*5 + CTX],f
 	addm	[4*6 + CTX],g
 	addm	[4*7 + CTX],h
 	; Restore the input pointer, advance one block, loop until the end.
 	mov	INP, [rsp + _INP]
 	add	INP, 64
 	cmp	INP, [rsp + _INP_END]
 	jne	loop0
 done_hash:
 %ifndef LINUX
 	vmovdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
 	vmovdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
 	vmovdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
 	vmovdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
 	vmovdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
 	vmovdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
 	vmovdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
 	vmovdqa	xmm13,[rsp + _XMM_SAVE + 7*16]
 %endif
 	; Release the frame and restore registers in reverse push order.
 	add	rsp, STACK_SIZE
 	pop	r15
 	pop	r14
 	pop	r13
 	pop	rbp
 %ifndef LINUX
 	pop	rdi
 	pop	rsi
 %endif
 	pop	rbx
 	ret	
 section .data
 align 64
 ; K256: table of 64 dword constants, one per round. sha256_avx walks it
 ; through TBL and adds four entries at a time to the schedule words.
 K256:
 	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 ; vpshufb mask that reverses the byte order within each dword
 PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
 ; shuffle xBxA -> 00BA
 _SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
 ; shuffle xDxC -> DC00
 _SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
--- a/crypto/sha2/intel/sha256_sse4.asm
+++ b/crypto/sha2/intel/sha256_sse4.asm
@ -1,535 +0,0 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright 2012 Intel Corporation All Rights Reserved.
 ; 
 ; The source code contained or described herein and all documents
 ; related to the source code ("Material") are owned by Intel Corporation
 ; or its suppliers or licensors. Title to the Material remains with
 ; Intel Corporation or its suppliers and licensors. The Material may
 ; contain trade secrets and proprietary and confidential information of
 ; Intel Corporation and its suppliers and licensors, and is protected by
 ; worldwide copyright and trade secret laws and treaty provisions. No
 ; part of the Material may be used, copied, reproduced, modified,
 ; published, uploaded, posted, transmitted, distributed, or disclosed in
 ; any way without Intel's prior express written permission.
 ; 
 ; No license under any patent, copyright, trade secret or other
 ; intellectual property right is granted to or conferred upon you by
 ; disclosure or delivery of the Materials, either expressly, by
 ; implication, inducement, estoppel or otherwise. Any license under such
 ; intellectual property rights must be express and approved by Intel in
 ; writing.
 ; 
 ; Unless otherwise agreed by Intel in writing, you may not remove or
 ; alter this notice or any other notice embedded in Materials by Intel
 ; or Intel's suppliers or licensors in any way.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; Example YASM command lines:
 ; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
 ; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
 ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; This code is described in an Intel White-Paper:
 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
 ;
 ; To find it, surf to http://www.intel.com/p/en_US/embedded 
 ; and search for that title.
 ; The paper is expected to be released roughly at the end of April, 2012
 ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This code schedules 1 block at a time, with 4 lanes per block
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; MOVDQ is the load used for message data; it is the unaligned form so the
 ; caller's input buffer needs no particular alignment.
 %define	MOVDQ movdqu ;; assume buffers not aligned 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
 ; addm [mem], reg
 ; Add reg to mem using reg-mem add and store
 ;   %1 = memory operand, %2 = register.
 ;   The sum is formed in the register first and then written back, so on
 ;   exit BOTH the register and the memory location hold (old mem + old reg).
 %macro addm 2
 	add	%2, %1
 	mov	%1, %2
 %endm
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
 ; Load xmm with mem and byte swap each dword
 ;   %1 = destination xmm, %2 = source memory operand (loaded with MOVDQ),
 ;   %3 = xmm register holding the pshufb selector that performs the swap.
 %macro COPY_XMM_AND_BSWAP 3
 	MOVDQ %1, %2
 	pshufb %1, %3
 %endmacro
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; ---- Symbolic register names -------------------------------------------
 ; X0..X3 hold the 16 most recent message-schedule dwords, four per register.
 ; These names are re-bound by rotate_Xs, so the physical register behind a
 ; name changes as the schedule advances.
 %define X0 xmm4
 %define X1 xmm5
 %define X2 xmm6
 %define X3 xmm7
 ; Scratch vector registers for the message schedule.
 %define XTMP0 xmm0
 %define XTMP1 xmm1
 %define XTMP2 xmm2
 %define XTMP3 xmm3
 %define XTMP4 xmm8
 ; XFER carries K[t..t+3] + W[t..t+3] on its way to the stack slot _XFER.
 %define XFER  xmm9
 %define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
 %define SHUF_DC00	xmm11 ; shuffle xDxC -> DC00
 %define BYTE_FLIP_MASK	xmm12
 ; Argument registers differ between the two calling conventions; LINUX
 ; selects the System V mapping, anything else the Windows x64 mapping.
 ; SRND (round-group counter) deliberately reuses the INP register, which is
 ; why INP is spilled to [rsp + _INP] before SRND is written.
 ; Note that on LINUX 'e' is edx, the low half of NUM_BLKS (rdx): NUM_BLKS
 ; must be consumed before the digest word e is loaded.
 %ifdef LINUX
 %define NUM_BLKS rdx	; 3rd arg
 %define CTX	rsi	; 2nd arg
 %define INP	rdi	; 1st arg
 %define SRND	rdi	; clobbers INP
 %define c	ecx
 %define d 	r8d
 %define e 	edx
 %else
 %define NUM_BLKS r8	; 3rd arg
 %define CTX	rdx 	; 2nd arg
 %define INP	rcx 	; 1st arg
 %define SRND	rcx	; clobbers INP
 %define c 	edi 
 %define d	esi 
 %define e 	r8d
 %endif
 ; TBL walks the K256 constant table.
 %define TBL	rbp
 ; Working variables a..h (32-bit). These names are re-bound by ROTATE_ARGS
 ; after every round instead of moving data between registers.
 %define a eax
 %define b ebx
 %define f r9d
 %define g r10d
 %define h r11d
 ; Per-round integer temporaries.
 %define y0 r13d
 %define y1 r14d
 %define y2 r15d
 ; ---- Stack frame layout (offsets from rsp after "sub rsp, STACK_SIZE") ---
 _INP_END_SIZE	equ 8
 _INP_SIZE	equ 8
 ; NOTE: _XFER_SIZE is 8, but the code stores a full 16-byte xmm register at
 ; _XFER. The missing 8 bytes are supplied by _ALIGN_SIZE, which is added
 ; into _XMM_SAVE below, so the slot really spans _XFER .. _XFER+15.
 _XFER_SIZE	equ 8
 %ifdef LINUX
 _XMM_SAVE_SIZE	equ 0
 %else
 ; room for the 7 registers xmm6..xmm12 saved by the non-LINUX prologue
 _XMM_SAVE_SIZE	equ 7*16
 %endif
 ; STACK_SIZE plus pushes must be an odd multiple of 8
 ; (the return address supplies the remaining 8 bytes, so rsp ends up
 ; 16-byte aligned for the movdqa accesses to the frame)
 _ALIGN_SIZE	equ 8
 ; Resulting offsets: _INP_END = 0, _INP = 8, _XFER = 16, _XMM_SAVE = 32.
 _INP_END	equ 0
 _INP		equ _INP_END  + _INP_END_SIZE
 _XFER		equ _INP      + _INP_SIZE
 _XMM_SAVE	equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
 STACK_SIZE	equ _XMM_SAVE + _XMM_SAVE_SIZE
 ; rotate_Xs
 ; Rotate values of symbols X0...X3
 ; Assembly-time renaming only (no instructions are emitted): the old X1
 ; becomes X0, X2 becomes X1, X3 becomes X2 and the old X0 becomes X3.
 ; X_ is a scratch symbol used to carry the old X0 across the rebinding.
 %macro rotate_Xs 0
 %xdefine X_ X0
 %xdefine X0 X1
 %xdefine X1 X2
 %xdefine X2 X3
 %xdefine X3 X_
 %endm
 ; ROTATE_ARGS
 ; Rotate values of symbols a...h
 ; Assembly-time renaming only (no instructions are emitted): each name takes
 ; over the register of its predecessor (b <- a, c <- b, ... h <- g) and
 ; 'a' takes over the register that was 'h'. A round writes its new value
 ; into the register currently called h, so after this rotation that value
 ; is visible under the name a. TMP_ is a scratch symbol.
 %macro ROTATE_ARGS 0
 %xdefine TMP_ h
 %xdefine h g
 %xdefine g f
 %xdefine f e
 %xdefine e d
 %xdefine d c
 %xdefine c b
 %xdefine b a
 %xdefine a TMP_
 %endm
 ; FOUR_ROUNDS_AND_SCHED
 ; Performs four SHA-256 rounds and, interleaved with them, computes the next
 ; four message-schedule dwords.
 ;   In:  X0..X3  = the 16 most recent schedule dwords (X0 oldest)
 ;        [rsp + _XFER + 0..15] = K+W for the four rounds executed here
 ;        a..h    = working variables
 ;   Out: the register named X0 on entry holds the four new schedule dwords;
 ;        rotate_Xs then renames it to X3. a..h are advanced by four rounds
 ;        (via ROTATE_ARGS renaming).
 ;   Clobbers XTMP0..XTMP4, y0, y1, y2.
 ; Integer (round) instructions are indented by one tab, vector (schedule)
 ; instructions by two tabs. In the comments ">>" on a scalar line denotes a
 ; rotate right (the instruction is ror); the rotations are applied
 ; cumulatively so that three ror/xor pairs build S0/S1.
 %macro FOUR_ROUNDS_AND_SCHED 0
 		;; compute s0 four at a time and s1 two at a time
 		;; compute W[-16] + W[-7] 4 at a time
 		movdqa	XTMP0, X3
 	mov	y0, e		; y0 = e
 	ror	y0, (25-11)	; y0 = e >> (25-11)
 	mov	y1, a		; y1 = a
 		palignr	XTMP0, X2, 4	; XTMP0 = W[-7]
 	ror	y1, (22-13)	; y1 = a >> (22-13)
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	mov	y2, f		; y2 = f
 	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 		movdqa	XTMP1, X1
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	xor	y2, g		; y2 = f^g
 		paddd	XTMP0, X0	; XTMP0 = W[-7] + W[-16]
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	and	y2, e		; y2 = (f^g)&e
 	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 		;; compute s0
 		palignr	XTMP1, X0, 4	; XTMP1 = W[-15]
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 		movdqa	XTMP2, XTMP1	; XTMP2 = W[-15]
 	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 	add	y2, y0		; y2 = S1 + CH
 	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
 		movdqa	XTMP3, XTMP1	; XTMP3 = W[-15]
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 		pslld	XTMP1, (32-7)
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 		psrld	XTMP2, 7
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 		por	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 ROTATE_ARGS
 		movdqa	XTMP2, XTMP3	; XTMP2 = W[-15]
 	mov	y0, e		; y0 = e
 	mov	y1, a		; y1 = a
 		movdqa	XTMP4, XTMP3	; XTMP4 = W[-15]
 	ror	y0, (25-11)	; y0 = e >> (25-11)
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	mov	y2, f		; y2 = f
 	ror	y1, (22-13)	; y1 = a >> (22-13)
 		pslld	XTMP3, (32-18)
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 	xor	y2, g		; y2 = f^g
 		psrld	XTMP2, 18
 	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	and	y2, e		; y2 = (f^g)&e
 	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 		pxor	XTMP1, XTMP3
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 		psrld	XTMP4, 3	; XTMP4 = W[-15] >> 3
 	add	y2, y0		; y2 = S1 + CH
 	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
 	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 		pxor	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 		pxor	XTMP1, XTMP4	; XTMP1 = s0
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 		;; compute low s1
 		pshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 		paddd	XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 ROTATE_ARGS
 		movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
 	mov	y0, e		; y0 = e
 	mov	y1, a		; y1 = a
 	ror	y0, (25-11)	; y0 = e >> (25-11)
 		movdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	ror	y1, (22-13)	; y1 = a >> (22-13)
 	mov	y2, f		; y2 = f
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 		psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xBxA}
 	xor	y2, g		; y2 = f^g
 		psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xBxA}
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	and	y2, e		; y2 = (f^g)&e
 		psrld	XTMP4, 10	; XTMP4 = W[-2] >> 10 {BBAA}
 	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 		pxor	XTMP2, XTMP3
 	add	y2, y0		; y2 = S1 + CH
 	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
 		pxor	XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 		pshufb	XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 		paddd	XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 		;; compute high s1
 		pshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 ROTATE_ARGS
 		movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
 	mov	y0, e		; y0 = e
 	ror	y0, (25-11)	; y0 = e >> (25-11)
 	mov	y1, a		; y1 = a
 		movdqa	X0,    XTMP2	; X0    = W[-2] {DDCC}
 	ror	y1, (22-13)	; y1 = a >> (22-13)
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	mov	y2, f		; y2 = f
 	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 		psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xDxC}
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	xor	y2, g		; y2 = f^g
 		psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xDxC}
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	and	y2, e		; y2 = (f^g)&e
 	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 		psrld	X0,    10	; X0 = W[-2] >> 10 {DDCC}
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 		pxor	XTMP2, XTMP3
 	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 	add	y2, y0		; y2 = S1 + CH
 	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
 		pxor	X0, XTMP2	; X0 = s1 {xDxC}
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 		pshufb	X0, SHUF_DC00	; X0 = s1 {DC00}
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 		paddd	X0, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 ROTATE_ARGS
 rotate_Xs
 %endm
 ;; DO_ROUND n
 ;; One SHA-256 round with no message scheduling.
 ;; input is [rsp + _XFER + %1 * 4]
 ;;   %1 selects which of the four K+W dwords in the _XFER slot to consume.
 ;;   Updates d and h in place, then ROTATE_ARGS renames a..h for the next
 ;;   round. Clobbers y0, y1, y2.
 ;;   ">>" in the comments denotes rotate right (the instruction is ror).
 %macro DO_ROUND 1
 	mov	y0, e		; y0 = e
 	ror	y0, (25-11)	; y0 = e >> (25-11)
 	mov	y1, a		; y1 = a
 	xor	y0, e		; y0 = e ^ (e >> (25-11))
 	ror	y1, (22-13)	; y1 = a >> (22-13)
 	mov	y2, f		; y2 = f
 	xor	y1, a		; y1 = a ^ (a >> (22-13))
 	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
 	xor	y2, g		; y2 = f^g
 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
 	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
 	and	y2, e		; y2 = (f^g)&e
 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
 	add	y2, y0		; y2 = S1 + CH
 	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
 	mov	y0, a		; y0 = a
 	add	h, y2		; h = h + S1 + CH + k + w
 	mov	y2, a		; y2 = a
 	or	y0, c		; y0 = a|c
 	add	d, h		; d = d + h + S1 + CH + k + w
 	and	y2, c		; y2 = a&c
 	and	y0, b		; y0 = (a|c)&b
 	add	h, y1		; h = h + S1 + CH + k + w + S0
 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
 	ROTATE_ARGS
 %endm
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
 ;; arg 1 : pointer to input data
 ;; arg 2 : pointer to digest
 ;; arg 3 : Num blocks
 ;;
 ;; Folds num_blks consecutive 64-byte blocks starting at input_data into
 ;; the eight 32-bit words at digest (read at entry, updated after every
 ;; block). Returns immediately, leaving digest untouched, when num_blks
 ;; is 0. The input is read with unaligned loads and byte-swapped per dword.
 ;;
 ;; Per block: loop1 runs 3 times x 16 rounds with message scheduling
 ;; (rounds 0..47), loop2 runs 2 times x 8 rounds without scheduling
 ;; (rounds 48..63).
 section .text
 global sha256_sse4
 align 32
 sha256_sse4:
 	; save the callee-saved GPRs this routine uses
 	push	rbx
 %ifndef LINUX
 	push	rsi
 	push	rdi
 %endif
 	push	rbp
 	push	r13
 	push	r14
 	push	r15
 	sub	rsp,STACK_SIZE
 %ifndef LINUX
 	; non-LINUX build: preserve xmm6..xmm12, all of which are used below
 	movdqa	[rsp + _XMM_SAVE + 0*16],xmm6	
 	movdqa	[rsp + _XMM_SAVE + 1*16],xmm7
 	movdqa	[rsp + _XMM_SAVE + 2*16],xmm8	
 	movdqa	[rsp + _XMM_SAVE + 3*16],xmm9	
 	movdqa	[rsp + _XMM_SAVE + 4*16],xmm10
 	movdqa	[rsp + _XMM_SAVE + 5*16],xmm11
 	movdqa	[rsp + _XMM_SAVE + 6*16],xmm12
 %endif
 	shl	NUM_BLKS, 6	; convert to bytes
 	jz	done_hash
 	add	NUM_BLKS, INP	; pointer to end of data
 	; spill the end pointer now: on LINUX the register behind NUM_BLKS is
 	; also the register behind digest word 'e', which is loaded next
 	mov	[rsp + _INP_END], NUM_BLKS
 	;; load initial digest
 	mov	a,[4*0 + CTX]
 	mov	b,[4*1 + CTX]
 	mov	c,[4*2 + CTX]
 	mov	d,[4*3 + CTX]
 	mov	e,[4*4 + CTX]
 	mov	f,[4*5 + CTX]
 	mov	g,[4*6 + CTX]
 	mov	h,[4*7 + CTX]
 	; shuffle masks stay resident in registers for the whole call
 	movdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
 	movdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
 	movdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
 	; ---- one iteration of loop0 per 64-byte block ----
 loop0:
 	lea	TBL,[K256 wrt rip]
 	;; byte swap first 16 dwords
 	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
 	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
 	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
 	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
 	; INP shares its register with SRND, so park it on the stack first
 	mov	[rsp + _INP], INP
 	;; schedule 48 input dwords, by doing 3 rounds of 16 each
 	mov	SRND, 3
 align 16
 loop1:
 	; each group: XFER = K[t..t+3] + W[t..t+3], stored for the scalar code,
 	; then four rounds plus scheduling of the next four W
 	movdqa	XFER, [TBL + 0*16]
 	paddd	XFER, X0
 	movdqa	[rsp + _XFER], XFER
 	FOUR_ROUNDS_AND_SCHED
 	movdqa	XFER, [TBL + 1*16]
 	paddd	XFER, X0
 	movdqa	[rsp + _XFER], XFER
 	FOUR_ROUNDS_AND_SCHED
 	movdqa	XFER, [TBL + 2*16]
 	paddd	XFER, X0
 	movdqa	[rsp + _XFER], XFER
 	FOUR_ROUNDS_AND_SCHED
 	movdqa	XFER, [TBL + 3*16]
 	paddd	XFER, X0
 	movdqa	[rsp + _XFER], XFER
 	add	TBL, 4*16
 	FOUR_ROUNDS_AND_SCHED
 	sub	SRND, 1
 	jne	loop1
 	; last 16 rounds: the schedule is complete, only K+W and rounds remain
 	mov	SRND, 2
 loop2:
 	paddd	X0, [TBL + 0*16]
 	movdqa	[rsp + _XFER], X0
 	DO_ROUND	0
 	DO_ROUND	1
 	DO_ROUND	2
 	DO_ROUND	3
 	paddd	X1, [TBL + 1*16]
 	movdqa	[rsp + _XFER], X1
 	add	TBL, 2*16
 	DO_ROUND	0
 	DO_ROUND	1
 	DO_ROUND	2
 	DO_ROUND	3
 	; bring the remaining eight schedule dwords into X0/X1 for pass two
 	movdqa	X0, X2
 	movdqa	X1, X3
 	sub	SRND, 1
 	jne	loop2
 	; fold the working variables back into the digest; addm leaves the
 	; sums in a..h as well, so they are already loaded for the next block
 	addm	[4*0 + CTX],a
 	addm	[4*1 + CTX],b
 	addm	[4*2 + CTX],c
 	addm	[4*3 + CTX],d
 	addm	[4*4 + CTX],e
 	addm	[4*5 + CTX],f
 	addm	[4*6 + CTX],g
 	addm	[4*7 + CTX],h
 	; recover the input pointer (its register was reused as SRND)
 	mov	INP, [rsp + _INP]
 	add	INP, 64
 	cmp	INP, [rsp + _INP_END]
 	jne	loop0
 done_hash:
 %ifndef LINUX
 	movdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
 	movdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
 	movdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
 	movdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
 	movdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
 	movdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
 	movdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
 %endif
 	add	rsp, STACK_SIZE
 	; pops mirror the pushes in the prologue, in reverse order
 	pop	r15
 	pop	r14
 	pop	r13
 	pop	rbp
 %ifndef LINUX
 	pop	rdi
 	pop	rsi
 %endif
 	pop	rbx
 	ret	
 ; Constant tables for sha256_sse4.
 section .data
 align 64
 ; K256: 64 dwords (16 rows of 4). The values are the SHA-256 round
 ; constants K[0..63]; each 16-byte row is loaded as one xmm and added to
 ; four schedule dwords at a time.
 K256:
 	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 ; pshufb selector that reverses the byte order inside each of the 4 dwords
 ; (result byte i takes source byte 3 - i%4 of the same dword)
 PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
 ; shuffle xBxA -> 00BA
 ; (source dwords 0 and 2 are packed into the low qword; selector bytes of
 ; 0xFF have the top bit set, which makes pshufb write zero)
 _SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
 ; shuffle xDxC -> DC00
 ; (same selection, placed in the high qword with the low qword zeroed)
 _SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
--- a/crypto/sha2/intel/sha512_avx.asm
+++ b/crypto/sha2/intel/sha512_avx.asm
@ -0,0 +1,409 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright 2012 Intel Corporation All Rights Reserved.
 ; 
 ; The source code contained or described herein and all documents
 ; related to the source code ("Material") are owned by Intel Corporation
 ; or its suppliers or licensors. Title to the Material remains with
 ; Intel Corporation or its suppliers and licensors. The Material may
 ; contain trade secrets and proprietary and confidential information of
 ; Intel Corporation and its suppliers and licensors, and is protected by
 ; worldwide copyright and trade secret laws and treaty provisions. No
 ; part of the Material may be used, copied, reproduced, modified,
 ; published, uploaded, posted, transmitted, distributed, or disclosed in
 ; any way without Intel's prior express written permission.
 ; 
 ; No license under any patent, copyright, trade secret or other
 ; intellectual property right is granted to or conferred upon you by
 ; disclosure or delivery of the Materials, either expressly, by
 ; implication, inducement, estoppel or otherwise. Any license under such
 ; intellectual property rights must be express and approved by Intel in
 ; writing.
 ; 
 ; Unless otherwise agreed by Intel in writing, you may not remove or
 ; alter this notice or any other notice embedded in Materials by Intel
 ; or Intel's suppliers or licensors in any way.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; Example YASM command lines:
 ; Windows:  yasm -f x64 -D WINABI sha512_avx.asm
 ; Linux:    yasm -f elf64 sha512_avx.asm
 ;
 BITS 64
 section .text
 ; Virtual Registers
 ; Argument registers and the two round temporaries T1/T2 depend on the
 ; calling convention: define WINABI for the Windows x64 mapping, otherwise
 ; the System V mapping is used.
 %ifdef WINABI
 	%define msg	rcx ; ARG1
 	%define digest	rdx ; ARG2
 	%define msglen	r8  ; ARG3
 	%define T1	rsi
 	%define T2	rdi
 %else
 	%define msg	rdi ; ARG1
 	%define digest	rsi ; ARG2
 	%define msglen	rdx ; ARG3
 	%define T1	rcx
 	%define T2	r8
 %endif
 ; Working variables a..h (64-bit). RotateState re-binds these names after
 ; every round, so the physical register behind each name changes.
 %define a_64	r9
 %define b_64	r10
 %define c_64	r11
 %define d_64	r12
 %define e_64	r13
 %define f_64	r14
 %define g_64	r15
 %define h_64	rbx
 ; General scratch register used inside the round computation.
 %define tmp0	rax
 ; Local variables (stack frame)
 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
 ; (the return address already on the stack supplies the other 8 bytes).
 ; With the sizes below: 640 + 16 + 40 = 696 = 87*8 bytes without WINABI,
 ; 640 + 16 + 64 + 56 = 776 = 97*8 bytes with WINABI.
 struc frame
 	.W:       resq 80 ; Message Schedule
 	.WK:      resq  2 ; W[t] + K[t] | W[t+1] + K[t+1]
 %ifdef WINABI
 	; four 16-byte slots for xmm6..xmm9, plus rsi/rdi on top of the 5 GPRs
 	.XMMSAVE: resdq 4
 	.GPRSAVE: resq  7
 %else
 	; rbx, r12, r13, r14, r15
 	.GPRSAVE: resq  5
 %endif
 endstruc
 ; Useful QWORD "arrays" for simpler memory references
 ; Each macro expands to an address expression for element i (8 bytes each);
 ; the caller supplies the surrounding [ ].
 %define MSG(i)    msg    + 8*(i) ; Input message (arg1)
 %define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
 %define K_t(i)    K512   + 8*(i) wrt rip ; SHA Constants (static mem)
 %define W_t(i)    rsp + frame.W  + 8*(i) ; Message Schedule (stack frame)
 %define WK_2(i)   rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
 ; MSG, DIGEST, K_t, W_t are arrays
 ; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
 ; (so WK_2(t) and WK_2(t-2) name the same slot)
 ; RotateState
 ; Assembly-time renaming only (no instructions are emitted). A round leaves
 ; its new 'a' value in the register currently named h_64 and its new 'e'
 ; value in d_64; re-binding the names makes those registers a_64 and e_64
 ; for the next round without moving any data.
 %macro RotateState 0
 	; Rotate symbols a..h right
 	%xdefine	%%TMP h_64
 	%xdefine	h_64 g_64
 	%xdefine	g_64 f_64
 	%xdefine	f_64 e_64
 	%xdefine	e_64 d_64
 	%xdefine	d_64 c_64
 	%xdefine	c_64 b_64
 	%xdefine	b_64 a_64
 	%xdefine	a_64 %%TMP
 %endmacro
 ; RORQ reg, n
 ; Rotate the 64-bit register %1 right by %2 bits. Implemented as a double
 ; shift left of the register with itself by (64 - n), which is the same
 ; rotation.
 %macro RORQ 2
 	; shld is faster than ror on Sandybridge
 	shld	%1, %1, (64 - %2)
 %endmacro
 ; SHA512_Round t
 ; One SHA-512 round with no message scheduling.
 ;   %1 = round index; only its parity matters here, because it is used
 ;        solely to pick one of the two W+K slots via WK_2.
 ;   Writes the new 'e' into d_64 and the new 'a' into h_64, then
 ;   RotateState renames a..h. Clobbers T1, T2, tmp0.
 ; The number after each RORQ's first ';' is the total rotation that term
 ; has accumulated once all three RORQs are applied (41/18/14 for S1,
 ; 39/34/28 for S0).
 %macro SHA512_Round 1
 %assign %%t   (%1)
 	; Compute Round %%t
 	mov	T1,   f_64        ; T1 = f
 	mov	tmp0, e_64        ; tmp = e
 	xor	T1,   g_64        ; T1 = f ^ g
 	RORQ	tmp0, 23 ; 41     ; tmp = e ror 23
 	and	T1,   e_64        ; T1 = (f ^ g) & e
 	xor	tmp0, e_64        ; tmp = (e ror 23) ^ e
 	xor	T1,   g_64        ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
 	add	T1,   [WK_2(%%t)] ; W[t] + K[t] from message scheduler
 	RORQ	tmp0, 4 ; 18      ; tmp = ((e ror 23) ^ e) ror 4
 	xor	tmp0, e_64        ; tmp = (((e ror 23) ^ e) ror 4) ^ e
 	mov	T2,   a_64        ; T2 = a
 	add	T1,   h_64        ; T1 = CH(e,f,g) + W[t] + K[t] + h
 	RORQ	tmp0, 14 ; 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
 	add	T1,   tmp0        ; T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
 	mov	tmp0, a_64        ; tmp = a
 	xor	T2,   c_64        ; T2 = a ^ c
 	and	tmp0, c_64        ; tmp = a & c
 	and	T2,   b_64        ; T2 = (a ^ c) & b
 	xor	T2,   tmp0        ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
 	mov	tmp0, a_64        ; tmp = a
 	RORQ	tmp0, 5 ; 39      ; tmp = a ror 5
 	xor	tmp0, a_64        ; tmp = (a ror 5) ^ a
 	add	d_64, T1          ; e(next_state) = d + T1 
 	RORQ	tmp0, 6 ; 34      ; tmp = ((a ror 5) ^ a) ror 6
 	xor	tmp0, a_64        ; tmp = (((a ror 5) ^ a) ror 6) ^ a
 	lea	h_64, [T1 + T2]   ; a(next_state) = T1 + Maj(a,b,c)
 	RORQ	tmp0, 28 ; 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
 	add	h_64, tmp0        ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
 	RotateState
 %endmacro
 ; SHA512_2Sched_2Round_avx t
 ;   %1 = index of the first of the two schedule qwords to produce. The W
 ;        stores use aligned vmovdqa, so %1 is expected to be even.
 ;   Reads  W[t-16], W[t-15], W[t-7], W[t-2] (pairs) from the stack frame and
 ;          the two W+K values currently in the WK slots.
 ;   Writes W[t], W[t+1] to the frame and W[t]+K[t], W[t+1]+K[t+1] to WK.
 ;   Advances a..h by two rounds. Clobbers T1, T2, tmp0, xmm0..xmm9.
 %macro SHA512_2Sched_2Round_avx 1
 %assign %%t %1
 	; Compute rounds %%t-2 and %%t-1
 	; Compute message schedule QWORDS %%t and %%t+1
 	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and 
 	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
 	; scheduler.
 	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
 	; They are then added to their respective SHA512 constants at
 	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
 	;   For brevity, the comments following vectored instructions only refer to
 	; the first of a pair of QWORDS.
 	; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
 	;   The computation of the message schedule and the rounds are tightly
 	; stitched to take advantage of instruction-level parallelism.
 	; For clarity, integer instructions (for the rounds calculation) are indented
 	; by one tab. Vectored instructions (for the message scheduler) are indented
 	; by two tabs.
 	;   WK_2(%%t) and WK_2(%%t-2) are the same slot (index taken mod 2), which
 	; is why the round code below reads WK_2(%%t) for round %%t-2.
 		vmovdqa	xmm4, [W_t(%%t-2)]   ; XMM4 = W[t-2]
 		vmovdqu	xmm5, [W_t(%%t-15)]  ; XMM5 = W[t-15]
 	mov	T1,   f_64
 		vpsrlq	xmm0, xmm4, 61       ; XMM0 = W[t-2]>>61
 	mov	tmp0, e_64
 		vpsrlq	xmm6, xmm5, 1        ; XMM6 = W[t-15]>>1
 	xor	T1,   g_64
 	RORQ	tmp0, 23 ; 41
 		vpsrlq	xmm1, xmm4, 19       ; XMM1 = W[t-2]>>19
 	and	T1,   e_64
 	xor	tmp0, e_64
 		vpxor	xmm0, xmm1           ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19
 	xor	T1,   g_64
 	add	T1,   [WK_2(%%t)];
 		vpsrlq	xmm7, xmm5, 8        ; XMM7 = W[t-15]>>8
 	RORQ	tmp0, 4 ; 18
 		vpsrlq	xmm2, xmm4, 6        ; XMM2 = W[t-2]>>6
 	xor	tmp0, e_64
 	mov	T2,   a_64
 	add	T1,   h_64
 		vpxor	xmm6, xmm7           ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8
 	RORQ	tmp0, 14 ; 14
 	add	T1,   tmp0
 		vpsrlq	xmm8, xmm5, 7        ; XMM8 = W[t-15]>>7
 	mov 	tmp0, a_64
 	xor	T2,   c_64
 		vpsllq	xmm3, xmm4, (64-61)  ; XMM3 = W[t-2]<<3
 	and	tmp0, c_64
 	and	T2,   b_64
 		vpxor	xmm2, xmm3           ; XMM2 = W[t-2]>>6 ^ W[t-2]<<3
 	xor	T2,   tmp0
 	mov	tmp0, a_64
 		vpsllq	xmm9, xmm5, (64-1)   ; XMM9 = W[t-15]<<63
 	RORQ	tmp0, 5 ; 39
 		vpxor	xmm8, xmm9           ; XMM8 = W[t-15]>>7 ^ W[t-15]<<63
 	xor	tmp0, a_64
 	add	d_64, T1
 	RORQ	tmp0, 6 ; 34
 	xor	tmp0, a_64
 		vpxor	xmm6, xmm8           ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63
 	lea	h_64, [T1 + T2]
 	RORQ 	tmp0, 28 ; 28
 		vpsllq	xmm4, (64-19)        ; XMM4 = W[t-2]<<45
 	add	h_64, tmp0
 	RotateState
 		vpxor	xmm0, xmm4           ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<45
 	mov	T1, f_64
 		vpxor	xmm0, xmm2           ; XMM0 = s1(W[t-2])
 	mov	tmp0, e_64
 	xor	T1,   g_64
 		vpaddq	xmm0, [W_t(%%t-16)]  ; XMM0 = s1(W[t-2]) + W[t-16]
 		vmovdqu	xmm1, [W_t(%%t- 7)]  ; XMM1 = W[t-7]
 	RORQ	tmp0, 23 ; 41
 	and	T1,   e_64
 	xor	tmp0, e_64
 	xor	T1,   g_64
 		vpsllq	xmm5, (64-8)         ; XMM5 = W[t-15]<<56
 	add	T1,   [WK_2(%%t+1)]
 		vpxor	xmm6, xmm5           ; XMM6 = s0(W[t-15])
 	RORQ	tmp0, 4 ; 18
 		vpaddq	xmm0, xmm6           ; XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
 	xor	tmp0, e_64
 		vpaddq	xmm0, xmm1           ; XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
 	mov	T2,   a_64
 	add	T1,   h_64
 	RORQ	tmp0, 14 ; 14
 	add	T1,   tmp0
 		; Use the macro-local %%t here, like every other reference in this
 		; macro, instead of the caller's global loop variable, so the macro
 		; depends only on its argument. Both WK reads for this invocation
 		; have already happened above, so overwriting the pair is safe.
 		vmovdqa	[W_t(%%t)], xmm0      ; Store W[t]
 		vpaddq	xmm0, [K_t(%%t)]      ; Compute W[t]+K[t]
 		vmovdqa	[WK_2(%%t)], xmm0     ; Store W[t]+K[t] for next rounds
 	mov	tmp0, a_64
 	xor	T2,   c_64
 	and	tmp0, c_64
 	and	T2,   b_64
 	xor	T2,   tmp0
 	mov	tmp0, a_64
 	RORQ	tmp0, 5 ; 39
 	xor	tmp0, a_64
 	add	d_64, T1
 	RORQ	tmp0, 6 ; 34
 	xor	tmp0, a_64
 	lea	h_64, [T1 + T2]
 	RORQ	tmp0, 28 ; 28
 	add	h_64, tmp0
 	RotateState
 %endmacro
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; void sha512_avx(const void* M, void* D, uint64_t L);
 ; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 ; The size of the message pointed to by M must be an integer multiple of SHA512
 ;   message blocks.
 ; L is the message length in SHA512 blocks
 ; D is accessed as eight 64-bit words, read and updated once per block.
 ; Each block is 16*8 = 128 bytes; M is read with unaligned loads.
 ; When L is 0 the routine returns before touching the stack or D.
 global sha512_avx:function
 sha512_avx:
 	cmp	msglen, 0
 	je	.nowork
 	; Allocate Stack Space
 	sub	rsp, frame_size
 	; Save GPRs
 	; (stored into the frame with mov rather than pushed, so rsp moves once)
 	mov	[rsp + frame.GPRSAVE + 8 * 0], rbx
 	mov	[rsp + frame.GPRSAVE + 8 * 1], r12
 	mov	[rsp + frame.GPRSAVE + 8 * 2], r13
 	mov	[rsp + frame.GPRSAVE + 8 * 3], r14
 	mov	[rsp + frame.GPRSAVE + 8 * 4], r15
 %ifdef WINABI
 	mov	[rsp + frame.GPRSAVE + 8 * 5], rsi
 	mov	[rsp + frame.GPRSAVE + 8 * 6], rdi
 %endif
 	; Save XMMs
 %ifdef WINABI
 	vmovdqa	[rsp + frame.XMMSAVE + 16 * 0], xmm6
 	vmovdqa	[rsp + frame.XMMSAVE + 16 * 1], xmm7
 	vmovdqa	[rsp + frame.XMMSAVE + 16 * 2], xmm8
 	vmovdqa	[rsp + frame.XMMSAVE + 16 * 3], xmm9
 %endif	
 .updateblock:
 	; Load state variables
 	mov	a_64, [DIGEST(0)]
 	mov	b_64, [DIGEST(1)]
 	mov	c_64, [DIGEST(2)]
 	mov	d_64, [DIGEST(3)]
 	mov	e_64, [DIGEST(4)]
 	mov	f_64, [DIGEST(5)]
 	mov	g_64, [DIGEST(6)]
 	mov	h_64, [DIGEST(7)]
 	; The %rep below fully unrolls one block at assembly time; t is the
 	; assembly-time index of the schedule pair being produced and steps
 	; through 0, 2, ..., 80.
 	%assign t 0
 	%rep 80/2 + 1
 	; (80 rounds) / (2 rounds/iteration) + (1 iteration)
 	; +1 iteration because the scheduler leads hashing by 1 iteration
 		%if t < 2
 			; BSWAP 2 QWORDS
 			; xmm1 keeps the byte-swap mask for the t < 16 iterations
 			vmovdqa	xmm1, [XMM_QWORD_BSWAP wrt rip]
 			vmovdqu	xmm0, [MSG(t)]
 			vpshufb	xmm0, xmm0, xmm1     ; BSWAP
 			vmovdqa	[W_t(t)], xmm0       ; Store Scheduled Pair
 			vpaddq	xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
 			vmovdqa	[WK_2(t)], xmm0      ; Store into WK for rounds
 		%elif t < 16
 			; BSWAP 2 QWORDS, Compute 2 Rounds
 			; The WK store is deferred until after round t-1 because both
 			; rounds still read the previous pair from the same two slots.
 			vmovdqu	xmm0, [MSG(t)]
 			vpshufb	xmm0, xmm0, xmm1     ; BSWAP
 			SHA512_Round t - 2           ; Round t-2
 			vmovdqa	[W_t(t)], xmm0       ; Store Scheduled Pair
 			vpaddq	xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
 			SHA512_Round t - 1           ; Round t-1
 			vmovdqa	[WK_2(t)], xmm0      ; W[t]+K[t] into WK
 		%elif t < 79
 			; Schedule 2 QWORDS; Compute 2 Rounds
 			SHA512_2Sched_2Round_avx t
 		%else
 			; Compute 2 Rounds
 			; (t = 80: the schedule is complete, only rounds 78 and 79 remain)
 			SHA512_Round t - 2
 			SHA512_Round t - 1
 		%endif
 	%assign t t+2
 	%endrep
 	; Update digest
 	add	[DIGEST(0)], a_64
 	add	[DIGEST(1)], b_64
 	add	[DIGEST(2)], c_64
 	add	[DIGEST(3)], d_64
 	add	[DIGEST(4)], e_64
 	add	[DIGEST(5)], f_64
 	add	[DIGEST(6)], g_64
 	add	[DIGEST(7)], h_64
 	; Advance to next message block
 	add	msg, 16*8
 	dec	msglen
 	jnz	.updateblock
 	; Restore XMMs
 %ifdef WINABI
 	vmovdqa	xmm6, [rsp + frame.XMMSAVE + 16 * 0]
 	vmovdqa	xmm7, [rsp + frame.XMMSAVE + 16 * 1]
 	vmovdqa	xmm8, [rsp + frame.XMMSAVE + 16 * 2]
 	vmovdqa	xmm9, [rsp + frame.XMMSAVE + 16 * 3]
 %endif
 	; Restore GPRs
 	mov	rbx, [rsp + frame.GPRSAVE + 8 * 0]
 	mov	r12, [rsp + frame.GPRSAVE + 8 * 1]
 	mov	r13, [rsp + frame.GPRSAVE + 8 * 2]
 	mov	r14, [rsp + frame.GPRSAVE + 8 * 3]
 	mov	r15, [rsp + frame.GPRSAVE + 8 * 4]
 %ifdef WINABI
 	mov	rsi, [rsp + frame.GPRSAVE + 8 * 5]
 	mov	rdi, [rsp + frame.GPRSAVE + 8 * 6]
 %endif
 	; Restore Stack Pointer
 	add	rsp, frame_size
 .nowork:
 	ret
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; Binary Data
 section .data
 ALIGN 16
 ; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
 ; (result byte i takes source byte 7 - i%8 of the same qword)
 XMM_QWORD_BSWAP: 
 	ddq 0x08090a0b0c0d0e0f0001020304050607
 ; K[t] used in SHA512 hashing
 ; 80 qwords, two per line, so each line is one 16-byte pair {K[t]|K[t+1]}
 ; as consumed by the vpaddq in the scheduler. The table directly follows
 ; the 16-byte mask above, which keeps every pair 16-byte aligned.
 K512:
 	dq 0x428a2f98d728ae22,0x7137449123ef65cd 
 	dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
 	dq 0x3956c25bf348b538,0x59f111f1b605d019 
 	dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
 	dq 0xd807aa98a3030242,0x12835b0145706fbe 
 	dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
 	dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 
 	dq 0x9bdc06a725c71235,0xc19bf174cf692694
 	dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 
 	dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
 	dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 
 	dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
 	dq 0x983e5152ee66dfab,0xa831c66d2db43210 
 	dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
 	dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 
 	dq 0x06ca6351e003826f,0x142929670a0e6e70
 	dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 
 	dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
 	dq 0x650a73548baf63de,0x766a0abb3c77b2a8 
 	dq 0x81c2c92e47edaee6,0x92722c851482353b
 	dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 
 	dq 0xc24b8b70d0f89791,0xc76c51a30654be30
 	dq 0xd192e819d6ef5218,0xd69906245565a910 
 	dq 0xf40e35855771202a,0x106aa07032bbd1b8
 	dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 
 	dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
 	dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 
 	dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
 	dq 0x748f82ee5defb2fc,0x78a5636f43172f60 
 	dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
 	dq 0x90befffa23631e28,0xa4506cebde82bde9 
 	dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
 	dq 0xca273eceea26619c,0xd186b8c721c0c207 
 	dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
 	dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 
 	dq 0x113f9804bef90dae,0x1b710b35131c471b
 	dq 0x28db77f523047d84,0x32caab7b40c72493 
 	dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
 	dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 
 	dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
--- a/crypto/sha2/intel/sha512_sse4.asm
+++ b/crypto/sha2/intel/sha512_sse4.asm
@ -0,0 +1,398 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Copyright 2012 Intel Corporation All Rights Reserved.
 ; 
 ; The source code contained or described herein and all documents
 ; related to the source code ("Material") are owned by Intel Corporation
 ; or its suppliers or licensors. Title to the Material remains with
 ; Intel Corporation or its suppliers and licensors. The Material may
 ; contain trade secrets and proprietary and confidential information of
 ; Intel Corporation and its suppliers and licensors, and is protected by
 ; worldwide copyright and trade secret laws and treaty provisions. No
 ; part of the Material may be used, copied, reproduced, modified,
 ; published, uploaded, posted, transmitted, distributed, or disclosed in
 ; any way without Intel's prior express written permission.
 ; 
 ; No license under any patent, copyright, trade secret or other
 ; intellectual property right is granted to or conferred upon you by
 ; disclosure or delivery of the Materials, either expressly, by
 ; implication, inducement, estoppel or otherwise. Any license under such
 ; intellectual property rights must be express and approved by Intel in
 ; writing.
 ; 
 ; Unless otherwise agreed by Intel in writing, you may not remove or
 ; alter this notice or any other notice embedded in Materials by Intel
 ; or Intel's suppliers or licensors in any way.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;
 ; Example YASM command lines:
 ; Windows:  yasm -f x64 -D WINABI sha512_sse4.asm
 ; Linux:    yasm -f elf64 sha512_sse4.asm
 ;
 ; Alternative Example YASM command lines:
 ; Windows:  yasm -Xvc -f x64 -D WINABI -rnasm -pnasm -o sha512_sse4.obj -g cv8 sha512_sse4.asm
 ; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha512_sse4.o sha512_sse4.asm
 ;
 BITS 64
 section .text
 ; Virtual Registers
 ; msg/digest/msglen name the three incoming argument registers of the selected
 ; calling convention (WINABI: rcx, rdx, r8; otherwise: rdi, rsi, rdx).
 ; T1/T2 are per-round temporaries. Under WINABI they live in rsi/rdi, which
 ; sha512_sse4 saves and restores in its stack frame; otherwise they live in
 ; rcx/r8, which sha512_sse4 does not save.
 %ifdef WINABI
 	%define msg	rcx ; ARG1
 	%define digest	rdx ; ARG2
 	%define msglen	r8  ; ARG3
 	%define T1	rsi
 	%define T2	rdi
 %else
 	%define msg	rdi ; ARG1
 	%define digest	rsi ; ARG2
 	%define msglen	rdx ; ARG3
 	%define T1	rcx
 	%define T2	r8
 %endif
 ; SHA-512 working variables a..h. These are only the INITIAL name-to-register
 ; bindings: RotateState re-%xdefines a_64..h_64 after every round, so the
 ; register behind each name changes as the unrolled code is assembled.
 ; rbx and r12-r15 are saved/restored by sha512_sse4; r9-r11 are not.
 %define a_64	r9
 %define b_64	r10
 %define c_64	r11
 %define d_64	r12
 %define e_64	r13
 %define f_64	r14
 %define g_64	r15
 %define h_64	rbx
 ; General scratch register used by the round computations.
 %define tmp0	rax
 ; Local variables (stack frame)
 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
 ; (.W and .WK are accessed with movdqa, which needs 16-byte aligned addresses;
 ; sha512_sse4 performs a single "sub rsp, frame_size" on entry).
 ; Size here: 80 + 2 + 5 = 87 qwords, or 80 + 2 + 7 = 89 qwords under WINABI.
 struc frame
 	.W:       resq 80 ; Message Schedule
 	.WK:      resq  2 ; W[t] + K[t] | W[t+1] + K[t+1]
 %ifdef WINABI
 	; rbx, r12-r15 plus rsi and rdi (used as T1/T2 under WINABI)
 	.GPRSAVE: resq 7
 %else
 	; rbx, r12-r15
 	.GPRSAVE: resq 5
 %endif
 endstruc
 ; Useful QWORD "arrays" for simpler memory references.
 ; Each macro expands to an address expression (without brackets) for the
 ; i-th 8-byte element; callers wrap it as [MSG(i)], [W_t(i)], ...
 %define MSG(i)    msg    + 8*(i)               ; Input message (arg1)
 %define DIGEST(i) digest + 8*(i)               ; Output Digest (arg2)
 %define K_t(i)    K512   + 8*(i) wrt rip       ; SHA Constants (static mem)
 %define W_t(i)    rsp + frame.W  + 8*(i)       ; Message Schedule (stack frame)
 %define WK_2(i)   rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
 ; MSG, DIGEST, K_t, W_t are arrays
 ; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
 ; RotateState: assemble-time renaming of the working variables; emits no
 ; instructions. After expansion, the register that was called h_64 is called
 ; a_64, the old a_64 is b_64, and so on. The round macros write the new 'a'
 ; value into h_64 just before invoking this.
 ; The %xdefine order matters: each name is rebound from the still-unmodified
 ; name that follows it, with the old h_64 parked in a macro-local temporary.
 %macro RotateState 0
 	; Rotate symbols a..h right
 	%xdefine %%TMP h_64
 	%xdefine h_64  g_64
 	%xdefine g_64  f_64
 	%xdefine f_64  e_64
 	%xdefine e_64  d_64
 	%xdefine d_64  c_64
 	%xdefine c_64  b_64
 	%xdefine b_64  a_64
 	%xdefine a_64  %%TMP
 %endmacro
 ; SHA512_Round t: emit one SHA-512 round (no message scheduling).
 ; Input:  working variables a_64..h_64 and the precomputed W[t]+K[t] qword
 ;         at [WK_2(t)].
 ; Output: d_64 += T1 (becomes 'e' after renaming), h_64 = T1 + Maj + S0
 ;         (becomes 'a' after renaming); RotateState then renames a..h.
 ; Clobbers T1, T2, tmp0.
 ; The rotate-xor chains compute the three-rotation sigma functions with
 ; nested rotates; the number after each "ror" (e.g. "; 41") is the total
 ; rotation that the innermost term has accumulated once the chain finishes.
 %macro SHA512_Round 1
 %assign %%t   (%1)
 	; Compute Round %%t
 	mov	T1,   f_64        ; T1 = f
 	mov	tmp0, e_64        ; tmp = e
 	xor	T1,   g_64        ; T1 = f ^ g
 	ror	tmp0, 23 ; 41     ; tmp = e ror 23
 	and	T1,   e_64        ; T1 = (f ^ g) & e
 	xor	tmp0, e_64        ; tmp = (e ror 23) ^ e
 	xor	T1,   g_64        ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
 	add	T1,   [WK_2(%%t)] ; W[t] + K[t] from message scheduler
 	ror	tmp0, 4 ; 18      ; tmp = ((e ror 23) ^ e) ror 4
 	xor	tmp0, e_64        ; tmp = (((e ror 23) ^ e) ror 4) ^ e
 	mov	T2,   a_64        ; T2 = a
 	add	T1,   h_64        ; T1 = CH(e,f,g) + W[t] + K[t] + h
 	ror	tmp0, 14 ; 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
 	add	T1,   tmp0        ; T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
 	mov	tmp0, a_64        ; tmp = a
 	xor	T2,   c_64        ; T2 = a ^ c
 	and	tmp0, c_64        ; tmp = a & c
 	and	T2,   b_64        ; T2 = (a ^ c) & b
 	xor	T2,   tmp0        ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
 	mov	tmp0, a_64        ; tmp = a
 	ror	tmp0, 5 ; 39      ; tmp = a ror 5
 	xor	tmp0, a_64        ; tmp = (a ror 5) ^ a
 	add	d_64, T1          ; e(next_state) = d + T1 
 	ror	tmp0, 6 ; 34      ; tmp = ((a ror 5) ^ a) ror 6
 	xor	tmp0, a_64        ; tmp = (((a ror 5) ^ a) ror 6) ^ a
 	lea	h_64, [T1 + T2]   ; a(next_state) = T1 + Maj(a,b,c)
 	ror	tmp0, 28 ; 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
 	add	h_64, tmp0        ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
 	RotateState
 %endmacro
 ; SHA512_2Sched_2Round_sse t: emit two rounds interleaved with the scheduling
 ; of the next two message-schedule qwords.
 ; Input:  working variables a_64..h_64; W[t-16..t-1] at [W_t(..)];
 ;         W+K for rounds t-2 and t-1 at frame.WK.
 ; Output: W[t], W[t+1] stored at [W_t(t)]; W[t]+K[t], W[t+1]+K[t+1] stored
 ;         at frame.WK; working variables advanced by two rounds.
 ; Clobbers T1, T2, tmp0 and xmm0-xmm5.
 ; t is expected to be even: [W_t(t)] and frame.WK are accessed with movdqa.
 ; All memory operands are addressed through the macro's own argument (%%t);
 ; the macro does not depend on any assemble-time variable of the caller.
 %macro SHA512_2Sched_2Round_sse 1
 %assign %%t (%1)
 	; Compute rounds %%t-2 and %%t-1
 	; Compute message schedule QWORDS %%t and %%t+1
 	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and 
 	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
 	; scheduler.
 	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
 	; They are then added to their respective SHA512 constants at
 	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
 	;   For brevity, the comments following vectored instructions only refer to
 	; the first of a pair of QWORDS.
 	; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
 	;   The computation of the message schedule and the rounds are tightly
 	; stitched to take advantage of instruction-level parallelism.
 	; For clarity, integer instructions (for the rounds calculation) are indented
 	; by one tab. Vectored instructions (for the message scheduler) are indented
 	; by two tabs.
 	;   W[t-15] and W[t-7] start at odd qword indices, hence the unaligned
 	; movdqu loads for those two operands.
 	mov	T1, f_64
 		movdqa	xmm2, [W_t(%%t-2)]  ; XMM2 = W[t-2]
 	xor	T1,   g_64
 	and	T1,   e_64
 		movdqa	xmm0, xmm2          ; XMM0 = W[t-2]
 	xor	T1,   g_64
 	add	T1,   [WK_2(%%t)]
 		movdqu	xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
 	mov	tmp0, e_64
 	ror	tmp0, 23 ; 41
 		movdqa	xmm3, xmm5          ; XMM3 = W[t-15]
 	xor	tmp0, e_64
 	ror	tmp0, 4 ; 18
 		psrlq	xmm0, 61 - 19       ; XMM0 = W[t-2] >> 42
 	xor	tmp0, e_64
 	ror	tmp0, 14 ; 14
 		psrlq	xmm3, (8 - 7)       ; XMM3 = W[t-15] >> 1
 	add	T1,   tmp0
 	add	T1,   h_64
 		pxor	xmm0, xmm2          ; XMM0 = (W[t-2] >> 42) ^ W[t-2]
 	mov	T2,   a_64
 	xor	T2,   c_64
 		pxor	xmm3, xmm5          ; XMM3 = (W[t-15] >> 1) ^ W[t-15]
 	and	T2,   b_64
 	mov	tmp0, a_64
 		psrlq	xmm0, 19 - 6        ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
 	and	tmp0, c_64
 	xor	T2,   tmp0
 		psrlq	xmm3, (7 - 1)       ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
 	mov	tmp0, a_64
 	ror	tmp0, 5 ; 39
 		pxor	xmm0, xmm2          ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
 	xor	tmp0, a_64
 	ror	tmp0, 6 ; 34
 		pxor	xmm3, xmm5          ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
 	xor	tmp0, a_64
 	ror	tmp0, 28 ; 28
 		psrlq	xmm0, 6             ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
 	add	T2,   tmp0
 	add	d_64, T1 
 		psrlq	xmm3, 1             ; XMM3 = ((((W[t-15]>>1)^W[t-15])>>6)^W[t-15])>>1
 	lea	h_64, [T1 + T2]
 	RotateState
 		movdqa	xmm1, xmm2          ; XMM1 = W[t-2]
 	mov	T1, f_64
 	xor	T1,   g_64
 		movdqa	xmm4, xmm5          ; XMM4 = W[t-15]
 	and	T1,   e_64
 	xor	T1,   g_64
 		psllq	xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42
 	add	T1,   [WK_2(%%t+1)]
 	mov	tmp0, e_64
 		psllq	xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7
 	ror	tmp0, 23 ; 41
 	xor	tmp0, e_64
 		pxor	xmm1, xmm2          ; XMM1 = (W[t-2] << 42)^W[t-2]
 	ror	tmp0, 4 ; 18
 	xor	tmp0, e_64
 		pxor	xmm4, xmm5          ; XMM4 = (W[t-15]<<7)^W[t-15]
 	ror	tmp0, 14 ; 14
 	add	T1,   tmp0
 		psllq	xmm1, (64 - 61)     ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
 	add	T1,   h_64
 	mov	T2,   a_64
 		psllq	xmm4, (64 - 8)      ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
 	xor	T2,   c_64
 	and	T2,   b_64
 		pxor	xmm0, xmm1          ; XMM0 = s1(W[t-2])
 	mov	tmp0, a_64
 	and	tmp0, c_64
 		movdqu	xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
 	xor	T2,   tmp0
 		pxor	xmm3, xmm4          ; XMM3 = s0(W[t-15])
 	mov	tmp0, a_64
 		paddq	xmm0, xmm3          ; XMM0 = s1(W[t-2]) + s0(W[t-15])
 	ror	tmp0, 5 ; 39
 		paddq	xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
 	xor	tmp0, a_64
 		paddq	xmm0, xmm1          ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
 	ror	tmp0, 6 ; 34
 		movdqa	[W_t(%%t)], xmm0    ; Store scheduled qwords
 	xor	tmp0, a_64
 		paddq	xmm0, [K_t(%%t)]    ; Compute W[t]+K[t]
 	ror	tmp0, 28 ; 28
 		; Both WK slots were already consumed by the two rounds above.
 		movdqa	[WK_2(%%t)], xmm0   ; Store W[t]+K[t] for next rounds
 	add	T2,   tmp0
 	add	d_64, T1
 	lea	h_64, [T1 + T2]
 	RotateState
 %endmacro
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; void sha512_sse4(const void* M, void* D, uint64_t L);
 ; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 ; The size of the message pointed to by M must be an integer multiple of SHA512
 ;   message blocks.
 ; L is the message length in SHA512 blocks.
 ; Notes:
 ;  - L == 0 returns immediately without touching the stack or D.
 ;  - D is read and updated in place as eight native-order qwords; message
 ;    qwords are byte-swapped with pshufb while being loaded.
 ;  - M is loaded with movdqu, so it needs no particular alignment.
 ;  - Uses xmm0-xmm5 only. rbx and r12-r15 (plus rsi/rdi under WINABI) are
 ;    saved in the frame and restored before returning.
 global sha512_sse4:function
 sha512_sse4:
 	cmp msglen, 0
 	je .nowork
 	; Allocate Stack Space
 	sub	rsp, frame_size
 	; Save GPRs
 	mov	[rsp + frame.GPRSAVE + 8 * 0], rbx
 	mov	[rsp + frame.GPRSAVE + 8 * 1], r12
 	mov	[rsp + frame.GPRSAVE + 8 * 2], r13
 	mov	[rsp + frame.GPRSAVE + 8 * 3], r14
 	mov	[rsp + frame.GPRSAVE + 8 * 4], r15
 %ifdef WINABI
 	mov	[rsp + frame.GPRSAVE + 8 * 5], rsi
 	mov	[rsp + frame.GPRSAVE + 8 * 6], rdi
 %endif
 ; One pass through this loop body processes one 128-byte message block.
 .updateblock:
 	; Load state variables
 	mov	a_64, [DIGEST(0)]
 	mov	b_64, [DIGEST(1)]
 	mov	c_64, [DIGEST(2)]
 	mov	d_64, [DIGEST(3)]
 	mov	e_64, [DIGEST(4)]
 	mov	f_64, [DIGEST(5)]
 	mov	g_64, [DIGEST(6)]
 	mov	h_64, [DIGEST(7)]
 	; The %rep below fully unrolls all 80 rounds at assembly time; 't' is an
 	; assemble-time counter, not a register. Each round macro ends with
 	; RotateState, so after 80 rounds (a multiple of 8) the names a_64..h_64
 	; are bound to their original registers again, which the digest update
 	; and the jump back to .updateblock rely on.
 	%assign t 0
 	%rep 80/2 + 1
 	; (80 rounds) / (2 rounds/iteration) + (1 iteration)
 	; +1 iteration because the scheduler leads hashing by 1 iteration
 		%if t < 2
 			; BSWAP 2 QWORDS
 			; xmm1 keeps the byte-swap mask for the t < 16 iterations below.
 			movdqa	xmm1, [XMM_QWORD_BSWAP wrt rip]
 			movdqu	xmm0, [MSG(t)]
 			pshufb	xmm0, xmm1      ; BSWAP
 			movdqa	[W_t(t)], xmm0  ; Store Scheduled Pair
 			paddq	xmm0, [K_t(t)]  ; Compute W[t]+K[t]
 			movdqa	[WK_2(t)], xmm0 ; Store into WK for rounds
 		%elif t < 16
 			; BSWAP 2 QWORDS; Compute 2 Rounds
 			movdqu	xmm0, [MSG(t)]
 			pshufb	xmm0, xmm1      ; BSWAP
 			SHA512_Round t - 2      ; Round t-2
 			movdqa	[W_t(t)], xmm0  ; Store Scheduled Pair
 			paddq	xmm0, [K_t(t)]  ; Compute W[t]+K[t]
 			SHA512_Round t - 1      ; Round t-1
 			; Stored only after both rounds have read their WK slot.
 			movdqa	[WK_2(t)], xmm0 ; Store W[t]+K[t] into WK
 		%elif t < 79
 			; Schedule 2 QWORDS; Compute 2 Rounds
 			; (t is even here, so this covers t = 16..78; xmm1 is clobbered)
 			SHA512_2Sched_2Round_sse t 
 		%else
 			; Compute 2 Rounds
 			; (t = 80: final rounds 78 and 79, nothing left to schedule)
 			SHA512_Round t - 2
 			SHA512_Round t - 1
 		%endif
 	%assign t t+2
 	%endrep
 	; Update digest
 	add	[DIGEST(0)], a_64
 	add	[DIGEST(1)], b_64
 	add	[DIGEST(2)], c_64
 	add	[DIGEST(3)], d_64
 	add	[DIGEST(4)], e_64
 	add	[DIGEST(5)], f_64
 	add	[DIGEST(6)], g_64
 	add	[DIGEST(7)], h_64
 	; Advance to next message block
 	add	msg, 16*8
 	dec	msglen
 	jnz	.updateblock
 	; Restore GPRs
 	mov	rbx, [rsp + frame.GPRSAVE + 8 * 0]
 	mov	r12, [rsp + frame.GPRSAVE + 8 * 1]
 	mov	r13, [rsp + frame.GPRSAVE + 8 * 2]
 	mov	r14, [rsp + frame.GPRSAVE + 8 * 3]
 	mov	r15, [rsp + frame.GPRSAVE + 8 * 4]
 %ifdef WINABI
 	mov	rsi, [rsp + frame.GPRSAVE + 8 * 5]
 	mov	rdi, [rsp + frame.GPRSAVE + 8 * 6]
 %endif
 	; Restore Stack Pointer
 	add	rsp, frame_size
 .nowork:
 	ret
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; Binary Data
 section .data
 ; Both tables below are read with 16-byte aligned accesses (movdqa / paddq
 ; with a memory operand), so they must stay 16-byte aligned: the mask is
 ; exactly 16 bytes, which keeps K512 aligned directly behind it.
 ALIGN 16
 ; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
 ; Stored little-endian, the selector bytes are 7,6,...,0 then 15,14,...,8,
 ; i.e. the byte order is reversed within each 8-byte half.
 XMM_QWORD_BSWAP: 
 	ddq 0x08090a0b0c0d0e0f0001020304050607
 ; K[t] used in SHA512 hashing
 ; 80 qwords, two per line, in round order; addressed through K_t(i).
 K512:
 	dq 0x428a2f98d728ae22,0x7137449123ef65cd 
 	dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
 	dq 0x3956c25bf348b538,0x59f111f1b605d019 
 	dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
 	dq 0xd807aa98a3030242,0x12835b0145706fbe 
 	dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
 	dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 
 	dq 0x9bdc06a725c71235,0xc19bf174cf692694
 	dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 
 	dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
 	dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 
 	dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
 	dq 0x983e5152ee66dfab,0xa831c66d2db43210 
 	dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
 	dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 
 	dq 0x06ca6351e003826f,0x142929670a0e6e70
 	dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 
 	dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
 	dq 0x650a73548baf63de,0x766a0abb3c77b2a8 
 	dq 0x81c2c92e47edaee6,0x92722c851482353b
 	dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 
 	dq 0xc24b8b70d0f89791,0xc76c51a30654be30
 	dq 0xd192e819d6ef5218,0xd69906245565a910 
 	dq 0xf40e35855771202a,0x106aa07032bbd1b8
 	dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 
 	dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
 	dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 
 	dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
 	dq 0x748f82ee5defb2fc,0x78a5636f43172f60 
 	dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
 	dq 0x90befffa23631e28,0xa4506cebde82bde9 
 	dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
 	dq 0xca273eceea26619c,0xd186b8c721c0c207 
 	dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
 	dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 
 	dq 0x113f9804bef90dae,0x1b710b35131c471b
 	dq 0x28db77f523047d84,0x32caab7b40c72493 
 	dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
 	dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 
 	dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
--- a/crypto/sha2/sha256.c
+++ b/crypto/sha2/sha256.c
@ -1,271 +0,0 @@
 /*-
 * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
 * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
 /*
 * Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
 */
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif /* HAVE_CONFIG_H */
 #if HAVE_INTTYPES_H
 # include <inttypes.h>
 #else
 # if HAVE_STDINT_H
 #  include <stdint.h>
 # endif
 #endif
 #include <pthread.h>
 #include <string.h>
 #include <utils.h>
 #include <sha256.h>
 #ifdef WORDS_BIGENDIAN
 #define BYTESWAP(x) (x)
 #define BYTESWAP64(x) (x)
 #else /* WORDS_BIGENDIAN */
 #define BYTESWAP(x) htonl(x)
 #define BYTESWAP64(x) htonll(x)
 #endif /* WORDS_BIGENDIAN */
 /*
  * Signature of a SHA-256 block-compression routine: consumes num_blks
  * 64-byte blocks starting at input_data and updates digest[8] in place.
  */
 typedef void (*update_func_ptr)(void *input_data, uint32_t digest[8], uint64_t num_blks);
 /*
  * Message padding: a single 0x80 marker byte followed by zeros. _final feeds
  * a 1..64 byte prefix of this table to SHA256_Update. The table is never
  * written, so it is const (SHA256_Update takes a const void *).
  */
 static const uint8_t padding[64] = {
  0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 };
 /* Initial hash state loaded into a context by SHA256_Init (via _init). */
 static const uint32_t iv256[SHA256_HASH_WORDS] = {
  0x6a09e667L,
  0xbb67ae85L,
  0x3c6ef372L,
  0xa54ff53aL,
  0x510e527fL,
  0x9b05688cL,
  0x1f83d9abL,
  0x5be0cd19L
 };
 /*
  * Block-compression routine selected by Init_SHA. It stays NULL until
  * Init_SHA has returned 0; SHA256_Update calls through it unconditionally.
  */
 static update_func_ptr sha_update_func;
 /*
  * Select the SHA-256 block routine for the running CPU.
  *
  * AVX is preferred when pc->avx_level is non-zero, otherwise the SSE4
  * routine is used when pc->sse_level is at least 4.
  *
  * Returns 0 when a routine was installed, 1 when the processor is not an
  * x64 Intel/AMD part or offers neither AVX nor SSE4 (nothing is installed).
  */
 int
 APS_NAMESPACE(Init_SHA) (processor_info_t *pc)
 {
 	if (pc->proc_type != PROC_X64_INTEL && pc->proc_type != PROC_X64_AMD)
 		return (1);

 	if (pc->avx_level > 0) {
 		sha_update_func = sha256_avx;
 		return (0);
 	}
 	if (pc->sse_level >= 4) {
 		sha_update_func = sha256_sse4;
 		return (0);
 	}
 	return (1);
 }
 /*
  * Reset a context: load the eight initial hash words from iv and clear the
  * bit counter and the count of buffered bytes.
  */
 static void
 _init (SHA256_Context *sc, const uint32_t iv[SHA256_HASH_WORDS])
 {
 	unsigned int w;

 	for (w = 0; w < SHA256_HASH_WORDS; w++)
 		sc->hash[w] = iv[w];
 	sc->bufferLength = 0L;
 	sc->totalLength = 0LL;
 }
 /* Prepare sc for a new SHA-256 computation (initial state iv256). */
 void
 APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc)
 {
 	_init (sc, iv256);
 }
 /*
  * Absorb len bytes from vdata into the running hash.
  *
  * Any partially filled 64-byte block left in sc->buffer is completed first.
  * Whole blocks are then compressed straight from the caller's memory and a
  * trailing partial block is stashed in sc->buffer for the next call.
  * sc->totalLength counts message bits.
  */
 void
 APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, const void *vdata, size_t len)
 {
 	const uint8_t *src = (const uint8_t *)vdata;
 	size_t chunk;
 	int tail;

 	if (sc->bufferLength != 0) {
 		for (;;) {
 			/* Copy as much as fits into the pending block. */
 			chunk = 64L - sc->bufferLength;
 			if (len < chunk)
 				chunk = len;
 			memcpy (sc->buffer.bytes + sc->bufferLength, src, chunk);
 			sc->totalLength += chunk * 8L;
 			sc->bufferLength += chunk;
 			src += chunk;
 			len -= chunk;

 			/* Still incomplete: wait for more input. */
 			if (sc->bufferLength != 64L)
 				return;

 			sc->blocks = 1;
 			sha_update_func(sc->buffer.words, sc->hash, sc->blocks);
 			sc->bufferLength = 0L;

 			/* Input of at most one block goes through the buffer again. */
 			if (len == 0 || len > 64L)
 				break;
 		}
 		if (len == 0)
 			return;
 	}

 	/* Split the remaining input into whole blocks and a tail. */
 	sc->blocks = len >> 6;
 	tail = len - (sc->blocks << 6);
 	len = sc->blocks << 6;
 	sc->totalLength += tail * 8L;

 	if (len != 0) {
 		sc->totalLength += len * 8L;
 		sha_update_func((uint32_t *)src, sc->hash, sc->blocks);
 	}
 	if (tail != 0) {
 		memcpy (&sc->buffer.bytes[0], src + len, tail);
 		sc->bufferLength = tail;
 	}
 }
 /*
  * Pad the message, absorb the 64-bit big-endian bit length, and, when hash
  * is non-NULL, write the first hashWords state words to it in big-endian
  * byte order.
  */
 static void
 _final (SHA256_Context *sc, uint8_t *hash, int hashWords)
 {
 	uint64_t lengthPad;
 	uint32_t padLen;
 	int w;

 	/* Pad so that exactly 8 bytes remain free in the last block. */
 	padLen = 120L - sc->bufferLength;
 	if (padLen > 64L)
 		padLen -= 64L;

 	/* Capture the length before the padding calls advance the counter. */
 	lengthPad = BYTESWAP64(sc->totalLength);

 	APS_NAMESPACE(SHA256_Update) (sc, padding, padLen);
 	APS_NAMESPACE(SHA256_Update) (sc, &lengthPad, 8L);

 	if (!hash)
 		return;

 	for (w = 0; w < hashWords; w++, hash += 4) {
 		uint32_t word = sc->hash[w];

 		hash[0] = (uint8_t) (word >> 24);
 		hash[1] = (uint8_t) (word >> 16);
 		hash[2] = (uint8_t) (word >> 8);
 		hash[3] = (uint8_t) word;
 	}
 }
 /*
  * Finish the computation and write the SHA256_HASH_SIZE-byte digest to hash.
  * hash may be NULL, in which case the state is finalized but nothing is
  * written.
  */
 void
 APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE])
 {
 	_final (sc, hash, SHA256_HASH_WORDS);
 }
 /*
  * Initialize an HMAC-SHA256 operation with the given key.
  *
  * ctx:  receives the inner and outer hash contexts.
  * _K:   key bytes.
  * Klen: key length in bytes; keys longer than the 64-byte block are first
  *       replaced by their SHA256 hash.
  *
  * All key-derived stack buffers (the hashed key and the key-xor-pad block)
  * are wiped before returning. The wipe goes through a volatile pointer so
  * that the stores are not removed as dead writes to locals.
  */
 void
 APS_NAMESPACE(HMAC_SHA256_Init) (HMAC_SHA256_Context * ctx, const void * _K, size_t Klen)
 {
 	unsigned char pad[64];
 	unsigned char khash[32];
 	const unsigned char * K = (const unsigned char *)_K;
 	volatile unsigned char * wipe;
 	size_t i;

 	/* If Klen > 64, the key is really SHA256(K). */
 	if (Klen > 64) {
 		APS_NAMESPACE(SHA256_Init)(&ctx->ictx);
 		APS_NAMESPACE(SHA256_Update)(&ctx->ictx, K, Klen);
 		APS_NAMESPACE(SHA256_Final)(&ctx->ictx, khash);
 		K = khash;
 		Klen = 32;
 	}

 	/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
 	APS_NAMESPACE(SHA256_Init)(&ctx->ictx);
 	memset(pad, 0x36, 64);
 	for (i = 0; i < Klen; i++)
 		pad[i] ^= K[i];
 	APS_NAMESPACE(SHA256_Update)(&ctx->ictx, pad, 64);

 	/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
 	APS_NAMESPACE(SHA256_Init)(&ctx->octx);
 	memset(pad, 0x5c, 64);
 	for (i = 0; i < Klen; i++)
 		pad[i] ^= K[i];
 	APS_NAMESPACE(SHA256_Update)(&ctx->octx, pad, 64);

 	/* Clean the stack: both buffers hold key-derived material. */
 	wipe = khash;
 	for (i = 0; i < sizeof khash; i++)
 		wipe[i] = 0;
 	wipe = pad;
 	for (i = 0; i < sizeof pad; i++)
 		wipe[i] = 0;
 }
 /*
  * Add len bytes from in to the HMAC-SHA256 operation. Only the inner hash
  * context is touched; the outer one is used in HMAC_SHA256_Final.
  */
 void
 APS_NAMESPACE(HMAC_SHA256_Update) (HMAC_SHA256_Context * ctx, const void *in, size_t len)
 {
 	/* Feed data to the inner SHA256 operation. */
 	APS_NAMESPACE(SHA256_Update)(&ctx->ictx, in, len);
 }
 /*
  * Finish an HMAC-SHA256 operation and write the 32-byte MAC to digest.
  * Both hash contexts in ctx are finalized by this call.
  */
 void
 APS_NAMESPACE(HMAC_SHA256_Final) (HMAC_SHA256_Context * ctx, unsigned char digest[32])
 {
 	unsigned char inner[32];

 	/* Close the inner hash, which has absorbed the padded key and the data. */
 	APS_NAMESPACE(SHA256_Final)(&ctx->ictx, inner);

 	/* The outer hash absorbs the inner result and yields the MAC. */
 	APS_NAMESPACE(SHA256_Update)(&ctx->octx, inner, sizeof inner);
 	APS_NAMESPACE(SHA256_Final)(&ctx->octx, digest);

 	/* Do not leave the intermediate hash on the stack. */
 	memset(inner, 0, sizeof inner);
 }
--- a/crypto/sha2/sha256.h
+++ b/crypto/sha2/sha256.h
@ -1,90 +0,0 @@
 /*-
 * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
 * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
 #ifndef _APS_SHA256_H
 #define _APS_SHA256_H
 #if HAVE_INTTYPES_H
 # include <inttypes.h>
 #else
 # if HAVE_STDINT_H
 #  include <stdint.h>
 # endif
 #endif
 #include <utils.h>
 #define SHA256_HASH_SIZE 32
 /* Hash size in 32-bit words */
 #define SHA256_HASH_WORDS 8
 /* Running state of one SHA-256 computation. */
 typedef struct _SHA256_Context {
 	/*
 	 * totalLength: message length so far, in bits.
 	 * blocks: scratch count of 64-byte blocks handed to the block routine
 	 * by the most recent update.
 	 */
 	uint64_t totalLength, blocks;
 	/* Current chaining value. */
 	uint32_t hash[SHA256_HASH_WORDS];
 	/* Number of bytes currently held in buffer (0..63 between calls). */
 	uint32_t bufferLength;
 	/* Pending partial block, viewable as bytes or as 32-bit words. */
 	union {
 		uint32_t words[16];
 		uint8_t bytes[64];
 	} buffer;
 } SHA256_Context;
 /* State of one HMAC-SHA256 computation: inner and outer hash contexts. */
 typedef struct HMAC_SHA256Context {
 	/* Inner hash: absorbs the 0x36-padded key block, then the message. */
 	SHA256_Context ictx;
 	/* Outer hash: absorbs the 0x5c-padded key block, then the inner digest. */
 	SHA256_Context octx;
 } HMAC_SHA256_Context;
 #ifdef __cplusplus
 extern "C" {
 #endif
 #ifndef APS_NAMESPACE
 #define APS_NAMESPACE(name) opt_##name
 #endif /* !APS_NAMESPACE */
 void APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc);
 void APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, const void *data, size_t len);
 void APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE]);
 int  APS_NAMESPACE(Init_SHA) (processor_info_t *pc);
 void APS_NAMESPACE(HMAC_SHA256_Init) (HMAC_SHA256_Context * ctx, const void * _K, size_t Klen);
 void APS_NAMESPACE(HMAC_SHA256_Update) (HMAC_SHA256_Context * ctx, const void *in, size_t len);
 void APS_NAMESPACE(HMAC_SHA256_Final) (HMAC_SHA256_Context * ctx, unsigned char digest[32]);
 /*
 * Intel's optimized SHA256 core routines. These routines are described in an
 * Intel White-Paper:
 * "Fast SHA-256 Implementations on Intel Architecture Processors"
 */
 extern void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);
 extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);
 #ifdef __cplusplus
 }
 #endif
 #endif /* !_APS_SHA256_H */
--- a/crypto/sha2/sha512.c
+++ b/crypto/sha2/sha512.c
@ -0,0 +1,294 @@
 /*-
 * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
 * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
 /*
 * Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
 */
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif /* HAVE_CONFIG_H */
 #if HAVE_INTTYPES_H
 # include <inttypes.h>
 #else
 # if HAVE_STDINT_H
 #  include <stdint.h>
 # endif
 #endif
 #include <pthread.h>
 #include <string.h>
 #include <utils.h>
 #include "sha512.h"
 #ifdef WORDS_BIGENDIAN
 #define BYTESWAP(x) (x)
 #define BYTESWAP64(x) (x)
 #else /* WORDS_BIGENDIAN */
 #define BYTESWAP(x) htonl(x)
 #define BYTESWAP64(x) htonll(x)
 #endif /* WORDS_BIGENDIAN */
 /*
  * Signature of a SHA-512 block-compression routine: consumes num_blks
  * 128-byte blocks starting at input_data and updates the eight 64-bit state
  * words at digest in place.
  */
 typedef void (*update_func_ptr)(const void *input_data, void *digest, uint64_t num_blks);
 /*
  * Message padding: a single 0x80 marker byte followed by zeros. _final feeds
  * a 1..128 byte prefix of this table to SHA512_Update.
  */
 static const uint8_t padding[128] = {
  0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 };
 /* Initial hash state for SHA-512; loaded by SHA512_Init. */
 static const uint64_t iv512[SHA512_HASH_WORDS] = {
  0x6a09e667f3bcc908LL,
  0xbb67ae8584caa73bLL,
  0x3c6ef372fe94f82bLL,
  0xa54ff53a5f1d36f1LL,
  0x510e527fade682d1LL,
  0x9b05688c2b3e6c1fLL,
  0x1f83d9abfb41bd6bLL,
  0x5be0cd19137e2179LL
 };
 /*
  * Initial hash state for the truncated SHA-512/256 variant; loaded by
  * SHA512t256_Init. Despite the name these are 64-bit SHA-512 state words,
  * not SHA-256 initial values.
  */
 static const uint64_t iv256[SHA512_HASH_WORDS] = {
  0x22312194fc2bf72cLL,
  0x9f555fa3c84c64c2LL,
  0x2393b86b6f53b151LL,
  0x963877195940eabdLL,
  0x96283ee2a88effe3LL,
  0xbe5e1e2553863992LL,
  0x2b0199fc2c85b8aaLL,
  0x0eb72ddc81c52ca2LL
 };
 /*
  * Block-compression routine selected by Init_SHA512. It stays NULL until
  * Init_SHA512 has returned 0; SHA512_Update calls through it unconditionally,
  * so callers must not hash before a successful Init_SHA512.
  */
 static update_func_ptr sha512_update_func;
 /*
  * Select the SHA-512 block routine for the running CPU.
  *
  * AVX is preferred when pc->avx_level is non-zero, otherwise the SSE4
  * routine is used when pc->sse_level is at least 4.
  *
  * Returns 0 when a routine was installed, 1 when the processor is not an
  * x64 Intel/AMD part or offers neither AVX nor SSE4 (nothing is installed).
  */
 int
 APS_NAMESPACE(Init_SHA512) (processor_info_t *pc)
 {
 	if (pc->proc_type != PROC_X64_INTEL && pc->proc_type != PROC_X64_AMD)
 		return (1);

 	if (pc->avx_level > 0) {
 		sha512_update_func = sha512_avx;
 		return (0);
 	}
 	if (pc->sse_level >= 4) {
 		sha512_update_func = sha512_sse4;
 		return (0);
 	}
 	return (1);
 }
 /*
  * Reset a context: load the eight initial hash words from iv and clear the
  * 128-bit bit counter and the count of buffered bytes.
  */
 static void
 _init (SHA512_Context *sc, const uint64_t iv[SHA512_HASH_WORDS])
 {
  sc->bufferLength = 0L;
  memcpy (sc->hash, iv, sizeof sc->hash);
  sc->totalLength[1] = 0LL;
  sc->totalLength[0] = 0LL;
 }
 /* Prepare sc for a new SHA-512 computation (initial state iv512). */
 void
 APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc)
 {
  _init (sc, iv512);
 }
 /*
  * Prepare sc for a new SHA-512/256 computation. Same context type as
  * SHA-512; only the initial state (iv256) differs.
  */
 void
 APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc)
 {
  _init (sc, iv256);
 }
 /*
  * Add nbytes message bytes to the 128-bit bit counter kept in
  * sc->totalLength (index 0 = high word, index 1 = low word).
  */
 static void
 sha512_count_bytes (SHA512_Context *sc, size_t nbytes)
 {
 	uint64_t lowBefore = sc->totalLength[1];

 	sc->totalLength[1] += (uint64_t)nbytes << 3;
 	if (sc->totalLength[1] < lowBefore)
 		sc->totalLength[0]++;
 	/* Bits shifted out of the low word by the multiplication by 8. */
 	sc->totalLength[0] += (uint64_t)nbytes >> 61;
 }

 /*
  * Absorb len bytes from vdata into the running hash.
  *
  * Any partially filled 128-byte block left in sc->buffer is completed first.
  * Whole blocks are then compressed straight from the caller's memory and a
  * trailing partial block is stashed in sc->buffer for the next call.
  *
  * Requires a prior successful Init_SHA512: the block routine is called
  * through sha512_update_func without a NULL check.
  */
 void
 APS_NAMESPACE(SHA512_Update) (SHA512_Context *sc, const void *vdata, size_t len)
 {
 	const uint8_t *data = (const uint8_t *)vdata;
 	size_t bytesToCopy;
 	size_t rem;

 	if (sc->bufferLength) {
 		do {
 			/* Copy as much as fits into the pending block. */
 			bytesToCopy = 128L - sc->bufferLength;
 			if (bytesToCopy > len)
 				bytesToCopy = len;
 			memcpy (&sc->buffer.bytes[sc->bufferLength], data, bytesToCopy);
 			sha512_count_bytes (sc, bytesToCopy);
 			sc->bufferLength += (uint32_t)bytesToCopy;
 			data += bytesToCopy;
 			len -= bytesToCopy;
 			if (sc->bufferLength == 128L) {
 				sc->blocks = 1;
 				sha512_update_func(sc->buffer.words, sc->hash, sc->blocks);
 				sc->bufferLength = 0L;
 			} else {
 				/* Still incomplete: wait for more input. */
 				return;
 			}
 			/* Input of at most one block goes through the buffer again. */
 		} while (len > 0 && len <= 128L);
 		if (!len) return;
 	}

 	/* Account for everything that is left, then split it. */
 	sha512_count_bytes (sc, len);
 	sc->blocks = len >> 7;
 	rem = len & 127;
 	len -= rem;

 	if (len) {
 		/* Whole blocks are hashed in place, without copying. */
 		sha512_update_func(data, sc->hash, sc->blocks);
 	}
 	if (rem) {
 		memcpy (&sc->buffer.bytes[0], data + len, rem);
 		sc->bufferLength = (uint32_t)rem;
 	}
 }
 /* SHA-512/256 absorbs data exactly like SHA-512; forwards to SHA512_Update. */
 void
 APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len)
 {
  APS_NAMESPACE(SHA512_Update) (sc, data, len);
 }
 /*
  * Pad the message, absorb the 128-bit big-endian bit length, and, when hash
  * is non-NULL, write the digest.
  *
  * hashWords: number of whole 64-bit state words to emit (big-endian).
  * halfWord:  when non-zero, additionally emit the upper 4 bytes of the next
  *            state word; ignored when no further state word exists.
  *
  * hash is a plain byte buffer with no alignment guarantee, so each word is
  * copied out with memcpy instead of being stored through a uint64_t pointer.
  */
 static void
 _final (SHA512_Context *sc, uint8_t *hash, int hashWords, int halfWord)
 {
  uint32_t bytesToPad;
  uint64_t lengthPad[2];
  uint64_t word;
  int i;

  /* Pad so that exactly 16 bytes remain free in the last block. */
  bytesToPad = 240L - sc->bufferLength;
  if (bytesToPad > 128L)
    bytesToPad -= 128L;

  /* Capture the length before the padding calls advance the counter. */
  lengthPad[0] = BYTESWAP64(sc->totalLength[0]);
  lengthPad[1] = BYTESWAP64(sc->totalLength[1]);

  APS_NAMESPACE(SHA512_Update) (sc, padding, bytesToPad);
  APS_NAMESPACE(SHA512_Update) (sc, lengthPad, 16L);

  if (hash) {
    for (i = 0; i < hashWords; i++) {
      word = BYTESWAP64(sc->hash[i]);
      memcpy (hash, &word, sizeof word);
      hash += 8;
    }
    if (halfWord && i < SHA512_HASH_WORDS) {
      hash[0] = (uint8_t) (sc->hash[i] >> 56);
      hash[1] = (uint8_t) (sc->hash[i] >> 48);
      hash[2] = (uint8_t) (sc->hash[i] >> 40);
      hash[3] = (uint8_t) (sc->hash[i] >> 32);
    }
  }
 }
 /*
  * Finish the computation and write the SHA512_HASH_SIZE-byte digest to hash.
  * hash may be NULL, in which case the state is finalized but nothing is
  * written.
  */
 void
 APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE])
 {
  _final (sc, hash, SHA512_HASH_WORDS, 0);
 }
 /*
  * Finish a SHA-512/256 computation: only the first SHA512t256_HASH_WORDS
  * state words (SHA512t256_HASH_SIZE bytes) are written to hash.
  * hash may be NULL, in which case nothing is written.
  */
 void
 APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE])
 {
  _final (sc, hash, SHA512t256_HASH_WORDS, 0);
 }
 /*
  * HMAC is produced by including the template file _hmac.c once per hash
  * variant, with the HASH_ and HMAC_ macros naming the primitives to use and
  * the functions to define.
  * NOTE(review): _hmac.c is not visible from here; presumably it defines
  * HMAC_INIT/HMAC_UPDATE/HMAC_FINAL in terms of the HASH_ macros - confirm.
  */
 /* Instantiation 1: HMAC-SHA512. */
 #define HASH_CONTEXT SHA512_Context
 #define HASH_INIT APS_NAMESPACE(SHA512_Init)
 #define HASH_UPDATE APS_NAMESPACE(SHA512_Update)
 #define HASH_FINAL APS_NAMESPACE(SHA512_Final)
 #define HASH_SIZE SHA512_HASH_SIZE
 #define HASH_BLOCK_SIZE 128
 #define HMAC_CONTEXT HMAC_SHA512_Context
 #define HMAC_INIT APS_NAMESPACE(HMAC_SHA512_Init)
 #define HMAC_UPDATE APS_NAMESPACE(HMAC_SHA512_Update)
 #define HMAC_FINAL APS_NAMESPACE(HMAC_SHA512_Final)
 #include "_hmac.c"
 /* Drop the first parameter set so it can be redefined below. */
 #undef HASH_CONTEXT
 #undef HASH_INIT
 #undef HASH_UPDATE
 #undef HASH_FINAL
 #undef HASH_SIZE
 #undef HASH_BLOCK_SIZE
 #undef HMAC_CONTEXT
 #undef HMAC_INIT
 #undef HMAC_UPDATE
 #undef HMAC_FINAL
 /*
  * Instantiation 2: HMAC-SHA512/256. Same block size and the same
  * HMAC_SHA512_Context type as above; only the hash entry points and the
  * digest size differ. These macros are left defined after the include.
  */
 #define HASH_CONTEXT SHA512_Context
 #define HASH_INIT APS_NAMESPACE(SHA512t256_Init)
 #define HASH_UPDATE APS_NAMESPACE(SHA512t256_Update)
 #define HASH_FINAL APS_NAMESPACE(SHA512t256_Final)
 #define HASH_SIZE SHA512t256_HASH_SIZE
 #define HASH_BLOCK_SIZE 128
 #define HMAC_CONTEXT HMAC_SHA512_Context
 #define HMAC_INIT APS_NAMESPACE(HMAC_SHA512t256_Init)
 #define HMAC_UPDATE APS_NAMESPACE(HMAC_SHA512t256_Update)
 #define HMAC_FINAL APS_NAMESPACE(HMAC_SHA512t256_Final)
 #include "_hmac.c"
--- a/crypto/sha2/sha512.h
+++ b/crypto/sha2/sha512.h
@ -0,0 +1,103 @@
 /*-
 * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
 * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
 #ifndef _APS_SHA512_H
 #define _APS_SHA512_H
 #if HAVE_INTTYPES_H
 # include <inttypes.h>
 #else
 # if HAVE_STDINT_H
 #  include <stdint.h>
 # endif
 #endif
 #include <utils.h>
 /* Digest sizes in bytes (used as the length of the uint8_t output arrays) */
 #define SHA512_HASH_SIZE 64
 #define SHA512t256_HASH_SIZE 32
 /* Digest sizes in 64-bit words (8 bytes each, matching the byte sizes above) */
 #define SHA512_HASH_WORDS 8
 #define SHA512t256_HASH_WORDS 4
 /*
 * Running state of a SHA-512 or SHA-512/256 computation (the same context
 * type is used for both variants).
 */
 typedef struct _SHA512_Context {
  /*
  * totalLength: two-word message length that the finalizer byte-swaps and
  * appends as a 16-byte trailer (units presumably bits -- TODO confirm).
  * blocks: usage not visible in this header -- TODO confirm against sha512.c.
  */
  uint64_t totalLength[2], blocks;
  /* Hash state words; the finalizer byte-swaps them into the output digest. */
  uint64_t hash[SHA512_HASH_WORDS];
  /* Presumably the number of bytes currently held in buffer -- TODO confirm. */
  uint32_t bufferLength;
  /* 128-byte staging buffer, addressable as 64-bit words or as raw bytes. */
  union {
    uint64_t words[16];
    uint8_t bytes[128];
  } buffer;
 } SHA512_Context;
 /*
 * HMAC state: a pair of SHA-512 contexts. This one type is used by both the
 * HMAC_SHA512_* and HMAC_SHA512t256_* functions declared below.
 * NOTE(review): presumably these hold the outer and inner hash states of the
 * HMAC construction -- confirm against _hmac.c.
 */
 typedef struct {
  SHA512_Context outer;
  SHA512_Context inner;
 } HMAC_SHA512_Context;
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*
 * Symbol-name wrapper for every public function in this header. Unless the
 * includer has already defined APS_NAMESPACE, names get an "opt_" prefix via
 * token pasting, e.g. APS_NAMESPACE(SHA512_Init) becomes opt_SHA512_Init.
 */
 #ifndef APS_NAMESPACE
 #define APS_NAMESPACE(name) opt_##name
 #endif /* !APS_NAMESPACE */
 /*
 * SHA-512 streaming interface: Init, then Update with len bytes of data any
 * number of times, then Final to obtain the SHA512_HASH_SIZE-byte digest.
 */
 void APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc);
 void APS_NAMESPACE(SHA512_Update) (SHA512_Context *sc, const void *data, size_t len);
 void APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE]);
 /*
 * NOTE(review): presumably a one-time setup that picks a core routine from
 * the CPU information in pc; the meaning of the int result is not visible
 * here -- confirm against sha512.c.
 */
 int  APS_NAMESPACE(Init_SHA512) (processor_info_t *pc);
 /*
 * SHA-512/256 streaming interface. It reuses the SHA-512 context type
 * (SHA512t256_Context is only an alias) and produces a
 * SHA512t256_HASH_SIZE-byte digest.
 */
 #define SHA512t256_Context SHA512_Context
 void APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc);
 void APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len);
 void APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE]);
 /*
 * HMAC interfaces built on SHA-512 and SHA-512/256. Both families share the
 * HMAC_SHA512_Context type; Init takes a key of keyLen bytes, Update takes
 * len bytes of data, and Final writes a MAC of the underlying digest size.
 */
 void APS_NAMESPACE(HMAC_SHA512_Init) (HMAC_SHA512_Context *ctxt, const void *key, size_t keyLen);
 void APS_NAMESPACE(HMAC_SHA512_Update) (HMAC_SHA512_Context *ctxt, const void *data, size_t len);
 void APS_NAMESPACE(HMAC_SHA512_Final) (HMAC_SHA512_Context *ctxt, uint8_t hmac[SHA512_HASH_SIZE]);
 void APS_NAMESPACE(HMAC_SHA512t256_Init) (HMAC_SHA512_Context *ctxt, const void *key, size_t keyLen);
 void APS_NAMESPACE(HMAC_SHA512t256_Update) (HMAC_SHA512_Context *ctxt, const void *data, size_t len);
 void APS_NAMESPACE(HMAC_SHA512t256_Final) (HMAC_SHA512_Context *ctxt, uint8_t hmac[SHA512t256_HASH_SIZE]);
 /*
 * Intel's optimized SHA512 core routines. These routines are described in an
 * Intel White-Paper:
 * "Fast SHA-512 Implementations on Intel Architecture Processors"
 * Note: Works on AMD Bulldozer and later as well.
 *
 * These symbols are defined outside this header and are not wrapped in
 * APS_NAMESPACE.
 * NOTE(review): presumably input_data points at num_blks consecutive 128-byte
 * message blocks and digest at the eight 64-bit hash state words updated in
 * place -- confirm against the assembly sources.
 */
 extern void sha512_sse4(const void *input_data, void *digest, uint64_t num_blks);
 extern void sha512_avx(const void *input_data, void *digest, uint64_t num_blks);
 #ifdef __cplusplus
 }
 #endif
 #endif /* !_APS_SHA512_H */
--- a/main.c
+++ b/main.c
@ -2149,6 +2149,7 @@ main(int argc, char *argv[])
 	level = 6;
 	err = 0;
 	slab_init();
 	init_pcompress();
 	while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEew:rLPS:B:F")) != -1) {
 		int ovr;
@ -2341,7 +2342,6 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 	main_cancel = 0;
 	init_pcompress();
 	if (cksum == 0)
 		get_checksum_props(DEFAULT_CKSUM, &cksum, &cksum_bytes, &mac_bytes);
--- a/utils/xxhash_base.c
+++ b/utils/xxhash_base.c
@ -20,7 +20,6 @@ void * (*xxh32_init)(unsigned int seed) = NULL;
 int (*xxh32_feed)(void* state, const void* input, int len) = NULL;
 unsigned int (*xxh32_result)(void* state) = NULL;
 unsigned int (*xxh32_getIntermediateResult)(void* state) = NULL;
 #include <stdio.h>
 void
 XXH32_module_init() {