Major changes to use Intel's optimized SHA512 code for SHA512 and SHA512/256.

Remove the earlier SHA256 code, which is slower than SHA512/256 on 64-bit CPUs. Use the HMAC implementation from Allan Saddi for cleaner, faster code.
2013-01-25 22:55:55 +05:30 · 2013-01-25 22:55:55 +05:30 · 43af97042a
commit 43af97042a
parent 26bb137257
15 changed files with 1391 additions and 1540 deletions
--- a/Makefile.in
+++ b/Makefile.in
@@ -102,12 +102,11 @@ SKEINHDRS = crypto/skein/brg_endian.h crypto/skein/SHA3api_ref.h \
 	crypto/skein/skein_debug.h crypto/skein/skein_iv.h
 SKEINOBJS = $(SKEINSRCS:.c=.o)

-SHA256_SRCS = crypto/sha2/sha256.c
-SHA256_HDRS = crypto/sha2/sha256.h
-SHA256ASM_SRCS = crypto/sha2/intel/sha256_avx1.asm \
-	crypto/sha2/intel/sha256_sse4.asm
-SHA256ASM_OBJS = $(SHA256ASM_SRCS:.asm=.o)
-SHA256_OBJS = $(SHA256_SRCS:.c=.o)
+SHA2_SRCS = crypto/sha2/sha512.c
+SHA2_HDRS = crypto/sha2/sha512.h
+SHA2ASM_SRCS = crypto/sha2/intel/sha512_avx.asm crypto/sha2/intel/sha512_sse4.asm
+SHA2ASM_OBJS = $(SHA2ASM_SRCS:.asm=.o)
+SHA2_OBJS = $(SHA2_SRCS:.c=.o)

 YASM = @YASM@ -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX
 LIBBSCWRAP = libbsc_compress.c
@@ -161,7 +160,7 @@ LDLIBS = -ldl -L./buildtmp -Wl,-R@LIBBZ2_DIR@ -lbz2 -L./buildtmp -Wl,-R@LIBZ_DIR
 	-L./buildtmp -Wl,-R@OPENSSL_LIBDIR@ -lcrypto -lrt $(EXTRA_LDFLAGS)
 OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
 $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
-$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
+$(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
 $(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS)

 DEBUG_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@
@@ -199,7 +198,7 @@ SSE3_OPT_FLAG = -mssse3
 SSE2_OPT_FLAG = -msse2

 SKEIN_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
-SHA256_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
+SHA2_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@
 KECCAK_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@

 all: $(PROG)
@@ -237,10 +236,10 @@ $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
 $(SKEINOBJS): $(SKEINSRCS) $(SKEINHDRS)
 	$(COMPILE) $(SKEIN_FLAGS) $(@:.o=.c) -o $@

-$(SHA256_OBJS): $(SHA256_SRCS) $(SHA256_HDRS)
-	$(COMPILE) $(SHA256_FLAGS) $(@:.o=.c) -o $@
+$(SHA2_OBJS): $(SHA2_SRCS) $(SHA2_HDRS)
+	$(COMPILE) $(SHA2_FLAGS) $(@:.o=.c) -o $@

-$(SHA256ASM_OBJS): $(SHA256ASM_SRCS)
+$(SHA2ASM_OBJS): $(SHA2ASM_SRCS)
 	$(YASM)	-o $@ $(@:.o=.asm)

 $(KECCAK_OBJS): $(KECCAK_SRCS) $(KECCAK_HDRS)
--- a/8
+++ b/8
@@ -236,8 +236,8 @@ then
 			# Minimum yasm version 1.1
 			[ $major -lt 1 -o $minor -lt 1 ] && continue
 			yasm=${bindir}/yasm
-			sha256asmobjs='\$\(SHA256ASM_OBJS\)'
-			sha256objs='\$\(SHA256_OBJS\)'
+			sha256asmobjs='\$\(SHA2ASM_OBJS\)'
+			sha256objs='\$\(SHA2_OBJS\)'
 		fi
 	done
 	if [ "x${yasm}" = "x" ]
@@ -492,8 +492,8 @@ libbsclflagsvar="LIBBSCLFLAGS"
 libbscwrapobjvar="LIBBSCWRAPOBJ"
 libbscgenoptvar="LIBBSCGEN_OPT"
 libbsccppflagsvar="LIBBSCCPPFLAGS"
-sha256asmobjsvar="SHA256ASM_OBJS"
-sha256objsvar="SHA256_OBJS"
+sha256asmobjsvar="SHA2ASM_OBJS"
+sha256objsvar="SHA2_OBJS"
 yasmvar="YASM"
 fptr_flag_var="FPTR_FLAG"
 extra_opt_flags_var="EXTRA_OPT_FLAGS"
--- a/crypto/crypto_utils.c
+++ b/crypto/crypto_utils.c
@@ -36,7 +36,8 @@
 #include <openssl/rand.h>
 #include <openssl/evp.h>
 #include <openssl/hmac.h>
-#include <sha256.h>
+//#include <sha256.h>
+#include <sha512.h>
 #include <crypto_aes.h>
 #include <KeccakNISTInterface.h>
 #include <utils.h>
@@ -46,7 +47,7 @@
 #define	PROVIDER_OPENSSL	0
 #define	PROVIDER_X64_OPT	1

-static void init_sha256(void);
+static void init_sha512(void);
 static int geturandom_bytes(uchar_t rbytes[32]);
 /*
 * Checksum properties
@@ -66,9 +67,9 @@ static struct {
 	{"SKEIN512",	"512-bit SKEIN",
 			CKSUM_SKEIN512,		64,	64,	NULL},
 	{"SHA256",	"Intel's optimized (SSE,AVX) 256-bit SHA2 implementation for x86.",
-			CKSUM_SHA256,		32,	32,	init_sha256},
+			CKSUM_SHA256,		32,	32,	init_sha512},
 	{"SHA512",	"512-bit SHA2 from OpenSSL's crypto library.",
-			CKSUM_SHA512,		64,	64,	NULL},
+			CKSUM_SHA512,		64,	64,	init_sha512},
 	{"KECCAK256",	"Official 256-bit NIST SHA3 optimized implementation.",
 			CKSUM_KECCAK256,		32,	32,	NULL},
 	{"KECCAK512",	"Official 512-bit NIST SHA3 optimized implementation.",
@@ -190,18 +191,26 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes)
 			SHA256_Update(&ctx, buf, bytes);
 			SHA256_Final(cksum_buf, &ctx);
 		} else {
-			SHA256_Context ctx;
+			SHA512_Context ctx;

-			opt_SHA256_Init(&ctx);
-			opt_SHA256_Update(&ctx, buf, bytes);
-			opt_SHA256_Final(&ctx, cksum_buf);
+			opt_SHA512t256_Init(&ctx);
+			opt_SHA512t256_Update(&ctx, buf, bytes);
+			opt_SHA512t256_Final(&ctx, cksum_buf);
 		}
 	} else if (cksum == CKSUM_SHA512) {
-		SHA512_CTX ctx;
+		if (cksum_provider == PROVIDER_OPENSSL) {
+			SHA512_CTX ctx;

-		SHA512_Init(&ctx);
-		SHA512_Update(&ctx, buf, bytes);
-		SHA512_Final(cksum_buf, &ctx);
+			SHA512_Init(&ctx);
+			SHA512_Update(&ctx, buf, bytes);
+			SHA512_Final(cksum_buf, &ctx);
+		} else {
+			SHA512_Context ctx;
+
+			opt_SHA512_Init(&ctx);
+			opt_SHA512_Update(&ctx, buf, bytes);
+			opt_SHA512_Final(&ctx, cksum_buf);
+		}

 	} else if (cksum == CKSUM_KECCAK256) {
 		if (Keccak_Hash(256, buf, bytes * 8, cksum_buf) != 0)
@@ -219,7 +228,7 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes)
 }

 static void
-init_sha256(void)
+init_sha512(void)
 {
 #ifdef	WORDS_BIGENDIAN
 	cksum_provider = PROVIDER_OPENSSL;
@@ -227,7 +236,7 @@ init_sha256(void)
 #ifdef	__x86_64__
 	cksum_provider = PROVIDER_OPENSSL;
 	if (proc_info.proc_type == PROC_X64_INTEL || proc_info.proc_type == PROC_X64_AMD) {
-		if (opt_Init_SHA(&proc_info) == 0) {
+		if (opt_Init_SHA512(&proc_info) == 0) {
 			cksum_provider = PROVIDER_X64_OPT;
 		}
 	}
@@ -355,7 +364,7 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx)
 			}
 			mctx->mac_ctx_reinit = ctx;
 		} else {
-			HMAC_SHA256_Context *ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context));
+/*			HMAC_SHA256_Context *ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context));
 			if (!ctx) return (-1);
 			opt_HMAC_SHA256_Init(ctx, actx->pkey, KEYLEN);
 			mctx->mac_ctx = ctx;
@@ -366,26 +375,54 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx)
 				return (-1);
 			}
 			memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA256_Context));
+			mctx->mac_ctx_reinit = ctx;*/
+
+			HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
+			if (!ctx) return (-1);
+			opt_HMAC_SHA512t256_Init(ctx, actx->pkey, KEYLEN);
+			mctx->mac_ctx = ctx;
+
+			ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
+			if (!ctx) {
+				free(mctx->mac_ctx);
+				return (-1);
+			}
+			memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA512_Context));
 			mctx->mac_ctx_reinit = ctx;
 		}
 	} else if (cksum == CKSUM_SHA512) {
-		HMAC_CTX *ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX));
-		if (!ctx) return (-1);
-		HMAC_CTX_init(ctx);
-		HMAC_Init_ex(ctx, actx->pkey, KEYLEN, EVP_sha512(), NULL);
-		mctx->mac_ctx = ctx;
+		if (cksum_provider == PROVIDER_OPENSSL) {
+			HMAC_CTX *ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX));
+			if (!ctx) return (-1);
+			HMAC_CTX_init(ctx);
+			HMAC_Init_ex(ctx, actx->pkey, KEYLEN, EVP_sha512(), NULL);
+			mctx->mac_ctx = ctx;

-		ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX));
-		if (!ctx) {
-			free(mctx->mac_ctx);
-			return (-1);
+			ctx = (HMAC_CTX *)malloc(sizeof (HMAC_CTX));
+			if (!ctx) {
+				free(mctx->mac_ctx);
+				return (-1);
+			}
+			if (!HMAC_CTX_copy(ctx, (HMAC_CTX *)(mctx->mac_ctx))) {
+				free(ctx);
+				free(mctx->mac_ctx);
+				return (-1);
+			}
+			mctx->mac_ctx_reinit = ctx;
+		} else {
+			HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
+			if (!ctx) return (-1);
+			opt_HMAC_SHA512_Init(ctx, actx->pkey, KEYLEN);
+			mctx->mac_ctx = ctx;
+
+			ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
+			if (!ctx) {
+				free(mctx->mac_ctx);
+				return (-1);
+			}
+			memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA512_Context));
+			mctx->mac_ctx_reinit = ctx;
 		}
-		if (!HMAC_CTX_copy(ctx, (HMAC_CTX *)(mctx->mac_ctx))) {
-			free(ctx);
-			free(mctx->mac_ctx);
-			return (-1);
-		}
-		mctx->mac_ctx_reinit = ctx;

 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		hashState *ctx = (hashState *)malloc(sizeof (hashState));
@@ -423,16 +460,13 @@ hmac_reinit(mac_ctx_t *mctx)
 	if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) {
 		memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (Skein_512_Ctxt_t));

-	} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) {
+	} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) {
 		if (cksum_provider == PROVIDER_OPENSSL) {
 			HMAC_CTX_copy((HMAC_CTX *)(mctx->mac_ctx),
 				      (HMAC_CTX *)(mctx->mac_ctx_reinit));
 		} else {
-			memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (HMAC_SHA256_Context));
+			memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (HMAC_SHA512_Context));
 		}
-	} else if (cksum == CKSUM_SHA512) {
-		HMAC_CTX_copy((HMAC_CTX *)(mctx->mac_ctx), (HMAC_CTX *)(mctx->mac_ctx_reinit));
-
 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (hashState));
 	} else {
@@ -458,15 +492,19 @@ hmac_update(mac_ctx_t *mctx, uchar_t *data, uint64_t len)
 			HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len);
 #endif
 		} else {
-			opt_HMAC_SHA256_Update((HMAC_SHA256_Context *)(mctx->mac_ctx), data, len);
+			opt_HMAC_SHA512t256_Update((HMAC_SHA512_Context *)(mctx->mac_ctx), data, len);
 		}
 	} else if (cksum == CKSUM_SHA512) {
+		if (cksum_provider == PROVIDER_OPENSSL) {
 #ifndef __OSSL_OLD__
-		if (HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len) == 0)
-			return (-1);
+			if (HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len) == 0)
+				return (-1);
 #else
-		HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len);
+			HMAC_Update((HMAC_CTX *)(mctx->mac_ctx), data, len);
 #endif
+		} else {
+			opt_HMAC_SHA512_Update((HMAC_SHA512_Context *)(mctx->mac_ctx), data, len);
+		}

 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		// Keccak takes data length in bits so we have to scale
@@ -503,12 +541,16 @@ hmac_final(mac_ctx_t *mctx, uchar_t *hash, unsigned int *len)
 		if (cksum_provider == PROVIDER_OPENSSL) {
 			HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len);
 		} else {
-			opt_HMAC_SHA256_Final((HMAC_SHA256_Context *)(mctx->mac_ctx), hash);
+			opt_HMAC_SHA512t256_Final((HMAC_SHA512_Context *)(mctx->mac_ctx), hash);
 			*len = 32;
 		}
 	} else if (cksum == CKSUM_SHA512) {
-		HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len);
-
+		if (cksum_provider == PROVIDER_OPENSSL) {
+			HMAC_Final((HMAC_CTX *)(mctx->mac_ctx), hash, len);
+		} else {
+			opt_HMAC_SHA512_Final((HMAC_SHA512_Context *)(mctx->mac_ctx), hash);
+			*len = 64;
+		}
 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		if (Keccak_Final((hashState *)(mctx->mac_ctx), hash) != 0)
 			return (-1);
@@ -531,18 +573,14 @@ hmac_cleanup(mac_ctx_t *mctx)
 		memset(mctx->mac_ctx, 0, sizeof (Skein_512_Ctxt_t));
 		memset(mctx->mac_ctx_reinit, 0, sizeof (Skein_512_Ctxt_t));

-	} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) {
+	} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) {
 		if (cksum_provider == PROVIDER_OPENSSL) {
 			HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx));
 			HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx_reinit));
 		} else {
-			memset(mctx->mac_ctx, 0, sizeof (HMAC_SHA256_Context));
-			memset(mctx->mac_ctx_reinit, 0, sizeof (HMAC_SHA256_Context));
+			memset(mctx->mac_ctx, 0, sizeof (HMAC_SHA512_Context));
+			memset(mctx->mac_ctx_reinit, 0, sizeof (HMAC_SHA512_Context));
 		}
-	} else if (cksum == CKSUM_SHA512) {
-		HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx));
-		HMAC_CTX_cleanup((HMAC_CTX *)(mctx->mac_ctx_reinit));
-
 	} else if (cksum == CKSUM_KECCAK256 || cksum == CKSUM_KECCAK512) {
 		memset(mctx->mac_ctx, 0, sizeof (hashState));
 		memset(mctx->mac_ctx_reinit, 0, sizeof (hashState));
--- a/crypto/crypto_utils.h
+++ b/crypto/crypto_utils.h
@@ -33,7 +33,7 @@ extern "C" {
 #endif

 #define	MAX_PW_LEN	16
-#define	CKSUM_MASK		0x800
+#define	CKSUM_MASK		0x700
 #define	CKSUM_MAX_BYTES		64
 #define	DEFAULT_CKSUM		"SKEIN256"

--- a/crypto/sha2/_hmac.c
+++ b/crypto/sha2/_hmac.c
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2010, 2011 Allan Saddi <allan@saddi.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+void
+HMAC_INIT(HMAC_CONTEXT *ctxt, const void *key, size_t keyLen)
+{
+  HASH_CONTEXT keyCtxt;
+  unsigned int i;
+  uint8_t pkey[HASH_BLOCK_SIZE], okey[HASH_BLOCK_SIZE], ikey[HASH_BLOCK_SIZE];
+
+  /* Ensure key is zero-padded */
+  memset(pkey, 0, sizeof(pkey));
+
+  if (keyLen > sizeof(pkey)) {
+    /* Hash key if > HASH_BLOCK_SIZE */
+    HASH_INIT(&keyCtxt);
+    HASH_UPDATE(&keyCtxt, key, keyLen);
+    HASH_FINAL(&keyCtxt, pkey);
+  }
+  else {
+    memcpy(pkey, key, keyLen);
+  }
+
+  /* XOR with opad, ipad */
+  for (i = 0; i < sizeof(okey); i++) {
+    okey[i] = pkey[i] ^ 0x5c;
+  }
+  for (i = 0; i < sizeof(ikey); i++) {
+    ikey[i] = pkey[i] ^ 0x36;
+  }
+
+  /* Initialize hash contexts */
+  HASH_INIT(&ctxt->outer);
+  HASH_UPDATE(&ctxt->outer, okey, sizeof(okey));
+  HASH_INIT(&ctxt->inner);
+  HASH_UPDATE(&ctxt->inner, ikey, sizeof(ikey));
+
+  /* Burn the stack */
+  memset(ikey, 0, sizeof(ikey));
+  memset(okey, 0, sizeof(okey));
+  memset(pkey, 0, sizeof(pkey));
+  memset(&keyCtxt, 0, sizeof(keyCtxt));
+}
+
+void
+HMAC_UPDATE(HMAC_CONTEXT *ctxt, const void *data, size_t len)
+{
+  HASH_UPDATE(&ctxt->inner, data, len);
+}
+
+void
+HMAC_FINAL(HMAC_CONTEXT *ctxt, uint8_t hmac[HASH_SIZE])
+{
+  uint8_t ihash[HASH_SIZE];
+
+  HASH_FINAL(&ctxt->inner, ihash);
+  HASH_UPDATE(&ctxt->outer, ihash, sizeof(ihash));
+  HASH_FINAL(&ctxt->outer, hmac);
+
+  memset(ihash, 0, sizeof(ihash));
+}
--- a/crypto/sha2/intel/sha256_avx1.asm
+++ b/crypto/sha2/intel/sha256_avx1.asm
@@ -1,577 +0,0 @@
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright 2012 Intel Corporation All Rights Reserved.
-; 
-; The source code contained or described herein and all documents
-; related to the source code ("Material") are owned by Intel Corporation
-; or its suppliers or licensors. Title to the Material remains with
-; Intel Corporation or its suppliers and licensors. The Material may
-; contain trade secrets and proprietary and confidential information of
-; Intel Corporation and its suppliers and licensors, and is protected by
-; worldwide copyright and trade secret laws and treaty provisions. No
-; part of the Material may be used, copied, reproduced, modified,
-; published, uploaded, posted, transmitted, distributed, or disclosed in
-; any way without Intel's prior express written permission.
-; 
-; No license under any patent, copyright, trade secret or other
-; intellectual property right is granted to or conferred upon you by
-; disclosure or delivery of the Materials, either expressly, by
-; implication, inducement, estoppel or otherwise. Any license under such
-; intellectual property rights must be express and approved by Intel in
-; writing.
-; 
-; Unless otherwise agreed by Intel in writing, you may not remove or
-; alter this notice or any other notice embedded in Materials by Intel
-; or Intel's suppliers or licensors in any way.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Example YASM command lines:
-; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
-; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; This code is described in an Intel White-Paper:
-; "Fast SHA-256 Implementations on Intel Architecture Processors"
-;
-; To find it, surf to http://www.intel.com/p/en_US/embedded 
-; and search for that title.
-; The paper is expected to be released roughly at the end of April, 2012
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; This code schedules 1 blocks at a time, with 4 lanes per block
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-%define	VMOVDQ vmovdqu ;; assume buffers not aligned 
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
-
-; addm [mem], reg
-; Add reg to mem using reg-mem add and store
-%macro addm 2
-	add	%2, %1
-	mov	%1, %2
-%endm
-
-%macro MY_ROR 2
-	shld	%1,%1,(32-(%2))
-%endm
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-; Load xmm with mem and byte swap each dword
-%macro COPY_XMM_AND_BSWAP 3
-	VMOVDQ %1, %2
-	vpshufb %1, %1, %3
-%endmacro
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-%define X0 xmm4
-%define X1 xmm5
-%define X2 xmm6
-%define X3 xmm7
-
-%define XTMP0 xmm0
-%define XTMP1 xmm1
-%define XTMP2 xmm2
-%define XTMP3 xmm3
-%define XTMP4 xmm8
-%define XFER  xmm9
-%define XTMP5 xmm11
-
-%define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
-%define SHUF_DC00	xmm12 ; shuffle xDxC -> DC00
-%define BYTE_FLIP_MASK	xmm13
-	
-%ifdef LINUX
-%define NUM_BLKS rdx	; 3rd arg
-%define CTX	rsi	; 2nd arg
-%define INP	rdi	; 1st arg
-
-%define SRND	rdi	; clobbers INP
-%define c	ecx
-%define d 	r8d
-%define e 	edx
-%else
-%define NUM_BLKS r8	; 3rd arg
-%define CTX	rdx 	; 2nd arg
-%define INP	rcx 	; 1st arg
-
-%define SRND	rcx	; clobbers INP
-%define c 	edi 
-%define d	esi 
-%define e 	r8d
-	
-%endif
-%define TBL	rbp
-%define a eax
-%define b ebx
-
-%define f r9d
-%define g r10d
-%define h r11d
-
-%define y0 r13d
-%define y1 r14d
-%define y2 r15d
-
-
-_INP_END_SIZE	equ 8
-_INP_SIZE	equ 8
-_XFER_SIZE	equ 8
-%ifdef LINUX
-_XMM_SAVE_SIZE	equ 0
-%else
-_XMM_SAVE_SIZE	equ 8*16
-%endif
-; STACK_SIZE plus pushes must be an odd multiple of 8
-_ALIGN_SIZE	equ 8
-
-_INP_END	equ 0
-_INP		equ _INP_END  + _INP_END_SIZE
-_XFER		equ _INP      + _INP_SIZE
-_XMM_SAVE	equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
-STACK_SIZE	equ _XMM_SAVE + _XMM_SAVE_SIZE
-
-; rotate_Xs
-; Rotate values of symbols X0...X3
-%macro rotate_Xs 0
-%xdefine X_ X0
-%xdefine X0 X1
-%xdefine X1 X2
-%xdefine X2 X3
-%xdefine X3 X_
-%endm
-
-; ROTATE_ARGS
-; Rotate values of symbols a...h
-%macro ROTATE_ARGS 0
-%xdefine TMP_ h
-%xdefine h g
-%xdefine g f
-%xdefine f e
-%xdefine e d
-%xdefine d c
-%xdefine c b
-%xdefine b a
-%xdefine a TMP_
-%endm
-
-%macro FOUR_ROUNDS_AND_SCHED 0
-		;; compute s0 four at a time and s1 two at a time
-		;; compute W[-16] + W[-7] 4 at a time
-		;vmovdqa	XTMP0, X3
-	mov	y0, e		; y0 = e
-	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
-	mov	y1, a		; y1 = a
-		vpalignr	XTMP0, X3, X2, 4	; XTMP0 = W[-7]
-	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	mov	y2, f		; y2 = f
-	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-		;vmovdqa	XTMP1, X1
-	xor	y1, a		; y1 = a ^ (a >> (22-13)
-	xor	y2, g		; y2 = f^g
-		vpaddd	XTMP0, XTMP0, X0	; XTMP0 = W[-7] + W[-16]
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	and	y2, e		; y2 = (f^g)&e
-	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-		;; compute s0
-		vpalignr	XTMP1, X1, X0, 4	; XTMP1 = W[-15]
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-	
-		
-	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	add	y2, y0		; y2 = S1 + CH
-	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
-
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-	
-		vpsrld	XTMP2, XTMP1, 7
-		
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-	
-		vpslld	XTMP3, XTMP1, (32-7)
-		
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-	
-		vpor	XTMP3, XTMP3, XTMP2	; XTMP1 = W[-15] MY_ROR 7
-		
-	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-
-ROTATE_ARGS
-
-	mov	y0, e		; y0 = e
-	mov	y1, a		; y1 = a
-
-
-	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	mov	y2, f		; y2 = f
-	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
-
-		vpsrld	XTMP2, XTMP1,18
-
-	xor	y1, a		; y1 = a ^ (a >> (22-13)
-	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-	xor	y2, g		; y2 = f^g
-	
-		vpsrld	XTMP4, XTMP1, 3	; XTMP4 = W[-15] >> 3
-		
-	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	and	y2, e		; y2 = (f^g)&e
-	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
-		
-		vpslld	XTMP1, XTMP1, (32-18)
-
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-
-		vpxor	XTMP3, XTMP3, XTMP1
-
-	add	y2, y0		; y2 = S1 + CH
-	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
-	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	
-		vpxor	XTMP3, XTMP3, XTMP2	; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
-		
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-	
-		vpxor	XTMP1, XTMP3, XTMP4	; XTMP1 = s0
-		
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-		;; compute low s1
-		vpshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-		vpaddd	XTMP0, XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
-	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-
-ROTATE_ARGS
-		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
-		
-	mov	y0, e		; y0 = e
-	mov	y1, a		; y1 = a
-	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
-	
-		;vmovdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
-		
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
-	mov	y2, f		; y2 = f
-	xor	y1, a		; y1 = a ^ (a >> (22-13)
-	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-	
-		vpsrld	XTMP4, XTMP2, 10	; XTMP4 = W[-2] >> 10 {BBAA}
-		
-	xor	y2, g		; y2 = f^g
-	
-		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xBxA}
-		
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	and	y2, e		; y2 = (f^g)&e
-	
-		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xBxA}
-		
-	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
-		vpxor	XTMP2, XTMP2, XTMP3
-	add	y2, y0		; y2 = S1 + CH
-	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
-		vpxor	XTMP4, XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-		vpshufb	XTMP4, XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-		vpaddd	XTMP0, XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-		;; compute high s1
-		vpshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
-	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-
-ROTATE_ARGS
-		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
-	mov	y0, e		; y0 = e
-	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
-	mov	y1, a		; y1 = a
-		;vmovdqa	XTMP5,    XTMP2	; XTMP5    = W[-2] {DDCC}
-	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	mov	y2, f		; y2 = f
-	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-	
-		vpsrld	XTMP5, XTMP2,   10	; XTMP5 = W[-2] >> 10 {DDCC}
-		
-	xor	y1, a		; y1 = a ^ (a >> (22-13)
-	xor	y2, g		; y2 = f^g
-	
-		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xDxC}
-		
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	and	y2, e		; y2 = (f^g)&e
-	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-	
-		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xDxC}
-		
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-	
-		vpxor	XTMP2, XTMP2, XTMP3
-		
-	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	add	y2, y0		; y2 = S1 + CH
-	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
-		vpxor	XTMP5, XTMP5, XTMP2	; XTMP5 = s1 {xDxC}
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-		vpshufb	XTMP5, XTMP5, SHUF_DC00	; XTMP5 = s1 {DC00}
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-		vpaddd	X0, XTMP5, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-	
-ROTATE_ARGS
-rotate_Xs
-%endm
-
-;; input is [rsp + _XFER + %1 * 4]
-%macro DO_ROUND 1
-	mov	y0, e		; y0 = e
-	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
-	mov	y1, a		; y1 = a
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
-	mov	y2, f		; y2 = f
-	xor	y1, a		; y1 = a ^ (a >> (22-13)
-	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-	xor	y2, g		; y2 = f^g
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-	and	y2, e		; y2 = (f^g)&e
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-	add	y2, y0		; y2 = S1 + CH
-	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-	ROTATE_ARGS
-%endm
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
-;; arg 1 : pointer to input data
-;; arg 2 : pointer to digest
-;; arg 3 : Num blocks
-section .text
-global sha256_avx
-align 32
-sha256_avx:
-	push	rbx
-%ifndef LINUX
-	push	rsi
-	push	rdi
-%endif
-	push	rbp
-	push	r13
-	push	r14
-	push	r15
-
-	sub	rsp,STACK_SIZE
-%ifndef LINUX
-	vmovdqa	[rsp + _XMM_SAVE + 0*16],xmm6	
-	vmovdqa	[rsp + _XMM_SAVE + 1*16],xmm7
-	vmovdqa	[rsp + _XMM_SAVE + 2*16],xmm8	
-	vmovdqa	[rsp + _XMM_SAVE + 3*16],xmm9	
-	vmovdqa	[rsp + _XMM_SAVE + 4*16],xmm10
-	vmovdqa	[rsp + _XMM_SAVE + 5*16],xmm11
-	vmovdqa	[rsp + _XMM_SAVE + 6*16],xmm12
-	vmovdqa	[rsp + _XMM_SAVE + 7*16],xmm13
-%endif
-
-	shl	NUM_BLKS, 6	; convert to bytes
-	jz	done_hash
-	add	NUM_BLKS, INP	; pointer to end of data
-	mov	[rsp + _INP_END], NUM_BLKS
-
-	;; load initial digest
-	mov	a,[4*0 + CTX]
-	mov	b,[4*1 + CTX]
-	mov	c,[4*2 + CTX]
-	mov	d,[4*3 + CTX]
-	mov	e,[4*4 + CTX]
-	mov	f,[4*5 + CTX]
-	mov	g,[4*6 + CTX]
-	mov	h,[4*7 + CTX]
-
-	vmovdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
-	vmovdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
-	vmovdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
-
-loop0:
-	lea	TBL,[K256 wrt rip]
-
-	;; byte swap first 16 dwords
-	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
-	
-	mov	[rsp + _INP], INP
-
-	;; schedule 48 input dwords, by doing 3 rounds of 16 each
-	mov	SRND, 3
-align 16
-loop1:
-	vpaddd	XFER, X0, [TBL + 0*16]
-	vmovdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
-
-	vpaddd	XFER, X0, [TBL + 1*16]
-	vmovdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
-
-	vpaddd	XFER, X0, [TBL + 2*16]
-	vmovdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
-
-	vpaddd	XFER, X0, [TBL + 3*16]
-	vmovdqa	[rsp + _XFER], XFER
-	add	TBL, 4*16
-	FOUR_ROUNDS_AND_SCHED
-
-	sub	SRND, 1
-	jne	loop1
-
-	mov	SRND, 2
-loop2:
-	vpaddd	XFER, X0, [TBL + 0*16]
-	vmovdqa	[rsp + _XFER], XFER
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
-
-	vpaddd	XFER, X1, [TBL + 1*16]
-	vmovdqa	[rsp + _XFER], XFER
-	add	TBL, 2*16
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
-
-	vmovdqa	X0, X2
-	vmovdqa	X1, X3
-
-	sub	SRND, 1
-	jne	loop2
-
-
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
-
-	mov	INP, [rsp + _INP]
-	add	INP, 64
-	cmp	INP, [rsp + _INP_END]
-	jne	loop0
-
-done_hash:
-%ifndef LINUX
-	vmovdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
-	vmovdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
-	vmovdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
-	vmovdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
-	vmovdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
-	vmovdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
-	vmovdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
-	vmovdqa	xmm13,[rsp + _XMM_SAVE + 7*16]
-%endif
-
-
-	add	rsp, STACK_SIZE
-
-	pop	r15
-	pop	r14
-	pop	r13
-	pop	rbp
-%ifndef LINUX
-	pop	rdi
-	pop	rsi
-%endif
-	pop	rbx
-
-	ret	
-	
-
-section .data
-align 64
-K256:
-	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
-
-; shuffle xBxA -> 00BA
-_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-; shuffle xDxC -> DC00
-_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
--- a/crypto/sha2/intel/sha256_sse4.asm
+++ b/crypto/sha2/intel/sha256_sse4.asm
@ -1,535 +0,0 @@
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright 2012 Intel Corporation All Rights Reserved.
-; 
-; The source code contained or described herein and all documents
-; related to the source code ("Material") are owned by Intel Corporation
-; or its suppliers or licensors. Title to the Material remains with
-; Intel Corporation or its suppliers and licensors. The Material may
-; contain trade secrets and proprietary and confidential information of
-; Intel Corporation and its suppliers and licensors, and is protected by
-; worldwide copyright and trade secret laws and treaty provisions. No
-; part of the Material may be used, copied, reproduced, modified,
-; published, uploaded, posted, transmitted, distributed, or disclosed in
-; any way without Intel's prior express written permission.
-; 
-; No license under any patent, copyright, trade secret or other
-; intellectual property right is granted to or conferred upon you by
-; disclosure or delivery of the Materials, either expressly, by
-; implication, inducement, estoppel or otherwise. Any license under such
-; intellectual property rights must be express and approved by Intel in
-; writing.
-; 
-; Unless otherwise agreed by Intel in writing, you may not remove or
-; alter this notice or any other notice embedded in Materials by Intel
-; or Intel's suppliers or licensors in any way.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Example YASM command lines:
-; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
-; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; This code is described in an Intel White-Paper:
-; "Fast SHA-256 Implementations on Intel Architecture Processors"
-;
-; To find it, surf to http://www.intel.com/p/en_US/embedded 
-; and search for that title.
-; The paper is expected to be released roughly at the end of April, 2012
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; This code schedules 1 block at a time, with 4 lanes per block
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-%define	MOVDQ movdqu ;; assume buffers not aligned 
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
-
-; addm [mem], reg
-; Adds [mem] into reg, then stores the sum back to [mem] (reg keeps the sum)
-%macro addm 2
-	add	%2, %1		; reg (%2) += [mem] (%1)
-	mov	%1, %2		; [mem] = updated reg
-%endm
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-; Load xmm from mem (MOVDQ is movdqu, so mem may be unaligned) and byte swap each dword
-%macro COPY_XMM_AND_BSWAP 3
-	MOVDQ %1, %2		; %1 = 16 bytes loaded from %2
-	pshufb %1, %3		; reorder the bytes of %1 according to the mask in %3
-%endmacro
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-%define X0 xmm4
-%define X1 xmm5
-%define X2 xmm6
-%define X3 xmm7
-
-%define XTMP0 xmm0
-%define XTMP1 xmm1
-%define XTMP2 xmm2
-%define XTMP3 xmm3
-%define XTMP4 xmm8
-%define XFER  xmm9
-
-%define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
-%define SHUF_DC00	xmm11 ; shuffle xDxC -> DC00
-%define BYTE_FLIP_MASK	xmm12
-	
-%ifdef LINUX
-%define NUM_BLKS rdx	; 3rd arg
-%define CTX	rsi	; 2nd arg
-%define INP	rdi	; 1st arg
-
-%define SRND	rdi	; clobbers INP
-%define c	ecx
-%define d 	r8d
-%define e 	edx
-%else
-%define NUM_BLKS r8	; 3rd arg
-%define CTX	rdx 	; 2nd arg
-%define INP	rcx 	; 1st arg
-
-%define SRND	rcx	; clobbers INP
-%define c 	edi 
-%define d	esi 
-%define e 	r8d
-	
-%endif
-%define TBL	rbp
-%define a eax
-%define b ebx
-
-%define f r9d
-%define g r10d
-%define h r11d
-
-%define y0 r13d
-%define y1 r14d
-%define y2 r15d
-
-
-
-_INP_END_SIZE	equ 8
-_INP_SIZE	equ 8
-_XFER_SIZE	equ 8
-%ifdef LINUX
-_XMM_SAVE_SIZE	equ 0
-%else
-_XMM_SAVE_SIZE	equ 7*16
-%endif
-; STACK_SIZE plus pushes must be an odd multiple of 8
-_ALIGN_SIZE	equ 8
-
-_INP_END	equ 0
-_INP		equ _INP_END  + _INP_END_SIZE
-_XFER		equ _INP      + _INP_SIZE
-_XMM_SAVE	equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
-STACK_SIZE	equ _XMM_SAVE + _XMM_SAVE_SIZE
-
-; rotate_Xs
-; Rename symbols X0...X3: X0<-X1, X1<-X2, X2<-X3, X3<-old X0 (%xdefine only, emits no instructions)
-%macro rotate_Xs 0
-%xdefine X_ X0
-%xdefine X0 X1
-%xdefine X1 X2
-%xdefine X2 X3
-%xdefine X3 X_
-%endm
-
-; ROTATE_ARGS
-; Rename symbols a...h: h<-g, g<-f, ..., b<-a, a<-old h (%xdefine only, emits no instructions)
-%macro ROTATE_ARGS 0
-%xdefine TMP_ h
-%xdefine h g
-%xdefine g f
-%xdefine f e
-%xdefine e d
-%xdefine d c
-%xdefine c b
-%xdefine b a
-%xdefine a TMP_
-%endm
-
-%macro FOUR_ROUNDS_AND_SCHED 0
-		;; compute s0 four at a time and s1 two at a time (in the scalar-round comments below, '>>' stands for ror)
-		;; compute W[-16] + W[-7] 4 at a time
-		movdqa	XTMP0, X3
-	mov	y0, e		; y0 = e
-	ror	y0, (25-11)	; y0 = e >> (25-11)
-	mov	y1, a		; y1 = a
-		palignr	XTMP0, X2, 4	; XTMP0 = W[-7]
-	ror	y1, (22-13)	; y1 = a >> (22-13)
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	mov	y2, f		; y2 = f
-	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-		movdqa	XTMP1, X1
-	xor	y1, a		; y1 = a ^ (a >> (22-13))
-	xor	y2, g		; y2 = f^g
-		paddd	XTMP0, X0	; XTMP0 = W[-7] + W[-16]
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	and	y2, e		; y2 = (f^g)&e
-	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-		;; compute s0
-		palignr	XTMP1, X0, 4	; XTMP1 = W[-15]
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-		movdqa	XTMP2, XTMP1	; XTMP2 = W[-15]
-	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	add	y2, y0		; y2 = S1 + CH
-	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
-		movdqa	XTMP3, XTMP1	; XTMP3 = W[-15]
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-		pslld	XTMP1, (32-7)
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-		psrld	XTMP2, 7
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-		por	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7
-	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-
-ROTATE_ARGS
-		movdqa	XTMP2, XTMP3	; XTMP2 = W[-15]
-	mov	y0, e		; y0 = e
-	mov	y1, a		; y1 = a
-		movdqa	XTMP4, XTMP3	; XTMP4 = W[-15]
-	ror	y0, (25-11)	; y0 = e >> (25-11)
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	mov	y2, f		; y2 = f
-	ror	y1, (22-13)	; y1 = a >> (22-13)
-		pslld	XTMP3, (32-18)
-	xor	y1, a		; y1 = a ^ (a >> (22-13))
-	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-	xor	y2, g		; y2 = f^g
-		psrld	XTMP2, 18
-	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	and	y2, e		; y2 = (f^g)&e
-	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
-		pxor	XTMP1, XTMP3
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-		psrld	XTMP4, 3	; XTMP4 = W[-15] >> 3
-	add	y2, y0		; y2 = S1 + CH
-	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
-	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-		pxor	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-		pxor	XTMP1, XTMP4	; XTMP1 = s0
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-		;; compute low s1
-		pshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-		paddd	XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
-	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-
-ROTATE_ARGS
-		movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
-	mov	y0, e		; y0 = e
-	mov	y1, a		; y1 = a
-	ror	y0, (25-11)	; y0 = e >> (25-11)
-		movdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	ror	y1, (22-13)	; y1 = a >> (22-13)
-	mov	y2, f		; y2 = f
-	xor	y1, a		; y1 = a ^ (a >> (22-13))
-	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-		psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xBxA}
-	xor	y2, g		; y2 = f^g
-		psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xBxA}
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	and	y2, e		; y2 = (f^g)&e
-		psrld	XTMP4, 10	; XTMP4 = W[-2] >> 10 {BBAA}
-	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
-		pxor	XTMP2, XTMP3
-	add	y2, y0		; y2 = S1 + CH
-	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
-		pxor	XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-		pshufb	XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-		paddd	XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-		;; compute high s1
-		pshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
-	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-
-ROTATE_ARGS
-		movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
-	mov	y0, e		; y0 = e
-	ror	y0, (25-11)	; y0 = e >> (25-11)
-	mov	y1, a		; y1 = a
-		movdqa	X0,    XTMP2	; X0    = W[-2] {DDCC}
-	ror	y1, (22-13)	; y1 = a >> (22-13)
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	mov	y2, f		; y2 = f
-	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-		psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xDxC}
-	xor	y1, a		; y1 = a ^ (a >> (22-13))
-	xor	y2, g		; y2 = f^g
-		psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xDxC}
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	and	y2, e		; y2 = (f^g)&e
-	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-		psrld	X0,    10	; X0 = W[-2] >> 10 {DDCC}
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-		pxor	XTMP2, XTMP3
-	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	add	y2, y0		; y2 = S1 + CH
-	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
-		pxor	X0, XTMP2	; X0 = s1 {xDxC}
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-		pshufb	X0, SHUF_DC00	; X0 = s1 {DC00}
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-		paddd	X0, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-
-ROTATE_ARGS
-rotate_Xs
-%endm
-
-;; One round without message scheduling; its k+w input is the dword at [rsp + _XFER + %1 * 4] ('>>' in the comments stands for ror)
-%macro DO_ROUND 1
-	mov	y0, e		; y0 = e
-	ror	y0, (25-11)	; y0 = e >> (25-11)
-	mov	y1, a		; y1 = a
-	xor	y0, e		; y0 = e ^ (e >> (25-11))
-	ror	y1, (22-13)	; y1 = a >> (22-13)
-	mov	y2, f		; y2 = f
-	xor	y1, a		; y1 = a ^ (a >> (22-13))
-	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
-	xor	y2, g		; y2 = f^g
-	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
-	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
-	and	y2, e		; y2 = (f^g)&e
-	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
-	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
-	xor	y2, g		; y2 = CH = ((f^g)&e)^g
-	add	y2, y0		; y2 = S1 + CH
-	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
-	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
-	mov	y0, a		; y0 = a
-	add	h, y2		; h = h + S1 + CH + k + w
-	mov	y2, a		; y2 = a
-	or	y0, c		; y0 = a|c
-	add	d, h		; d = d + h + S1 + CH + k + w
-	and	y2, c		; y2 = a&c
-	and	y0, b		; y0 = (a|c)&b
-	add	h, y1		; h = h + S1 + CH + k + w + S0
-	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
-	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
-	ROTATE_ARGS
-%endm
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
-;; arg 1 : pointer to input data (read in 64-byte blocks)
-;; arg 2 : pointer to digest (8 dwords, read at entry and updated in place after every block)
-;; arg 3 : Num blocks (0 skips straight to the epilogue)
-section .text
-global sha256_sse4
-align 32
-sha256_sse4:
-	push	rbx
-%ifndef LINUX
-	push	rsi
-	push	rdi
-%endif
-	push	rbp
-	push	r13
-	push	r14
-	push	r15
-
-	sub	rsp,STACK_SIZE
-%ifndef LINUX
-	movdqa	[rsp + _XMM_SAVE + 0*16],xmm6	
-	movdqa	[rsp + _XMM_SAVE + 1*16],xmm7
-	movdqa	[rsp + _XMM_SAVE + 2*16],xmm8	
-	movdqa	[rsp + _XMM_SAVE + 3*16],xmm9	
-	movdqa	[rsp + _XMM_SAVE + 4*16],xmm10
-	movdqa	[rsp + _XMM_SAVE + 5*16],xmm11
-	movdqa	[rsp + _XMM_SAVE + 6*16],xmm12
-%endif
-
-	shl	NUM_BLKS, 6	; convert to bytes
-	jz	done_hash	; shifted length is zero: nothing to hash
-	add	NUM_BLKS, INP	; pointer to end of data
-	mov	[rsp + _INP_END], NUM_BLKS
-
-	;; load initial digest
-	mov	a,[4*0 + CTX]
-	mov	b,[4*1 + CTX]
-	mov	c,[4*2 + CTX]
-	mov	d,[4*3 + CTX]
-	mov	e,[4*4 + CTX]
-	mov	f,[4*5 + CTX]
-	mov	g,[4*6 + CTX]
-	mov	h,[4*7 + CTX]
-
-	movdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
-	movdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
-	movdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
-
-loop0:
-	lea	TBL,[K256 wrt rip]
-
-	;; byte swap first 16 dwords
-	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
-	
-	mov	[rsp + _INP], INP	; spill INP: SRND is defined as the same register
-
-	;; schedule 48 input dwords, by doing 3 rounds of 16 each
-	mov	SRND, 3
-align 16
-loop1:
-	movdqa	XFER, [TBL + 0*16]
-	paddd	XFER, X0
-	movdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
-
-	movdqa	XFER, [TBL + 1*16]
-	paddd	XFER, X0
-	movdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
-
-	movdqa	XFER, [TBL + 2*16]
-	paddd	XFER, X0
-	movdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
-
-	movdqa	XFER, [TBL + 3*16]
-	paddd	XFER, X0
-	movdqa	[rsp + _XFER], XFER
-	add	TBL, 4*16
-	FOUR_ROUNDS_AND_SCHED
-
-	sub	SRND, 1
-	jne	loop1
-
-	mov	SRND, 2		; 2 passes of 8 DO_ROUNDs each, no further scheduling
-loop2:
-	paddd	X0, [TBL + 0*16]
-	movdqa	[rsp + _XFER], X0
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
-	paddd	X1, [TBL + 1*16]
-	movdqa	[rsp + _XFER], X1
-	add	TBL, 2*16
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
-
-	movdqa	X0, X2		; the second pass consumes the words held in X2/X3
-	movdqa	X1, X3
-
-	sub	SRND, 1
-	jne	loop2
-
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
-
-	mov	INP, [rsp + _INP]	; reload the input pointer spilled at the top of loop0
-	add	INP, 64			; advance by one 64-byte block
-	cmp	INP, [rsp + _INP_END]
-	jne	loop0
-
-done_hash:
-%ifndef LINUX
-	movdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
-	movdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
-	movdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
-	movdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
-	movdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
-	movdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
-	movdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
-%endif
-
-	add	rsp, STACK_SIZE
-
-	pop	r15
-	pop	r14
-	pop	r13
-	pop	rbp
-%ifndef LINUX
-	pop	rdi
-	pop	rsi
-%endif
-	pop	rbx
-
-	ret	
-	
-
-section .data
-align 64
-K256:
-	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
-
-; shuffle xBxA -> 00BA
-_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-; shuffle xDxC -> DC00
-_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
--- a/crypto/sha2/intel/sha512_avx.asm
+++ b/crypto/sha2/intel/sha512_avx.asm
@ -0,0 +1,409 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright 2012 Intel Corporation All Rights Reserved.
+; 
+; The source code contained or described herein and all documents
+; related to the source code ("Material") are owned by Intel Corporation
+; or its suppliers or licensors. Title to the Material remains with
+; Intel Corporation or its suppliers and licensors. The Material may
+; contain trade secrets and proprietary and confidential information of
+; Intel Corporation and its suppliers and licensors, and is protected by
+; worldwide copyright and trade secret laws and treaty provisions. No
+; part of the Material may be used, copied, reproduced, modified,
+; published, uploaded, posted, transmitted, distributed, or disclosed in
+; any way without Intel's prior express written permission.
+; 
+; No license under any patent, copyright, trade secret or other
+; intellectual property right is granted to or conferred upon you by
+; disclosure or delivery of the Materials, either expressly, by
+; implication, inducement, estoppel or otherwise. Any license under such
+; intellectual property rights must be express and approved by Intel in
+; writing.
+; 
+; Unless otherwise agreed by Intel in writing, you may not remove or
+; alter this notice or any other notice embedded in Materials by Intel
+; or Intel's suppliers or licensors in any way.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows:  yasm -f x64 -D WINABI sha512_avx.asm
+; Linux:    yasm -f elf64 sha512_avx.asm
+;
+
+BITS 64
+section .text
+
+; Virtual Registers (symbolic names for the argument, scratch and SHA-512 state registers)
+%ifdef WINABI
+	%define msg	rcx ; ARG1
+	%define digest	rdx ; ARG2
+	%define msglen	r8  ; ARG3
+	%define T1	rsi ; round temporary; rsi is saved in frame.GPRSAVE under WINABI
+	%define T2	rdi ; round temporary; rdi is saved in frame.GPRSAVE under WINABI
+%else
+	%define msg	rdi ; ARG1
+	%define digest	rsi ; ARG2
+	%define msglen	rdx ; ARG3
+	%define T1	rcx ; round temporary
+	%define T2	r8  ; round temporary
+%endif
+%define a_64	r9  ; state word a
+%define b_64	r10 ; state word b
+%define c_64	r11 ; state word c
+%define d_64	r12 ; state word d (r12 is saved in frame.GPRSAVE)
+%define e_64	r13 ; state word e (r13 is saved in frame.GPRSAVE)
+%define f_64	r14 ; state word f (r14 is saved in frame.GPRSAVE)
+%define g_64	r15 ; state word g (r15 is saved in frame.GPRSAVE)
+%define h_64	rbx ; state word h (rbx is saved in frame.GPRSAVE)
+%define tmp0	rax ; scratch used to build S0/S1 and the a&c term
+
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP (here 696 = 87*8, or 776 = 97*8 with WINABI)
+struc frame
+	.W:       resq 80 ; Message Schedule
+	.WK:      resq  2 ; W[t] + K[t] | W[t+1] + K[t+1]
+
+%ifdef WINABI
+	.XMMSAVE: resdq 4 ; save area for xmm6-xmm9
+	.GPRSAVE: resq  7 ; save area for rbx, r12-r15, rsi, rdi
+%else
+	.GPRSAVE: resq  5 ; save area for rbx, r12-r15
+%endif
+endstruc
+
+; Useful QWORD "arrays" for simpler memory references (each expands to an address expression for use inside [ ])
+%define MSG(i)    msg    + 8*(i) ; Input message (arg1)
+%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
+%define K_t(i)    K512   + 8*(i) wrt rip ; SHA Constants (static mem)
+%define W_t(i)    rsp + frame.W  + 8*(i) ; Message Schedule (stack frame)
+%define WK_2(i)   rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
+; MSG, DIGEST, K_t, W_t are arrays
+; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+%macro RotateState 0
+	; Rotate symbols a..h right: h<-g, g<-f, ..., b<-a, a<-old h (%xdefine only, emits no instructions)
+	%xdefine	%%TMP h_64
+	%xdefine	h_64 g_64
+	%xdefine	g_64 f_64
+	%xdefine	f_64 e_64
+	%xdefine	e_64 d_64
+	%xdefine	d_64 c_64
+	%xdefine	c_64 b_64
+	%xdefine	b_64 a_64
+	%xdefine	a_64 %%TMP
+%endmacro
+
+%macro RORQ 2
+	; RORQ reg, n: rotate the 64-bit reg right by n bits (shld is faster than ror on Sandybridge)
+	shld	%1, %1, (64 - %2) ; shld reg,reg,k rotates left by k; left by 64-n == right by n
+%endmacro
+
+%macro SHA512_Round 1
+%assign %%t   (%1)
+
+	; Compute Round %%t (the number after each RORQ is the total rotation that term ends up with)
+	mov	T1,   f_64        ; T1 = f
+	mov	tmp0, e_64        ; tmp = e
+	xor	T1,   g_64        ; T1 = f ^ g
+	RORQ	tmp0, 23 ; 41     ; tmp = e ror 23
+	and	T1,   e_64        ; T1 = (f ^ g) & e
+	xor	tmp0, e_64        ; tmp = (e ror 23) ^ e
+	xor	T1,   g_64        ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	add	T1,   [WK_2(%%t)] ; W[t] + K[t] from message scheduler
+	RORQ	tmp0, 4 ; 18      ; tmp = ((e ror 23) ^ e) ror 4
+	xor	tmp0, e_64        ; tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov	T2,   a_64        ; T2 = a
+	add	T1,   h_64        ; T1 = CH(e,f,g) + W[t] + K[t] + h
+	RORQ	tmp0, 14 ; 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add	T1,   tmp0        ; T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
+	mov	tmp0, a_64        ; tmp = a
+	xor	T2,   c_64        ; T2 = a ^ c
+	and	tmp0, c_64        ; tmp = a & c
+	and	T2,   b_64        ; T2 = (a ^ c) & b
+	xor	T2,   tmp0        ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov	tmp0, a_64        ; tmp = a
+	RORQ	tmp0, 5 ; 39      ; tmp = a ror 5
+	xor	tmp0, a_64        ; tmp = (a ror 5) ^ a
+	add	d_64, T1          ; e(next_state) = d + T1
+	RORQ	tmp0, 6 ; 34      ; tmp = ((a ror 5) ^ a) ror 6
+	xor	tmp0, a_64        ; tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea	h_64, [T1 + T2]   ; a(next_state) = T1 + Maj(a,b,c)
+	RORQ	tmp0, 28 ; 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	h_64, tmp0        ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
+	RotateState
+%endmacro
+
+%macro SHA512_2Sched_2Round_avx 1
+%assign %%t %1
+	; Compute rounds %%t-2 and %%t-1
+	; Compute message schedule QWORDS %%t and %%t+1
+
+	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and 
+	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	; scheduler.
+	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+	; They are then added to their respective SHA512 constants at
+	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	;   For brevity, the comments following vectored instructions only refer to
+	; the first of a pair of QWORDS.
+	; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+	;   The computation of the message schedule and the rounds are tightly
+	; stitched to take advantage of instruction-level parallelism.
+	; For clarity, integer instructions (for the rounds calculation) are indented
+	; by one tab. Vectored instructions (for the message scheduler) are indented
+	; by two tabs.
+	;   The integer instructions are the same sequence as in SHA512_Round, which
+	; carries the per-instruction comments. WK_2 indexes modulo 2, so WK_2(%%t)
+	; and WK_2(%%t+1) address the slots holding the values for rounds %%t-2 and %%t-1.
+	;   All schedule/constant references use the macro-local %%t, so the macro
+	; depends only on its argument and not on a global symbol named t.
+
+		vmovdqa	xmm4, [W_t(%%t-2)]   ; XMM4 = W[t-2]
+		vmovdqu	xmm5, [W_t(%%t-15)]  ; XMM5 = W[t-15]
+	mov	T1,   f_64
+		vpsrlq	xmm0, xmm4, 61       ; XMM0 = W[t-2]>>61
+	mov	tmp0, e_64
+		vpsrlq	xmm6, xmm5, 1        ; XMM6 = W[t-15]>>1
+	xor	T1,   g_64
+	RORQ	tmp0, 23 ; 41
+		vpsrlq	xmm1, xmm4, 19       ; XMM1 = W[t-2]>>19
+	and	T1,   e_64
+	xor	tmp0, e_64
+		vpxor	xmm0, xmm1           ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+	xor	T1,   g_64
+	add	T1,   [WK_2(%%t)]; W[t-2] + K[t-2] stored by the previous iteration
+		vpsrlq	xmm7, xmm5, 8        ; XMM7 = W[t-15]>>8
+	RORQ	tmp0, 4 ; 18
+		vpsrlq	xmm2, xmm4, 6        ; XMM2 = W[t-2]>>6
+	xor	tmp0, e_64
+	mov	T2,   a_64
+	add	T1,   h_64
+		vpxor	xmm6, xmm7           ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+	RORQ	tmp0, 14 ; 14
+	add	T1,   tmp0
+		vpsrlq	xmm8, xmm5, 7        ; XMM8 = W[t-15]>>7
+	mov 	tmp0, a_64
+	xor	T2,   c_64
+		vpsllq	xmm3, xmm4, (64-61)  ; XMM3 = W[t-2]<<3
+	and	tmp0, c_64
+	and	T2,   b_64
+		vpxor	xmm2, xmm3           ; XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+	xor	T2,   tmp0
+	mov	tmp0, a_64
+		vpsllq	xmm9, xmm5, (64-1)   ; XMM9 = W[t-15]<<63
+	RORQ	tmp0, 5 ; 39
+		vpxor	xmm8, xmm9           ; XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+	xor	tmp0, a_64
+	add	d_64, T1
+	RORQ	tmp0, 6 ; 34
+	xor	tmp0, a_64
+		vpxor	xmm6, xmm8           ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63
+	lea	h_64, [T1 + T2]
+	RORQ 	tmp0, 28 ; 28
+		vpsllq	xmm4, (64-19)        ; XMM4 = W[t-2]<<45
+	add	h_64, tmp0
+	RotateState
+		vpxor	xmm0, xmm4           ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<45
+	mov	T1, f_64
+		vpxor	xmm0, xmm2           ; XMM0 = s1(W[t-2])
+	mov	tmp0, e_64
+	xor	T1,   g_64
+		vpaddq	xmm0, [W_t(%%t-16)]  ; XMM0 = s1(W[t-2]) + W[t-16]
+		vmovdqu	xmm1, [W_t(%%t- 7)]  ; XMM1 = W[t-7]
+	RORQ	tmp0, 23 ; 41
+	and	T1,   e_64
+	xor	tmp0, e_64
+	xor	T1,   g_64
+		vpsllq	xmm5, (64-8)         ; XMM5 = W[t-15]<<56
+	add	T1,   [WK_2(%%t+1)]        ; W[t-1] + K[t-1] stored by the previous iteration
+		vpxor	xmm6, xmm5           ; XMM6 = s0(W[t-15])
+	RORQ	tmp0, 4 ; 18
+		vpaddq	xmm0, xmm6           ; XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+	xor	tmp0, e_64
+		vpaddq	xmm0, xmm1           ; XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+	mov	T2,   a_64
+	add	T1,   h_64
+	RORQ	tmp0, 14 ; 14
+	add	T1,   tmp0
+		vmovdqa	[W_t(%%t)], xmm0      ; Store W[t]
+		vpaddq	xmm0, [K_t(%%t)]      ; Compute W[t]+K[t]
+		vmovdqa	[WK_2(%%t)], xmm0     ; Store W[t]+K[t] for next rounds (both WK reads above are already done)
+	mov	tmp0, a_64
+	xor	T2,   c_64
+	and	tmp0, c_64
+	and	T2,   b_64
+	xor	T2,   tmp0
+	mov	tmp0, a_64
+	RORQ	tmp0, 5 ; 39
+	xor	tmp0, a_64
+	add	d_64, T1
+	RORQ	tmp0, 6 ; 34
+	xor	tmp0, a_64
+	lea	h_64, [T1 + T2]
+	RORQ	tmp0, 28 ; 28
+	add	h_64, tmp0
+	RotateState
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_avx(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of SHA512
+;   message blocks.
+; L is the message length in SHA512 blocks (128 bytes each); L == 0 returns immediately
+global sha512_avx:function
+sha512_avx:
+	cmp	msglen, 0
+	je	.nowork               ; nothing has been allocated or saved yet
+	
+	; Allocate Stack Space
+	sub	rsp, frame_size
+
+	; Save GPRs
+	mov	[rsp + frame.GPRSAVE + 8 * 0], rbx
+	mov	[rsp + frame.GPRSAVE + 8 * 1], r12
+	mov	[rsp + frame.GPRSAVE + 8 * 2], r13
+	mov	[rsp + frame.GPRSAVE + 8 * 3], r14
+	mov	[rsp + frame.GPRSAVE + 8 * 4], r15
+%ifdef WINABI
+	mov	[rsp + frame.GPRSAVE + 8 * 5], rsi
+	mov	[rsp + frame.GPRSAVE + 8 * 6], rdi
+%endif
+	; Save XMMs
+%ifdef WINABI
+	vmovdqa	[rsp + frame.XMMSAVE + 16 * 0], xmm6
+	vmovdqa	[rsp + frame.XMMSAVE + 16 * 1], xmm7
+	vmovdqa	[rsp + frame.XMMSAVE + 16 * 2], xmm8
+	vmovdqa	[rsp + frame.XMMSAVE + 16 * 3], xmm9
+%endif	
+
+.updateblock:
+
+	; Load state variables
+	mov	a_64, [DIGEST(0)]
+	mov	b_64, [DIGEST(1)]
+	mov	c_64, [DIGEST(2)]
+	mov	d_64, [DIGEST(3)]
+	mov	e_64, [DIGEST(4)]
+	mov	f_64, [DIGEST(5)]
+	mov	g_64, [DIGEST(6)]
+	mov	h_64, [DIGEST(7)]
+
+	%assign t 0
+	%rep 80/2 + 1
+	; (80 rounds) / (2 rounds/iteration) + (1 iteration) = 41 unrolled iterations, t = 0, 2, ..., 80
+	; +1 iteration because the scheduler leads hashing by 1 iteration
+		%if t < 2
+			; BSWAP 2 QWORDS (no rounds yet: WK holds nothing to consume)
+			vmovdqa	xmm1, [XMM_QWORD_BSWAP wrt rip] ; mask stays in xmm1 for the t < 16 iterations
+			vmovdqu	xmm0, [MSG(t)]
+			vpshufb	xmm0, xmm0, xmm1     ; BSWAP
+			vmovdqa	[W_t(t)], xmm0       ; Store Scheduled Pair
+			vpaddq	xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
+			vmovdqa	[WK_2(t)], xmm0      ; Store into WK for rounds
+		%elif t < 16
+			; BSWAP 2 QWORDS, Compute 2 Rounds
+			vmovdqu	xmm0, [MSG(t)]
+			vpshufb	xmm0, xmm0, xmm1     ; BSWAP (xmm1 = mask loaded in the t = 0 iteration)
+			SHA512_Round t - 2           ; Round t-2
+			vmovdqa	[W_t(t)], xmm0       ; Store Scheduled Pair
+			vpaddq	xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
+			SHA512_Round t - 1           ; Round t-1
+			vmovdqa	[WK_2(t)], xmm0      ; W[t]+K[t] into WK (stored only after both rounds have read WK)
+		%elif t < 79
+			; Schedule 2 QWORDS; Compute 2 Rounds (t = 16, 18, ..., 78)
+			SHA512_2Sched_2Round_avx t
+		%else
+			; Compute 2 Rounds (t = 80: nothing left to schedule, final rounds 78 and 79)
+			SHA512_Round t - 2
+			SHA512_Round t - 1
+		%endif
+	%assign t t+2
+	%endrep
+
+	; Update digest
+	add	[DIGEST(0)], a_64
+	add	[DIGEST(1)], b_64
+	add	[DIGEST(2)], c_64
+	add	[DIGEST(3)], d_64
+	add	[DIGEST(4)], e_64
+	add	[DIGEST(5)], f_64
+	add	[DIGEST(6)], g_64
+	add	[DIGEST(7)], h_64
+
+	; Advance to next message block
+	add	msg, 16*8             ; 16 qwords = 128 bytes
+	dec	msglen
+	jnz	.updateblock
+
+	; Restore XMMs
+%ifdef WINABI
+	vmovdqa	xmm6, [rsp + frame.XMMSAVE + 16 * 0]
+	vmovdqa	xmm7, [rsp + frame.XMMSAVE + 16 * 1]
+	vmovdqa	xmm8, [rsp + frame.XMMSAVE + 16 * 2]
+	vmovdqa	xmm9, [rsp + frame.XMMSAVE + 16 * 3]
+%endif
+	; Restore GPRs
+	mov	rbx, [rsp + frame.GPRSAVE + 8 * 0]
+	mov	r12, [rsp + frame.GPRSAVE + 8 * 1]
+	mov	r13, [rsp + frame.GPRSAVE + 8 * 2]
+	mov	r14, [rsp + frame.GPRSAVE + 8 * 3]
+	mov	r15, [rsp + frame.GPRSAVE + 8 * 4]
+%ifdef WINABI
+	mov	rsi, [rsp + frame.GPRSAVE + 8 * 5]
+	mov	rdi, [rsp + frame.GPRSAVE + 8 * 6]
+%endif
+	; Restore Stack Pointer
+	add	rsp, frame_size
+
+.nowork:
+	ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP: 
+	ddq 0x08090a0b0c0d0e0f0001020304050607
+
+; K[t] used in SHA512 hashing
+K512:
+	dq 0x428a2f98d728ae22,0x7137449123ef65cd 
+	dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	dq 0x3956c25bf348b538,0x59f111f1b605d019 
+	dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	dq 0xd807aa98a3030242,0x12835b0145706fbe 
+	dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 
+	dq 0x9bdc06a725c71235,0xc19bf174cf692694
+	dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 
+	dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 
+	dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	dq 0x983e5152ee66dfab,0xa831c66d2db43210 
+	dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 
+	dq 0x06ca6351e003826f,0x142929670a0e6e70
+	dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 
+	dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	dq 0x650a73548baf63de,0x766a0abb3c77b2a8 
+	dq 0x81c2c92e47edaee6,0x92722c851482353b
+	dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 
+	dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+	dq 0xd192e819d6ef5218,0xd69906245565a910 
+	dq 0xf40e35855771202a,0x106aa07032bbd1b8
+	dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 
+	dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 
+	dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	dq 0x748f82ee5defb2fc,0x78a5636f43172f60 
+	dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	dq 0x90befffa23631e28,0xa4506cebde82bde9 
+	dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	dq 0xca273eceea26619c,0xd186b8c721c0c207 
+	dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 
+	dq 0x113f9804bef90dae,0x1b710b35131c471b
+	dq 0x28db77f523047d84,0x32caab7b40c72493 
+	dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 
+	dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
--- a/crypto/sha2/intel/sha512_sse4.asm
+++ b/crypto/sha2/intel/sha512_sse4.asm
@ -0,0 +1,398 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright 2012 Intel Corporation All Rights Reserved.
+; 
+; The source code contained or described herein and all documents
+; related to the source code ("Material") are owned by Intel Corporation
+; or its suppliers or licensors. Title to the Material remains with
+; Intel Corporation or its suppliers and licensors. The Material may
+; contain trade secrets and proprietary and confidential information of
+; Intel Corporation and its suppliers and licensors, and is protected by
+; worldwide copyright and trade secret laws and treaty provisions. No
+; part of the Material may be used, copied, reproduced, modified,
+; published, uploaded, posted, transmitted, distributed, or disclosed in
+; any way without Intel's prior express written permission.
+; 
+; No license under any patent, copyright, trade secret or other
+; intellectual property right is granted to or conferred upon you by
+; disclosure or delivery of the Materials, either expressly, by
+; implication, inducement, estoppel or otherwise. Any license under such
+; intellectual property rights must be express and approved by Intel in
+; writing.
+; 
+; Unless otherwise agreed by Intel in writing, you may not remove or
+; alter this notice or any other notice embedded in Materials by Intel
+; or Intel's suppliers or licensors in any way.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows:  yasm -f x64 -D WINABI sha512_sse4.asm
+; Linux:    yasm -f elf64 sha512_sse4.asm
+;
+; Alternative Example YASM command lines:
+; Windows:  yasm -Xvc -f x64 -D WINABI -rnasm -pnasm -o sha512_sse4.obj -g cv8 sha512_sse4.asm
+; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha512_sse4.o sha512_sse4.asm
+;
+
+BITS 64
+section .text
+
+; Virtual Registers: symbolic names for the GPRs used below; argument and temporary bindings depend on the ABI (-D WINABI)
+%ifdef WINABI
+	%define msg	rcx ; ARG1: pointer to the input message blocks
+	%define digest	rdx ; ARG2: pointer to the eight 64-bit digest words
+	%define msglen	r8  ; ARG3: number of message blocks to process
+	%define T1	rsi ; round temporary (rsi is saved/restored by sha512_sse4 under WINABI)
+	%define T2	rdi ; round temporary (rdi is saved/restored by sha512_sse4 under WINABI)
+%else
+	%define msg	rdi ; ARG1: pointer to the input message blocks
+	%define digest	rsi ; ARG2: pointer to the eight 64-bit digest words
+	%define msglen	rdx ; ARG3: number of message blocks to process
+	%define T1	rcx ; round temporary
+	%define T2	r8 ; round temporary
+%endif
+%define a_64	r9 ; working state a..h; RotateState re-binds these eight names after every round
+%define b_64	r10
+%define c_64	r11
+%define d_64	r12 ; r12-r15 and rbx are saved/restored by sha512_sse4
+%define e_64	r13
+%define f_64	r14
+%define g_64	r15
+%define h_64	rbx
+%define tmp0	rax ; scratch for the rotate/xor chains inside a round
+
+; Local variables (stack frame), addressed relative to RSP after "sub rsp, frame_size"
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP (here 80+2+5 = 87 qwords, or 80+2+7 = 89 under WINABI)
+struc frame
+	.W:       resq 80 ; Message Schedule
+	.WK:      resq  2 ; W[t] + K[t] | W[t+1] + K[t+1]
+; Save area for the GPRs stored in the prologue of sha512_sse4
+%ifdef WINABI
+	.GPRSAVE: resq 7 ; rbx, r12, r13, r14, r15, rsi, rdi
+%else
+	.GPRSAVE: resq 5 ; rbx, r12, r13, r14, r15
+%endif
+endstruc
+
+; Useful QWORD "arrays" for simpler memory references; each expands to an address expression to be wrapped in [ ]
+%define MSG(i)    msg    + 8*(i)               ; Input message (arg1)
+%define DIGEST(i) digest + 8*(i)               ; Output Digest (arg2)
+%define K_t(i)    K512   + 8*(i) wrt rip       ; SHA Constants (static mem)
+%define W_t(i)    rsp + frame.W  + 8*(i)       ; Message Schedule (stack frame)
+%define WK_2(i)   rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
+; MSG, DIGEST, K_t, W_t are arrays indexed in units of 8 bytes
+; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+%macro RotateState 0
+	; Rotate symbols a..h right. Emits no instructions: only the preprocessor names are re-bound, so the register that received the new "a" (written via h_64) is called a_64 in the next round, old a becomes b, and so on.
+	%xdefine %%TMP h_64 ; remember the register currently named h
+	%xdefine h_64  g_64
+	%xdefine g_64  f_64
+	%xdefine f_64  e_64
+	%xdefine e_64  d_64
+	%xdefine d_64  c_64
+	%xdefine c_64  b_64
+	%xdefine b_64  a_64
+	%xdefine a_64  %%TMP ; old h register now carries the new a
+%endmacro
+
+%macro SHA512_Round 1
+%assign %%t   (%1)
+; In the ror lines the number after the first ';' is the total rotation that term ends up with (e.g. 23+4+14 = 41): S1(e) = ror14 ^ ror18 ^ ror41 and S0(a) = ror28 ^ ror34 ^ ror39.
+	; Compute Round %%t: reads W[t]+K[t] from WK_2, writes the new e into d_64 and the new a into h_64, then renames the state registers
+	mov	T1,   f_64        ; T1 = f
+	mov	tmp0, e_64        ; tmp = e
+	xor	T1,   g_64        ; T1 = f ^ g
+	ror	tmp0, 23 ; 41     ; tmp = e ror 23
+	and	T1,   e_64        ; T1 = (f ^ g) & e
+	xor	tmp0, e_64        ; tmp = (e ror 23) ^ e
+	xor	T1,   g_64        ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	add	T1,   [WK_2(%%t)] ; W[t] + K[t] from message scheduler
+	ror	tmp0, 4 ; 18      ; tmp = ((e ror 23) ^ e) ror 4
+	xor	tmp0, e_64        ; tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov	T2,   a_64        ; T2 = a
+	add	T1,   h_64        ; T1 = CH(e,f,g) + W[t] + K[t] + h
+	ror	tmp0, 14 ; 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add	T1,   tmp0        ; T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
+	mov	tmp0, a_64        ; tmp = a
+	xor	T2,   c_64        ; T2 = a ^ c
+	and	tmp0, c_64        ; tmp = a & c
+	and	T2,   b_64        ; T2 = (a ^ c) & b
+	xor	T2,   tmp0        ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov	tmp0, a_64        ; tmp = a
+	ror	tmp0, 5 ; 39      ; tmp = a ror 5
+	xor	tmp0, a_64        ; tmp = (a ror 5) ^ a
+	add	d_64, T1          ; e(next_state) = d + T1 
+	ror	tmp0, 6 ; 34      ; tmp = ((a ror 5) ^ a) ror 6
+	xor	tmp0, a_64        ; tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea	h_64, [T1 + T2]   ; a(next_state) = T1 + Maj(a,b,c)
+	ror	tmp0, 28 ; 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	h_64, tmp0        ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
+	RotateState
+%endmacro
+
+%macro SHA512_2Sched_2Round_sse 1
+%assign %%t (%1)
+
+	; Compute rounds %%t-2 and %%t-1
+	; Compute message schedule QWORDS %%t and %%t+1
+
+	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and 
+	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	; scheduler.
+	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+	; They are then added to their respective SHA512 constants at
+	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	;   For brevity, the comments following vectored instructions only refer to
+	; the first of a pair of QWORDS.
+	; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+	;   The computation of the message schedule and the rounds are tightly
+	; stitched to take advantage of instruction-level parallelism.
+	; For clarity, integer instructions (for the rounds calculation) are indented
+	; by one tab. Vectored instructions (for the message scheduler) are indented
+	; by two tabs.
+	;   The argument must be even: the movdqa accesses to W_t(%%t-2), W_t(%%t),
+	; W_t(%%t-16), K_t(%%t) and WK_2(%%t) are 16-byte aligned only for even %%t.
+	;   Every index is taken from the macro argument (%%t); the macro does not
+	; depend on any symbol defined by the caller.
+	;   The shift chains build the rotates of s1 = ror19 ^ ror61 ^ shr6 and
+	; s0 = ror1 ^ ror8 ^ shr7 from a right-shift half (XMM0/XMM3) and a
+	; left-shift half (XMM1/XMM4) that are xor-ed together at the end.
+	;   Clobbers XMM0-XMM5, T1, T2 and tmp0.
+
+	mov	T1, f_64
+		movdqa	xmm2, [W_t(%%t-2)]  ; XMM2 = W[t-2]
+	xor	T1,   g_64
+	and	T1,   e_64
+		movdqa	xmm0, xmm2          ; XMM0 = W[t-2]
+	xor	T1,   g_64
+	add	T1,   [WK_2(%%t)]
+		movdqu	xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
+	mov	tmp0, e_64
+	ror	tmp0, 23 ; 41
+		movdqa	xmm3, xmm5          ; XMM3 = W[t-15]
+	xor	tmp0, e_64
+	ror	tmp0, 4 ; 18
+		psrlq	xmm0, 61 - 19       ; XMM0 = W[t-2] >> 42
+	xor	tmp0, e_64
+	ror	tmp0, 14 ; 14
+		psrlq	xmm3, (8 - 7)       ; XMM3 = W[t-15] >> 1
+	add	T1,   tmp0
+	add	T1,   h_64
+		pxor	xmm0, xmm2          ; XMM0 = (W[t-2] >> 42) ^ W[t-2]
+	mov	T2,   a_64
+	xor	T2,   c_64
+		pxor	xmm3, xmm5          ; XMM3 = (W[t-15] >> 1) ^ W[t-15]
+	and	T2,   b_64
+	mov	tmp0, a_64
+		psrlq	xmm0, 19 - 6        ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+	and	tmp0, c_64
+	xor	T2,   tmp0
+		psrlq	xmm3, (7 - 1)       ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+	mov	tmp0, a_64
+	ror	tmp0, 5 ; 39
+		pxor	xmm0, xmm2          ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+	xor	tmp0, a_64
+	ror	tmp0, 6 ; 34
+		pxor	xmm3, xmm5          ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+	xor	tmp0, a_64
+	ror	tmp0, 28 ; 28
+		psrlq	xmm0, 6             ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+	add	T2,   tmp0
+	add	d_64, T1 
+		psrlq	xmm3, 1             ; XMM3 = ((((W[t-15]>>1)^W[t-15])>>6)^W[t-15])>>1
+	lea	h_64, [T1 + T2]
+	RotateState
+		movdqa	xmm1, xmm2          ; XMM1 = W[t-2]
+	mov	T1, f_64
+	xor	T1,   g_64
+		movdqa	xmm4, xmm5          ; XMM4 = W[t-15]
+	and	T1,   e_64
+	xor	T1,   g_64
+		psllq	xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42
+	add	T1,   [WK_2(%%t+1)]
+	mov	tmp0, e_64
+		psllq	xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7
+	ror	tmp0, 23 ; 41
+	xor	tmp0, e_64
+		pxor	xmm1, xmm2          ; XMM1 = (W[t-2] << 42)^W[t-2]
+	ror	tmp0, 4 ; 18
+	xor	tmp0, e_64
+		pxor	xmm4, xmm5          ; XMM4 = (W[t-15]<<7)^W[t-15]
+	ror	tmp0, 14 ; 14
+	add	T1,   tmp0
+		psllq	xmm1, (64 - 61)     ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+	add	T1,   h_64
+	mov	T2,   a_64
+		psllq	xmm4, (64 - 8)      ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+	xor	T2,   c_64
+	and	T2,   b_64
+		pxor	xmm0, xmm1          ; XMM0 = s1(W[t-2])
+	mov	tmp0, a_64
+	and	tmp0, c_64
+		movdqu	xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
+	xor	T2,   tmp0
+		pxor	xmm3, xmm4          ; XMM3 = s0(W[t-15])
+	mov	tmp0, a_64
+		paddq	xmm0, xmm3          ; XMM0 = s1(W[t-2]) + s0(W[t-15])
+	ror	tmp0, 5 ; 39
+		paddq	xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+	xor	tmp0, a_64
+		paddq	xmm0, xmm1          ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+	ror	tmp0, 6 ; 34
+		movdqa	[W_t(%%t)], xmm0    ; Store scheduled qwords
+	xor	tmp0, a_64
+		paddq	xmm0, [K_t(%%t)]    ; Compute W[t]+K[t]
+	ror	tmp0, 28 ; 28
+		movdqa	[WK_2(%%t)], xmm0   ; Store W[t]+K[t] for next rounds
+	add	T2,   tmp0
+	add	d_64, T1
+	lea	h_64, [T1 + T2]
+	RotateState
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_sse4(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D (eight 64-bit words, read and written in place) with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of SHA512
+;   message blocks (the code consumes 16*8 = 128 bytes per block); M itself may be unaligned (loaded with movdqu).
+; L is the message length in SHA512 blocks. The function returns without touching D when L is 0.
+global sha512_sse4:function
+sha512_sse4:
+	cmp msglen, 0
+	je .nowork
+	; Nothing has been pushed or saved before this point, so the early exit above needs no epilogue
+	; Allocate Stack Space
+	sub	rsp, frame_size
+
+	; Save GPRs used for the working state (and, under WINABI, the two round temporaries)
+	mov	[rsp + frame.GPRSAVE + 8 * 0], rbx
+	mov	[rsp + frame.GPRSAVE + 8 * 1], r12
+	mov	[rsp + frame.GPRSAVE + 8 * 2], r13
+	mov	[rsp + frame.GPRSAVE + 8 * 3], r14
+	mov	[rsp + frame.GPRSAVE + 8 * 4], r15
+%ifdef WINABI
+	mov	[rsp + frame.GPRSAVE + 8 * 5], rsi
+	mov	[rsp + frame.GPRSAVE + 8 * 6], rdi
+%endif
+; One pass of the loop below hashes one message block
+.updateblock:
+
+	; Load state variables
+	mov	a_64, [DIGEST(0)]
+	mov	b_64, [DIGEST(1)]
+	mov	c_64, [DIGEST(2)]
+	mov	d_64, [DIGEST(3)]
+	mov	e_64, [DIGEST(4)]
+	mov	f_64, [DIGEST(5)]
+	mov	g_64, [DIGEST(6)]
+	mov	h_64, [DIGEST(7)]
+; The %rep below is fully unrolled at assembly time; t takes the values 0, 2, ..., 80
+	%assign t 0
+	%rep 80/2 + 1
+	; (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	; +1 iteration because the scheduler leads hashing by 1 iteration
+		%if t < 2
+			; BSWAP 2 QWORDS (t == 0 only: no rounds yet, just prime W[0..1] and WK)
+			movdqa	xmm1, [XMM_QWORD_BSWAP wrt rip]
+			movdqu	xmm0, [MSG(t)]
+			pshufb	xmm0, xmm1      ; BSWAP
+			movdqa	[W_t(t)], xmm0  ; Store Scheduled Pair
+			paddq	xmm0, [K_t(t)]  ; Compute W[t]+K[t]
+			movdqa	[WK_2(t)], xmm0 ; Store into WK for rounds
+		%elif t < 16
+			; BSWAP 2 QWORDS; Compute 2 Rounds (xmm1 still holds the byte-swap mask loaded at t == 0)
+			movdqu	xmm0, [MSG(t)]
+			pshufb	xmm0, xmm1      ; BSWAP
+			SHA512_Round t - 2      ; Round t-2
+			movdqa	[W_t(t)], xmm0  ; Store Scheduled Pair
+			paddq	xmm0, [K_t(t)]  ; Compute W[t]+K[t]
+			SHA512_Round t - 1      ; Round t-1
+			movdqa	[WK_2(t)], xmm0 ; Store W[t]+K[t] into WK
+		%elif t < 79
+			; Schedule 2 QWORDS; Compute 2 Rounds (t = 16..78)
+			SHA512_2Sched_2Round_sse t 
+		%else
+			; Compute 2 Rounds (t == 80: rounds 78 and 79, nothing left to schedule)
+			SHA512_Round t - 2
+			SHA512_Round t - 1
+		%endif
+	%assign t t+2
+	%endrep
+; 80 calls to RotateState are a multiple of 8, so a_64..h_64 name their original registers again here
+	; Update digest
+	add	[DIGEST(0)], a_64
+	add	[DIGEST(1)], b_64
+	add	[DIGEST(2)], c_64
+	add	[DIGEST(3)], d_64
+	add	[DIGEST(4)], e_64
+	add	[DIGEST(5)], f_64
+	add	[DIGEST(6)], g_64
+	add	[DIGEST(7)], h_64
+
+	; Advance to next message block (16 qwords = 128 bytes)
+	add	msg, 16*8
+	dec	msglen
+	jnz	.updateblock
+
+	; Restore GPRs
+	mov	rbx, [rsp + frame.GPRSAVE + 8 * 0]
+	mov	r12, [rsp + frame.GPRSAVE + 8 * 1]
+	mov	r13, [rsp + frame.GPRSAVE + 8 * 2]
+	mov	r14, [rsp + frame.GPRSAVE + 8 * 3]
+	mov	r15, [rsp + frame.GPRSAVE + 8 * 4]
+%ifdef WINABI
+	mov	rsi, [rsp + frame.GPRSAVE + 8 * 5]
+	mov	rdi, [rsp + frame.GPRSAVE + 8 * 6]
+%endif
+	; Restore Stack Pointer
+	add	rsp, frame_size
+
+.nowork:
+	ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP: 
+	ddq 0x08090a0b0c0d0e0f0001020304050607
+
+; K[t] used in SHA512 hashing
+K512:
+	dq 0x428a2f98d728ae22,0x7137449123ef65cd 
+	dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	dq 0x3956c25bf348b538,0x59f111f1b605d019 
+	dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	dq 0xd807aa98a3030242,0x12835b0145706fbe 
+	dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 
+	dq 0x9bdc06a725c71235,0xc19bf174cf692694
+	dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 
+	dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 
+	dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	dq 0x983e5152ee66dfab,0xa831c66d2db43210 
+	dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 
+	dq 0x06ca6351e003826f,0x142929670a0e6e70
+	dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 
+	dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	dq 0x650a73548baf63de,0x766a0abb3c77b2a8 
+	dq 0x81c2c92e47edaee6,0x92722c851482353b
+	dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 
+	dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+	dq 0xd192e819d6ef5218,0xd69906245565a910 
+	dq 0xf40e35855771202a,0x106aa07032bbd1b8
+	dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 
+	dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 
+	dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	dq 0x748f82ee5defb2fc,0x78a5636f43172f60 
+	dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	dq 0x90befffa23631e28,0xa4506cebde82bde9 
+	dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	dq 0xca273eceea26619c,0xd186b8c721c0c207 
+	dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 
+	dq 0x113f9804bef90dae,0x1b710b35131c471b
+	dq 0x28db77f523047d84,0x32caab7b40c72493 
+	dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 
+	dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
--- a/crypto/sha2/sha256.c
+++ b/crypto/sha2/sha256.c
@ -1,271 +0,0 @@
-/*-
- * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
- * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
- */
-
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif /* HAVE_CONFIG_H */
-
-#if HAVE_INTTYPES_H
-# include <inttypes.h>
-#else
-# if HAVE_STDINT_H
-#  include <stdint.h>
-# endif
-#endif
-
-#include <pthread.h>
-#include <string.h>
-#include <utils.h>
-#include <sha256.h>
-
-#ifdef WORDS_BIGENDIAN
-
-#define BYTESWAP(x) (x)
-#define BYTESWAP64(x) (x)
-
-#else /* WORDS_BIGENDIAN */
-
-#define BYTESWAP(x) htonl(x)
-#define BYTESWAP64(x) htonll(x)
-
-#endif /* WORDS_BIGENDIAN */
-typedef void (*update_func_ptr)(void *input_data, uint32_t digest[8], uint64_t num_blks);
-
-static uint8_t padding[64] = {
-  0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
-static const uint32_t iv256[SHA256_HASH_WORDS] = {
-  0x6a09e667L,
-  0xbb67ae85L,
-  0x3c6ef372L,
-  0xa54ff53aL,
-  0x510e527fL,
-  0x9b05688cL,
-  0x1f83d9abL,
-  0x5be0cd19L
-};
-
-static update_func_ptr sha_update_func;
-
-int
-APS_NAMESPACE(Init_SHA) (processor_info_t *pc)
-{
-	if (pc->proc_type == PROC_X64_INTEL || pc->proc_type == PROC_X64_AMD) {
-		if (pc->avx_level > 0) {
-			sha_update_func = sha256_avx;
-
-		} else if (pc->sse_level >= 4) {
-			sha_update_func = sha256_sse4;
-			
-		} else {
-			return (1);
-		}
-		return (0);
-	}
-	return (1);
-}
-
-static void
-_init (SHA256_Context *sc, const uint32_t iv[SHA256_HASH_WORDS])
-{
-	/*
-	 * SHA256_HASH_WORDS is 8, must be 8, cannot be anything but 8!
-	 * So we unroll a loop here.
-	 */
-	sc->hash[0] = iv[0];
-	sc->hash[1] = iv[1];
-	sc->hash[2] = iv[2];
-	sc->hash[3] = iv[3];
-	sc->hash[4] = iv[4];
-	sc->hash[5] = iv[5];
-	sc->hash[6] = iv[6];
-	sc->hash[7] = iv[7];
-
-	sc->totalLength = 0LL;
-	sc->bufferLength = 0L;
-}
-
-void
-APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc)
-{
-	_init (sc, iv256);
-}
-
-void
-APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, const void *vdata, size_t len)
-{
-	const uint8_t *data = (const uint8_t *)vdata;
-	uint32_t bufferBytesLeft;
-	size_t bytesToCopy;
-	int rem;
-
-	if (sc->bufferLength) {
-		do {
-			bufferBytesLeft = 64L - sc->bufferLength;
-			bytesToCopy = bufferBytesLeft;
-			if (bytesToCopy > len)
-				bytesToCopy = len;
-
-			memcpy (&sc->buffer.bytes[sc->bufferLength], data, bytesToCopy);
-			sc->totalLength += bytesToCopy * 8L;
-			sc->bufferLength += bytesToCopy;
-			data += bytesToCopy;
-			len -= bytesToCopy;
-
-			if (sc->bufferLength == 64L) {
-				sc->blocks = 1;
-				sha_update_func(sc->buffer.words, sc->hash, sc->blocks);
-				sc->bufferLength = 0L;
-			} else {
-				return;
-			}
-		} while (len > 0 && len <= 64L);
-		if (!len) return;
-	}
-
-	sc->blocks = len >> 6;
-	rem = len - (sc->blocks << 6);
-	len = sc->blocks << 6;
-	sc->totalLength += rem * 8L;
-
-	if (len) {
-		sc->totalLength += len * 8L;
-		sha_update_func((uint32_t *)data, sc->hash, sc->blocks);
-	}
-	if (rem) {
-		memcpy (&sc->buffer.bytes[0], data + len, rem);
-		sc->bufferLength = rem;
-	}
-}
-
-static void
-_final (SHA256_Context *sc, uint8_t *hash, int hashWords)
-{
-	uint32_t bytesToPad;
-	uint64_t lengthPad;
-	int i;
-
-	bytesToPad = 120L - sc->bufferLength;
-	if (bytesToPad > 64L)
-		bytesToPad -= 64L;
-
-	lengthPad = BYTESWAP64(sc->totalLength);
-
-	APS_NAMESPACE(SHA256_Update) (sc, padding, bytesToPad);
-	APS_NAMESPACE(SHA256_Update) (sc, &lengthPad, 8L);
-
-	if (hash) {
-		for (i = 0; i < hashWords; i++) {
-			hash[0] = (uint8_t) (sc->hash[i] >> 24);
-			hash[1] = (uint8_t) (sc->hash[i] >> 16);
-			hash[2] = (uint8_t) (sc->hash[i] >> 8);
-			hash[3] = (uint8_t) sc->hash[i];
-			hash += 4;
-		}
-	}
-}
-
-void
-APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE])
-{
-	_final (sc, hash, SHA256_HASH_WORDS);
-}
-
-/* Initialize an HMAC-SHA256 operation with the given key. */
-void
-APS_NAMESPACE(HMAC_SHA256_Init) (HMAC_SHA256_Context * ctx, const void * _K, size_t Klen)
-{
-	unsigned char pad[64];
-	unsigned char khash[32];
-	const unsigned char * K = (const unsigned char *)_K;
-	size_t i;
-
-	/* If Klen > 64, the key is really SHA256(K). */
-	if (Klen > 64) {
-		APS_NAMESPACE(SHA256_Init)(&ctx->ictx);
-		APS_NAMESPACE(SHA256_Update)(&ctx->ictx, K, Klen);
-		APS_NAMESPACE(SHA256_Final)(&ctx->ictx, khash);
-		K = khash;
-		Klen = 32;
-	}
-
-	/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-	APS_NAMESPACE(SHA256_Init)(&ctx->ictx);
-	memset(pad, 0x36, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-	APS_NAMESPACE(SHA256_Update)(&ctx->ictx, pad, 64);
-
-	/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-	APS_NAMESPACE(SHA256_Init)(&ctx->octx);
-	memset(pad, 0x5c, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-	APS_NAMESPACE(SHA256_Update)(&ctx->octx, pad, 64);
-
-	/* Clean the stack. */
-	memset(khash, 0, 32);
-}
-
-/* Add bytes to the HMAC-SHA256 operation. */
-void
-APS_NAMESPACE(HMAC_SHA256_Update) (HMAC_SHA256_Context * ctx, const void *in, size_t len)
-{
-	/* Feed data to the inner SHA256 operation. */
-	APS_NAMESPACE(SHA256_Update)(&ctx->ictx, in, len);
-}
-
-/* Finish an HMAC-SHA256 operation. */
-void
-APS_NAMESPACE(HMAC_SHA256_Final) (HMAC_SHA256_Context * ctx, unsigned char digest[32])
-{
-	unsigned char ihash[32];
-
-	/* Finish the inner SHA256 operation. */
-	APS_NAMESPACE(SHA256_Final)(&ctx->ictx, ihash);
-
-	/* Feed the inner hash to the outer SHA256 operation. */
-	APS_NAMESPACE(SHA256_Update)(&ctx->octx, ihash, 32);
-
-	/* Finish the outer SHA256 operation. */
-	APS_NAMESPACE(SHA256_Final)(&ctx->octx, digest);
-
-	/* Clean the stack. */
-	memset(ihash, 0, 32);
-}
--- a/crypto/sha2/sha256.h
+++ b/crypto/sha2/sha256.h
@ -1,90 +0,0 @@
-/*-
- * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
- * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _APS_SHA256_H
-#define _APS_SHA256_H
-
-#if HAVE_INTTYPES_H
-# include <inttypes.h>
-#else
-# if HAVE_STDINT_H
-#  include <stdint.h>
-# endif
-#endif
-
-#include <utils.h>
-
-#define SHA256_HASH_SIZE 32
-
-/* Hash size in 32-bit words */
-#define SHA256_HASH_WORDS 8
-
-typedef struct _SHA256_Context {
-	uint64_t totalLength, blocks;
-	uint32_t hash[SHA256_HASH_WORDS];
-	uint32_t bufferLength;
-	union {
-		uint32_t words[16];
-		uint8_t bytes[64];
-	} buffer;
-} SHA256_Context;
-
-typedef struct HMAC_SHA256Context {
-	SHA256_Context ictx;
-	SHA256_Context octx;
-} HMAC_SHA256_Context;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef APS_NAMESPACE
-#define APS_NAMESPACE(name) opt_##name
-#endif /* !APS_NAMESPACE */
-
-void APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc);
-void APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, const void *data, size_t len);
-void APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE]);
-int  APS_NAMESPACE(Init_SHA) (processor_info_t *pc);
-
-void APS_NAMESPACE(HMAC_SHA256_Init) (HMAC_SHA256_Context * ctx, const void * _K, size_t Klen);
-void APS_NAMESPACE(HMAC_SHA256_Update) (HMAC_SHA256_Context * ctx, const void *in, size_t len);
-void APS_NAMESPACE(HMAC_SHA256_Final) (HMAC_SHA256_Context * ctx, unsigned char digest[32]);
-
-/*
- * Intel's optimized SHA256 core routines. These routines are described in an
- * Intel White-Paper:
- * "Fast SHA-256 Implementations on Intel Architecture Processors"
- */
-extern void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);
-extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* !_APS_SHA256_H */
--- a/crypto/sha2/sha512.c
+++ b/crypto/sha2/sha512.c
@ -0,0 +1,294 @@
+/*-
+ * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
+ * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#else
+# if HAVE_STDINT_H
+#  include <stdint.h>
+# endif
+#endif
+
+#include <pthread.h>
+#include <string.h>
+#include <utils.h>
+#include "sha512.h"
+
+
+#ifdef WORDS_BIGENDIAN
+/* Big-endian host: values are already in the byte order the hash format uses. */
+#define BYTESWAP(x) (x)
+#define BYTESWAP64(x) (x)
+
+#else /* WORDS_BIGENDIAN */
+/* Little-endian host: convert to big-endian. NOTE(review): htonll is not a standard function; presumably supplied via utils.h - confirm. */
+#define BYTESWAP(x) htonl(x)
+#define BYTESWAP64(x) htonll(x)
+
+#endif /* WORDS_BIGENDIAN */
+/* Signature shared by the block-compression routines selected in Init_SHA512. */
+typedef void (*update_func_ptr)(const void *input_data, void *digest, uint64_t num_blks);
+
+static const uint8_t padding[128] = { /* one 0x80 marker byte followed by zeros; _final feeds the first bytesToPad bytes of this to SHA512_Update */
+  0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const uint64_t iv512[SHA512_HASH_WORDS] = { /* initial hash value loaded by SHA512_Init */
+  0x6a09e667f3bcc908LL,
+  0xbb67ae8584caa73bLL,
+  0x3c6ef372fe94f82bLL,
+  0xa54ff53a5f1d36f1LL,
+  0x510e527fade682d1LL,
+  0x9b05688c2b3e6c1fLL,
+  0x1f83d9abfb41bd6bLL,
+  0x5be0cd19137e2179LL
+};
+
+static const uint64_t iv256[SHA512_HASH_WORDS] = { /* initial hash value loaded by SHA512t256_Init (SHA-512/256); still eight 64-bit words */
+  0x22312194fc2bf72cLL,
+  0x9f555fa3c84c64c2LL,
+  0x2393b86b6f53b151LL,
+  0x963877195940eabdLL,
+  0x96283ee2a88effe3LL,
+  0xbe5e1e2553863992LL,
+  0x2b0199fc2c85b8aaLL,
+  0x0eb72ddc81c52ca2LL
+};
+
+static update_func_ptr sha512_update_func; /* block routine chosen by Init_SHA512; stays NULL until that call succeeds */
+
+/*
+ * Select the SHA-512 block routine matching the CPU described by pc.
+ * Returns 0 when a routine was installed in sha512_update_func, and 1 when
+ * the processor is not a supported x64 type or offers neither AVX nor
+ * SSE level 4 (in which case sha512_update_func is left unchanged).
+ */
+int
+APS_NAMESPACE(Init_SHA512) (processor_info_t *pc)
+{
+	/* Only the two x64 processor types have assembly routines. */
+	if (pc->proc_type != PROC_X64_INTEL && pc->proc_type != PROC_X64_AMD)
+		return (1);
+
+	/* AVX takes precedence over SSE4 when both are reported. */
+	if (pc->avx_level > 0) {
+		sha512_update_func = sha512_avx;
+		return (0);
+	}
+	if (pc->sse_level >= 4) {
+		sha512_update_func = sha512_sse4;
+		return (0);
+	}
+	return (1);
+}
+
+/*
+ * Reset a context for a new message: copy the supplied initial hash value
+ * into sc->hash and clear the bit counter and the buffered-byte count.
+ */
+static void
+_init (SHA512_Context *sc, const uint64_t iv[SHA512_HASH_WORDS])
+{
+  int word = 0;
+
+  while (word < SHA512_HASH_WORDS) {
+    sc->hash[word] = iv[word];
+    word++;
+  }
+
+  sc->bufferLength = 0L;
+  sc->totalLength[1] = 0LL;
+  sc->totalLength[0] = 0LL;
+}
+
+/* Prepare sc for a SHA-512 computation (loads iv512, clears all counters). */
+void
+APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc)
+{
+	_init(sc, iv512);
+}
+
+/* Prepare sc for a SHA-512/256 computation (loads iv256, clears all counters). */
+void
+APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc)
+{
+	_init(sc, iv256);
+}
+
+/*
+ * Add nbytes, expressed in bits, to the 128-bit message length kept in
+ * sc->totalLength ([0] holds the high 64 bits, [1] the low 64 bits).
+ * The three bits shifted out of the low word by the multiplication by 8
+ * are added to the high word instead of being dropped.
+ */
+static void
+sha512_add_length (SHA512_Context *sc, size_t nbytes)
+{
+	uint64_t n = (uint64_t) nbytes;
+	uint64_t lowBits = n << 3;
+
+	sc->totalLength[1] += lowBits;
+	if (sc->totalLength[1] < lowBits)	/* unsigned wrap-around => carry */
+		sc->totalLength[0]++;
+	sc->totalLength[0] += n >> 61;
+}
+
+/*
+ * Absorb len bytes starting at vdata into the hash state.
+ *
+ * Bytes are first used to complete a partially filled 128-byte buffer left
+ * over from an earlier call; every complete block after that is hashed
+ * straight from the caller's memory, and a trailing partial block is copied
+ * into sc->buffer for the next call. The routine installed by Init_SHA512
+ * (sha512_update_func) must have been set before this is called.
+ */
+void
+APS_NAMESPACE(SHA512_Update) (SHA512_Context *sc, const void *vdata, size_t len)
+{
+	const uint8_t *data = (const uint8_t *)vdata;
+	uint32_t bufferBytesLeft;
+	size_t bytesToCopy;
+	size_t rem;
+
+	if (sc->bufferLength) {
+		do {
+			bufferBytesLeft = 128L - sc->bufferLength;
+			bytesToCopy = bufferBytesLeft;
+			if (bytesToCopy > len)
+				bytesToCopy = len;
+
+			memcpy (&sc->buffer.bytes[sc->bufferLength], data, bytesToCopy);
+			sha512_add_length (sc, bytesToCopy);
+
+			sc->bufferLength += bytesToCopy;
+			data += bytesToCopy;
+			len -= bytesToCopy;
+
+			if (sc->bufferLength == 128L) {
+				sc->blocks = 1;
+				sha512_update_func(sc->buffer.words, sc->hash, sc->blocks);
+				sc->bufferLength = 0L;
+			} else {
+				/* Input exhausted before the buffer filled up. */
+				return;
+			}
+		} while (len > 0 && len <= 128L);
+		if (!len) return;
+	}
+
+	/* Account for everything that is left before len is rounded down. */
+	sha512_add_length (sc, len);
+
+	sc->blocks = len >> 7;			/* whole 128-byte blocks */
+	rem = len - (sc->blocks << 7);		/* trailing bytes, < 128 */
+	len = sc->blocks << 7;
+
+	if (len) {
+		/* Hash the complete blocks in place, without copying. */
+		sha512_update_func(data, sc->hash, sc->blocks);
+	}
+	if (rem) {
+		memcpy (&sc->buffer.bytes[0], data + len, rem);
+		sc->bufferLength = (uint32_t) rem;
+	}
+}
+
+/* SHA-512/256 absorbs data through the same routine as SHA-512; this simply forwards. */
+void
+APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len)
+{
+	APS_NAMESPACE(SHA512_Update)(sc, data, len);
+}
+
+/*
+ * Finish a hash computation.
+ *
+ * Appends the padding and the 128-bit big-endian message length, then, if
+ * hash is not NULL, writes the first hashWords 64-bit state words to hash
+ * in big-endian byte order. When halfWord is non-zero the upper 32 bits of
+ * the following state word are appended as four more bytes.
+ *
+ * hash may have any alignment: each word is copied with memcpy rather than
+ * stored through a uint64_t pointer.
+ */
+static void
+_final (SHA512_Context *sc, uint8_t *hash, int hashWords, int halfWord)
+{
+  uint32_t bytesToPad;
+  uint64_t lengthPad[2];
+  uint64_t word;
+  int i;
+
+  /*
+   * Pad so that the buffer holds 112 bytes (mod 128); the 16 length bytes
+   * then complete the final block. The result is always between 1 and 128.
+   */
+  bytesToPad = 240L - sc->bufferLength;
+  if (bytesToPad > 128L)
+    bytesToPad -= 128L;
+
+  /* Capture the length first: the Update calls below keep counting. */
+  lengthPad[0] = BYTESWAP64(sc->totalLength[0]);
+  lengthPad[1] = BYTESWAP64(sc->totalLength[1]);
+
+  APS_NAMESPACE(SHA512_Update) (sc, padding, bytesToPad);
+  APS_NAMESPACE(SHA512_Update) (sc, lengthPad, 16L);
+
+  if (hash) {
+    for (i = 0; i < hashWords; i++) {
+      word = BYTESWAP64(sc->hash[i]);
+      memcpy (hash, &word, sizeof (word));
+      hash += 8;
+    }
+    if (halfWord) {
+      hash[0] = (uint8_t) (sc->hash[i] >> 56);
+      hash[1] = (uint8_t) (sc->hash[i] >> 48);
+      hash[2] = (uint8_t) (sc->hash[i] >> 40);
+      hash[3] = (uint8_t) (sc->hash[i] >> 32);
+    }
+  }
+}
+
+/* Complete a SHA-512 computation and emit all SHA512_HASH_WORDS state words into hash. */
+void
+APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE])
+{
+	_final(sc, hash, SHA512_HASH_WORDS, 0);
+}
+
+/* Complete a SHA-512/256 computation and emit the first SHA512t256_HASH_WORDS state words into hash. */
+void
+APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE])
+{
+	_final(sc, hash, SHA512t256_HASH_WORDS, 0);
+}
+
+#define HASH_CONTEXT SHA512_Context /* first instantiation: HMAC over full SHA-512 */
+#define HASH_INIT APS_NAMESPACE(SHA512_Init)
+#define HASH_UPDATE APS_NAMESPACE(SHA512_Update)
+#define HASH_FINAL APS_NAMESPACE(SHA512_Final)
+#define HASH_SIZE SHA512_HASH_SIZE
+#define HASH_BLOCK_SIZE 128
+/* Names of the HMAC context type and entry points for this instantiation. */
+#define HMAC_CONTEXT HMAC_SHA512_Context
+#define HMAC_INIT APS_NAMESPACE(HMAC_SHA512_Init)
+#define HMAC_UPDATE APS_NAMESPACE(HMAC_SHA512_Update)
+#define HMAC_FINAL APS_NAMESPACE(HMAC_SHA512_Final)
+/* NOTE(review): _hmac.c is not visible here; presumably a template that defines HMAC_INIT/HMAC_UPDATE/HMAC_FINAL in terms of the HASH_* macros - confirm. */
+#include "_hmac.c"
+/* Drop every template parameter so the second instantiation can redefine them. */
+#undef HASH_CONTEXT
+#undef HASH_INIT
+#undef HASH_UPDATE
+#undef HASH_FINAL
+#undef HASH_SIZE
+#undef HASH_BLOCK_SIZE
+#undef HMAC_CONTEXT
+#undef HMAC_INIT
+#undef HMAC_UPDATE
+#undef HMAC_FINAL
+/* Second instantiation: same context types and block size, SHA-512/256 entry points and digest size. */
+#define HASH_CONTEXT SHA512_Context
+#define HASH_INIT APS_NAMESPACE(SHA512t256_Init)
+#define HASH_UPDATE APS_NAMESPACE(SHA512t256_Update)
+#define HASH_FINAL APS_NAMESPACE(SHA512t256_Final)
+#define HASH_SIZE SHA512t256_HASH_SIZE
+#define HASH_BLOCK_SIZE 128
+/* The HMAC context type is shared with the SHA-512 variant; only the function names differ. */
+#define HMAC_CONTEXT HMAC_SHA512_Context
+#define HMAC_INIT APS_NAMESPACE(HMAC_SHA512t256_Init)
+#define HMAC_UPDATE APS_NAMESPACE(HMAC_SHA512t256_Update)
+#define HMAC_FINAL APS_NAMESPACE(HMAC_SHA512t256_Final)
+/* The macros are not #undef'd after this include; they stay defined through the last line visible in this chunk. */
+#include "_hmac.c"
+
--- a/crypto/sha2/sha512.h
+++ b/crypto/sha2/sha512.h
@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
+ * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _APS_SHA512_H
+#define _APS_SHA512_H
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#else
+# if HAVE_STDINT_H
+#  include <stdint.h>
+# endif
+#endif
+
+#include <utils.h>
+
+#define SHA512_HASH_SIZE 64 /* SHA-512 digest size in bytes */
+#define SHA512t256_HASH_SIZE 32 /* SHA-512/256 digest size in bytes */
+
+/* Hash size in 64-bit words (the byte sizes above divided by 8) */
+#define SHA512_HASH_WORDS 8
+#define SHA512t256_HASH_WORDS 4
+
+typedef struct _SHA512_Context {
+  uint64_t totalLength[2], blocks; /* NOTE(review): presumably a 128-bit message length plus a block count - confirm in sha512.c */
+  uint64_t hash[SHA512_HASH_WORDS]; /* hash state words; emitted as the digest on finalisation */
+  uint32_t bufferLength; /* NOTE(review): presumably the number of bytes currently held in buffer - confirm */
+  union {
+    uint64_t words[16]; /* the same 128 bytes viewed as 64-bit words */
+    uint8_t bytes[128];
+  } buffer;
+} SHA512_Context; /* also used for SHA-512/256 via the SHA512t256_Context alias */
+
+typedef struct {
+  SHA512_Context outer; /* NOTE(review): presumably the outer hash state of the HMAC - confirm in _hmac.c */
+  SHA512_Context inner; /* NOTE(review): presumably the inner hash state of the HMAC - confirm in _hmac.c */
+} HMAC_SHA512_Context; /* taken by both the HMAC_SHA512 and HMAC_SHA512t256 functions */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef APS_NAMESPACE /* an includer may predefine its own naming macro */
+#define APS_NAMESPACE(name) opt_##name /* default: prefix every public symbol with opt_ */
+#endif /* !APS_NAMESPACE */
+
+void APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc);
+void APS_NAMESPACE(SHA512_Update) (SHA512_Context *sc, const void *data, size_t len);
+void APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE]);
+int  APS_NAMESPACE(Init_SHA512) (processor_info_t *pc); /* NOTE(review): presumably picks the SSE4/AVX core from CPU info - confirm */
+
+/* SHA-512/256 reuses the SHA-512 context type; its digest is SHA512t256_HASH_SIZE bytes */
+#define SHA512t256_Context SHA512_Context
+void APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc);
+void APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len);
+void APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE]);
+/* HMAC-SHA512: Final writes a SHA512_HASH_SIZE-byte MAC */
+void APS_NAMESPACE(HMAC_SHA512_Init) (HMAC_SHA512_Context *ctxt, const void *key, size_t keyLen);
+void APS_NAMESPACE(HMAC_SHA512_Update) (HMAC_SHA512_Context *ctxt, const void *data, size_t len);
+void APS_NAMESPACE(HMAC_SHA512_Final) (HMAC_SHA512_Context *ctxt, uint8_t hmac[SHA512_HASH_SIZE]);
+/* HMAC-SHA512/256: same context type, Final writes a SHA512t256_HASH_SIZE-byte MAC */
+void APS_NAMESPACE(HMAC_SHA512t256_Init) (HMAC_SHA512_Context *ctxt, const void *key, size_t keyLen);
+void APS_NAMESPACE(HMAC_SHA512t256_Update) (HMAC_SHA512_Context *ctxt, const void *data, size_t len);
+void APS_NAMESPACE(HMAC_SHA512t256_Final) (HMAC_SHA512_Context *ctxt, uint8_t hmac[SHA512t256_HASH_SIZE]);
+
+/*
+ * Intel's optimized SHA512 core routines, built from the .asm sources under
+ * crypto/sha2/intel/ and described in the Intel White-Paper
+ * "Fast SHA-512 Implementations on Intel Architecture Processors".
+ * Note: Works on AMD Bulldozer and later as well.
+ */
+extern void sha512_sse4(const void *input_data, void *digest, uint64_t num_blks);
+extern void sha512_avx(const void *input_data, void *digest, uint64_t num_blks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !_APS_SHA512_H */
--- a/main.c
+++ b/main.c
@ -2149,6 +2149,7 @@ main(int argc, char *argv[])
 	level = 6;
 	err = 0;
 	slab_init();
+	init_pcompress();

 	while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEew:rLPS:B:F")) != -1) {
 		int ovr;
@ -2341,7 +2342,6 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 	main_cancel = 0;
-	init_pcompress();

 	if (cksum == 0)
 		get_checksum_props(DEFAULT_CKSUM, &cksum, &cksum_bytes, &mac_bytes);
--- a/utils/xxhash_base.c
+++ b/utils/xxhash_base.c
@ -20,7 +20,6 @@ void * (*xxh32_init)(unsigned int seed) = NULL;
 int (*xxh32_feed)(void* state, const void* input, int len) = NULL;
 unsigned int (*xxh32_result)(void* state) = NULL;
 unsigned int (*xxh32_getIntermediateResult)(void* state) = NULL;
-#include <stdio.h>

 void
 XXH32_module_init() {