Incorporate SSE/AVX optimized Intel SHA-256 implementation.

Add support for runtime cpuid detection.
2012-10-06 20:51:23 +05:30 · 2012-10-06 20:51:23 +05:30 · 21cbef6d60
commit 21cbef6d60
parent c880b73d26
11 changed files with 1753 additions and 14 deletions
--- a/Makefile.in
+++ b/Makefile.in
@ -24,8 +24,9 @@
 PROG= pcompress
 MAINSRCS = main.c utils/utils.c allocator.c zlib_compress.c bzip2_compress.c \
 	lzma_compress.c ppmd_compress.c adaptive_compress.c lzfx_compress.c \
-	lz4_compress.c none_compress.c utils/xxhash.c utils/heapq.c
-MAINHDRS = allocator.h  pcompress.h  utils/utils.h utils/xxhash.h utils/heapq.h
+	lz4_compress.c none_compress.c utils/xxhash.c utils/heapq.c utils/cpuid.c
+MAINHDRS = allocator.h  pcompress.h  utils/utils.h utils/xxhash.h utils/heapq.h \
+	utils/cpuid.h
 MAINOBJS = $(MAINSRCS:.c=.o)

 RABINSRCS = rabin/rabin_dedup.c
@ -72,6 +73,14 @@ SKEINHDRS = crypto/skein/brg_endian.h crypto/skein/SHA3api_ref.h \
 	crypto/skein/skein_debug.h crypto/skein/skein_iv.h
 SKEINOBJS = $(SKEINSRCS:.c=.o)

+SHA256_SRCS = crypto/sha2/sha256.c
+SHA256_HDRS = crypto/sha2/sha256.h
+SHA256ASM_SRCS = crypto/sha2/intel/sha256_avx1.asm \
+	crypto/sha2/intel/sha256_sse4.asm
+SHA256ASM_OBJS = $(SHA256ASM_SRCS:.asm=.o)
+SHA256_OBJS = $(SHA256_SRCS:.c=.o)
+
+YASM = @YASM@ -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX
 LIBBSCWRAP = libbsc_compress.c
 LIBBSCWRAPOBJ = libbsc_compress.o
 LIBBSCDIR = @LIBBSCDIR@
@ -80,17 +89,20 @@ LIBBSCLIB = @LIBBSCLIB@
 LIBBSCGEN_OPT = -fopenmp
 LIBBSCCPPFLAGS = -I$(LIBBSCDIR)/libbsc -DENABLE_PC_LIBBSC

-BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~
+BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~ crypto/sha2/*~ \
+	crypto/sha2/intel/*~

 RM = rm -f
 COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \
 	-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
-	-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I@OPENSSL_INCDIR@
+	-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I@OPENSSL_INCDIR@ \
+	-I./crypto/sha2
 COMMON_VEC_FLAGS = -ftree-vectorize
 COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
 LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm @LIBBSCLFLAGS@ -L@OPENSSL_LIBDIR@ -lcrypto
 OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
-$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) $(SKEIN_BLOCK_OBJ)
+$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) $(SKEIN_BLOCK_OBJ) \
+@SHA256ASM_OBJS@ @SHA256_OBJS@

 DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@
 DEBUG_COMPILE = gcc -m64 -g -msse3 -c
@ -155,6 +167,12 @@ $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
 $(SKEINOBJS): $(SKEINSRCS) $(SKEINHDRS)
 	$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@

+$(SHA256_OBJS): $(SHA256_SRCS) $(SHA256_HDRS)
+	$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
+
+$(SHA256ASM_OBJS): $(SHA256ASM_SRCS)
+	$(YASM)	-o $@ $(@:.o=.asm)
+
 $(LIBBSCLIB):
 	(cd $(LIBBSCDIR); make)

--- a/37
+++ b/37
@ -36,6 +36,9 @@ libbsccppflags=
 openssl_prefix=
 openssl_libdir=
 openssl_incdir=
+sha256asmobjs=
+sha256objs=
+yasm=yasm

 while [ "${arg1}" != "" ]
 do
@ -98,6 +101,33 @@ echo $plat | egrep 'x86_64|amd64' > /dev/null
 if [ $? -eq 0 ]
 then
 	skeinblock='\$\(SKEIN_BLOCK_ASM\)'
+	yasm=
+
+	#
+	# Detect Yasm
+	#
+	for bindir in /bin /usr/bin /usr/local/bin
+	do
+		if [ -x ${bindir}/yasm ]
+		then
+			# Get yasm version
+			yver=`${bindir}/yasm --version | head -1 | awk '{print $2}'`
+			_OIFS=$IFS; IFS="."; set -- ${yver}; IFS="$_OIFS"
+			major=$1
+			minor=$2
+
+			# Minimum yasm version 1.1
+			[ $major -lt 1 -o $minor -lt 1 ] && continue
+			yasm=${bindir}/yasm
+			sha256asmobjs='\$\(SHA256ASM_OBJS\)'
+			sha256objs='\$\(SHA256_OBJS\)'
+		fi
+	done
+	if [ "x${yasm}" = "x" ]
+	then
+		echo "Yasm version 1.1 or later is required to build on x64 platforms"
+		exit 1
+	fi
 fi

 # Detect OpenSSL library
@ -152,6 +182,7 @@ then
 	exit 1
 fi

+
 linkvar="LINK"
 compilevar="COMPILE"
 compilecppvar="COMPILE_cpp"
@ -171,6 +202,9 @@ libbsclflagsvar="LIBBSCLFLAGS"
 libbscwrapobjvar="LIBBSCWRAPOBJ"
 libbscgenoptvar="LIBBSCGEN_OPT"
 libbsccppflagsvar="LIBBSCCPPFLAGS"
+sha256asmobjsvar="SHA256ASM_OBJS"
+sha256objsvar="SHA256_OBJS"
+yasmvar="YASM"

 openssllibdirvar="OPENSSL_LIBDIR"
 opensslincdirvar="OPENSSL_INCDIR"
@ -202,5 +236,8 @@ s#@${libbsccppflagsvar}@#${libbsccppflags}#g
 s#@${skeinblockvar}@#${skeinblock}#g
 s#@${openssllibdirvar}@#${openssl_libdir}#g
 s#@${opensslincdirvar}@#${openssl_incdir}#g
+s#@${sha256asmobjsvar}@#${sha256asmobjs}#g
+s#@${sha256objsvar}@#${sha256objs}#g
+s#@${yasmvar}@#${yasm}#g
 " > Makefile

--- a/crypto/sha2/intel/open_software_license.txt
+++ b/crypto/sha2/intel/open_software_license.txt
@ -0,0 +1,32 @@
+Copyright (c) 2012, Intel Corporation 
+
+All rights reserved. 
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met: 
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.  
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the
+  distribution. 
+
+* Neither the name of the Intel Corporation nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission. 
+
+
+THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/crypto/sha2/intel/sha256_avx1.asm
+++ b/crypto/sha2/intel/sha256_avx1.asm
@ -0,0 +1,577 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright 2012 Intel Corporation All Rights Reserved.
+; 
+; The source code contained or described herein and all documents
+; related to the source code ("Material") are owned by Intel Corporation
+; or its suppliers or licensors. Title to the Material remains with
+; Intel Corporation or its suppliers and licensors. The Material may
+; contain trade secrets and proprietary and confidential information of
+; Intel Corporation and its suppliers and licensors, and is protected by
+; worldwide copyright and trade secret laws and treaty provisions. No
+; part of the Material may be used, copied, reproduced, modified,
+; published, uploaded, posted, transmitted, distributed, or disclosed in
+; any way without Intel's prior express written permission.
+; 
+; No license under any patent, copyright, trade secret or other
+; intellectual property right is granted to or conferred upon you by
+; disclosure or delivery of the Materials, either expressly, by
+; implication, inducement, estoppel or otherwise. Any license under such
+; intellectual property rights must be express and approved by Intel in
+; writing.
+; 
+; Unless otherwise agreed by Intel in writing, you may not remove or
+; alter this notice or any other notice embedded in Materials by Intel
+; or Intel's suppliers or licensors in any way.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
+; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded 
+; and search for that title.
+; The paper is expected to be released roughly at the end of April, 2012
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define	VMOVDQ vmovdqu ;; assume buffers not aligned 
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; addm [mem], reg
+; Add reg to mem using reg-mem add and store
+%macro addm 2
+	add	%2, %1
+	mov	%1, %2
+%endm
+
+%macro MY_ROR 2
+	shld	%1,%1,(32-(%2))
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+	VMOVDQ %1, %2
+	vpshufb %1, %1, %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XTMP4 xmm8
+%define XFER  xmm9
+%define XTMP5 xmm11
+
+%define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00	xmm12 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK	xmm13
+	
+%ifdef LINUX
+%define NUM_BLKS rdx	; 3rd arg
+%define CTX	rsi	; 2nd arg
+%define INP	rdi	; 1st arg
+
+%define SRND	rdi	; clobbers INP
+%define c	ecx
+%define d 	r8d
+%define e 	edx
+%else
+%define NUM_BLKS r8	; 3rd arg
+%define CTX	rdx 	; 2nd arg
+%define INP	rcx 	; 1st arg
+
+%define SRND	rcx	; clobbers INP
+%define c 	edi 
+%define d	esi 
+%define e 	r8d
+	
+%endif
+%define TBL	rbp
+%define a eax
+%define b ebx
+
+%define f r9d
+%define g r10d
+%define h r11d
+
+%define y0 r13d
+%define y1 r14d
+%define y2 r15d
+
+
+_INP_END_SIZE	equ 8
+_INP_SIZE	equ 8
+_XFER_SIZE	equ 8
+%ifdef LINUX
+_XMM_SAVE_SIZE	equ 0
+%else
+_XMM_SAVE_SIZE	equ 8*16
+%endif
+; STACK_SIZE plus pushes must be an odd multiple of 8
+_ALIGN_SIZE	equ 8
+
+_INP_END	equ 0
+_INP		equ _INP_END  + _INP_END_SIZE
+_XFER		equ _INP      + _INP_SIZE
+_XMM_SAVE	equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
+STACK_SIZE	equ _XMM_SAVE + _XMM_SAVE_SIZE
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+		;; compute s0 four at a time and s1 two at a time
+		;; compute W[-16] + W[-7] 4 at a time
+		;vmovdqa	XTMP0, X3
+	mov	y0, e		; y0 = e
+	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
+	mov	y1, a		; y1 = a
+		vpalignr	XTMP0, X3, X2, 4	; XTMP0 = W[-7]
+	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	mov	y2, f		; y2 = f
+	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+		;vmovdqa	XTMP1, X1
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	xor	y2, g		; y2 = f^g
+		vpaddd	XTMP0, XTMP0, X0	; XTMP0 = W[-7] + W[-16]
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+		;; compute s0
+		vpalignr	XTMP1, X1, X0, 4	; XTMP1 = W[-15]
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	
+		
+	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
+
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+	
+		vpsrld	XTMP2, XTMP1, 7
+		
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+	
+		vpslld	XTMP3, XTMP1, (32-7)
+		
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+	
+		vpor	XTMP3, XTMP3, XTMP2	; XTMP1 = W[-15] MY_ROR 7
+		
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+
+	mov	y0, e		; y0 = e
+	mov	y1, a		; y1 = a
+
+
+	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	mov	y2, f		; y2 = f
+	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
+
+		vpsrld	XTMP2, XTMP1,18
+
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor	y2, g		; y2 = f^g
+	
+		vpsrld	XTMP4, XTMP1, 3	; XTMP4 = W[-15] >> 3
+		
+	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+		
+		vpslld	XTMP1, XTMP1, (32-18)
+
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+
+		vpxor	XTMP3, XTMP3, XTMP1
+
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
+	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	
+		vpxor	XTMP3, XTMP3, XTMP2	; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+		
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+	
+		vpxor	XTMP1, XTMP3, XTMP4	; XTMP1 = s0
+		
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		;; compute low s1
+		vpshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+		vpaddd	XTMP0, XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
+		
+	mov	y0, e		; y0 = e
+	mov	y1, a		; y1 = a
+	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
+	
+		;vmovdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
+		
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
+	mov	y2, f		; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+	
+		vpsrld	XTMP4, XTMP2, 10	; XTMP4 = W[-2] >> 10 {BBAA}
+		
+	xor	y2, g		; y2 = f^g
+	
+		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xBxA}
+		
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	
+		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xBxA}
+		
+	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+		vpxor	XTMP2, XTMP2, XTMP3
+	add	y2, y0		; y2 = S1 + CH
+	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
+		vpxor	XTMP4, XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		vpshufb	XTMP4, XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		vpaddd	XTMP0, XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+		;; compute high s1
+		vpshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
+	mov	y0, e		; y0 = e
+	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
+	mov	y1, a		; y1 = a
+		;vmovdqa	XTMP5,    XTMP2	; XTMP5    = W[-2] {DDCC}
+	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	mov	y2, f		; y2 = f
+	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+	
+		vpsrld	XTMP5, XTMP2,   10	; XTMP5 = W[-2] >> 10 {DDCC}
+		
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	xor	y2, g		; y2 = f^g
+	
+		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xDxC}
+		
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	
+		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xDxC}
+		
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	
+		vpxor	XTMP2, XTMP2, XTMP3
+		
+	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
+		vpxor	XTMP5, XTMP5, XTMP2	; XTMP5 = s1 {xDxC}
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		vpshufb	XTMP5, XTMP5, SHUF_DC00	; XTMP5 = s1 {DC00}
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		vpaddd	X0, XTMP5, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+	
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; input is [rsp + _XFER + %1 * 4]
+%macro DO_ROUND 1
+	mov	y0, e		; y0 = e
+	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
+	mov	y1, a		; y1 = a
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
+	mov	y2, f		; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor	y2, g		; y2 = f^g
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	and	y2, e		; y2 = (f^g)&e
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	add	y2, y0		; y2 = S1 + CH
+	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+;; arg 3 : Num blocks
+section .text
+global sha256_avx
+align 32
+sha256_avx:
+	push	rbx
+%ifndef LINUX
+	push	rsi
+	push	rdi
+%endif
+	push	rbp
+	push	r13
+	push	r14
+	push	r15
+
+	sub	rsp,STACK_SIZE
+%ifndef LINUX
+	vmovdqa	[rsp + _XMM_SAVE + 0*16],xmm6	
+	vmovdqa	[rsp + _XMM_SAVE + 1*16],xmm7
+	vmovdqa	[rsp + _XMM_SAVE + 2*16],xmm8	
+	vmovdqa	[rsp + _XMM_SAVE + 3*16],xmm9	
+	vmovdqa	[rsp + _XMM_SAVE + 4*16],xmm10
+	vmovdqa	[rsp + _XMM_SAVE + 5*16],xmm11
+	vmovdqa	[rsp + _XMM_SAVE + 6*16],xmm12
+	vmovdqa	[rsp + _XMM_SAVE + 7*16],xmm13
+%endif
+
+	shl	NUM_BLKS, 6	; convert to bytes
+	jz	done_hash
+	add	NUM_BLKS, INP	; pointer to end of data
+	mov	[rsp + _INP_END], NUM_BLKS
+
+	;; load initial digest
+	mov	a,[4*0 + CTX]
+	mov	b,[4*1 + CTX]
+	mov	c,[4*2 + CTX]
+	mov	d,[4*3 + CTX]
+	mov	e,[4*4 + CTX]
+	mov	f,[4*5 + CTX]
+	mov	g,[4*6 + CTX]
+	mov	h,[4*7 + CTX]
+
+	vmovdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+	vmovdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
+	vmovdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
+
+loop0:
+	lea	TBL,[K256 wrt rip]
+
+	;; byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
+	
+	mov	[rsp + _INP], INP
+
+	;; schedule 48 input dwords, by doing 3 rounds of 16 each
+	mov	SRND, 3
+align 16
+loop1:
+	vpaddd	XFER, X0, [TBL + 0*16]
+	vmovdqa	[rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd	XFER, X0, [TBL + 1*16]
+	vmovdqa	[rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd	XFER, X0, [TBL + 2*16]
+	vmovdqa	[rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd	XFER, X0, [TBL + 3*16]
+	vmovdqa	[rsp + _XFER], XFER
+	add	TBL, 4*16
+	FOUR_ROUNDS_AND_SCHED
+
+	sub	SRND, 1
+	jne	loop1
+
+	mov	SRND, 2
+loop2:
+	vpaddd	XFER, X0, [TBL + 0*16]
+	vmovdqa	[rsp + _XFER], XFER
+	DO_ROUND	0
+	DO_ROUND	1
+	DO_ROUND	2
+	DO_ROUND	3
+
+	vpaddd	XFER, X1, [TBL + 1*16]
+	vmovdqa	[rsp + _XFER], XFER
+	add	TBL, 2*16
+	DO_ROUND	0
+	DO_ROUND	1
+	DO_ROUND	2
+	DO_ROUND	3
+
+	vmovdqa	X0, X2
+	vmovdqa	X1, X3
+
+	sub	SRND, 1
+	jne	loop2
+
+
+	addm	[4*0 + CTX],a
+	addm	[4*1 + CTX],b
+	addm	[4*2 + CTX],c
+	addm	[4*3 + CTX],d
+	addm	[4*4 + CTX],e
+	addm	[4*5 + CTX],f
+	addm	[4*6 + CTX],g
+	addm	[4*7 + CTX],h
+
+	mov	INP, [rsp + _INP]
+	add	INP, 64
+	cmp	INP, [rsp + _INP_END]
+	jne	loop0
+
+done_hash:
+%ifndef LINUX
+	vmovdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
+	vmovdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
+	vmovdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
+	vmovdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
+	vmovdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
+	vmovdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
+	vmovdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
+	vmovdqa	xmm13,[rsp + _XMM_SAVE + 7*16]
+%endif
+
+
+	add	rsp, STACK_SIZE
+
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	rbp
+%ifndef LINUX
+	pop	rdi
+	pop	rsi
+%endif
+	pop	rbx
+
+	ret	
+	
+
+section .data
+align 64
+K256:
+	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+; shuffle xDxC -> DC00
+_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
--- a/crypto/sha2/intel/sha256_sse4.asm
+++ b/crypto/sha2/intel/sha256_sse4.asm
@ -0,0 +1,535 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright 2012 Intel Corporation All Rights Reserved.
+; 
+; The source code contained or described herein and all documents
+; related to the source code ("Material") are owned by Intel Corporation
+; or its suppliers or licensors. Title to the Material remains with
+; Intel Corporation or its suppliers and licensors. The Material may
+; contain trade secrets and proprietary and confidential information of
+; Intel Corporation and its suppliers and licensors, and is protected by
+; worldwide copyright and trade secret laws and treaty provisions. No
+; part of the Material may be used, copied, reproduced, modified,
+; published, uploaded, posted, transmitted, distributed, or disclosed in
+; any way without Intel's prior express written permission.
+; 
+; No license under any patent, copyright, trade secret or other
+; intellectual property right is granted to or conferred upon you by
+; disclosure or delivery of the Materials, either expressly, by
+; implication, inducement, estoppel or otherwise. Any license under such
+; intellectual property rights must be express and approved by Intel in
+; writing.
+; 
+; Unless otherwise agreed by Intel in writing, you may not remove or
+; alter this notice or any other notice embedded in Materials by Intel
+; or Intel's suppliers or licensors in any way.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
+; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded 
+; and search for that title.
+; The paper is expected to be released roughly at the end of April, 2012
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define	MOVDQ movdqu ;; assume buffers not aligned 
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; addm [mem], reg
+; Add reg to mem using reg-mem add and store
+%macro addm 2
+	add	%2, %1
+	mov	%1, %2
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+	MOVDQ %1, %2
+	pshufb %1, %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XTMP4 xmm8
+%define XFER  xmm9
+
+%define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00	xmm11 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK	xmm12
+	
+%ifdef LINUX
+%define NUM_BLKS rdx	; 3rd arg
+%define CTX	rsi	; 2nd arg
+%define INP	rdi	; 1st arg
+
+%define SRND	rdi	; clobbers INP
+%define c	ecx
+%define d 	r8d
+%define e 	edx
+%else
+%define NUM_BLKS r8	; 3rd arg
+%define CTX	rdx 	; 2nd arg
+%define INP	rcx 	; 1st arg
+
+%define SRND	rcx	; clobbers INP
+%define c 	edi 
+%define d	esi 
+%define e 	r8d
+	
+%endif
+%define TBL	rbp
+%define a eax
+%define b ebx
+
+%define f r9d
+%define g r10d
+%define h r11d
+
+%define y0 r13d
+%define y1 r14d
+%define y2 r15d
+
+
+
+_INP_END_SIZE	equ 8
+_INP_SIZE	equ 8
+_XFER_SIZE	equ 8
+%ifdef LINUX
+_XMM_SAVE_SIZE	equ 0
+%else
+_XMM_SAVE_SIZE	equ 7*16
+%endif
+; STACK_SIZE plus pushes must be an odd multiple of 8
+_ALIGN_SIZE	equ 8
+
+_INP_END	equ 0
+_INP		equ _INP_END  + _INP_END_SIZE
+_XFER		equ _INP      + _INP_SIZE
+_XMM_SAVE	equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
+STACK_SIZE	equ _XMM_SAVE + _XMM_SAVE_SIZE
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+		;; compute s0 four at a time and s1 two at a time
+		;; compute W[-16] + W[-7] 4 at a time
+		movdqa	XTMP0, X3
+	mov	y0, e		; y0 = e
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+	mov	y1, a		; y1 = a
+		palignr	XTMP0, X2, 4	; XTMP0 = W[-7]
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	mov	y2, f		; y2 = f
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+		movdqa	XTMP1, X1
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	xor	y2, g		; y2 = f^g
+		paddd	XTMP0, X0	; XTMP0 = W[-7] + W[-16]
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+		;; compute s0
+		palignr	XTMP1, X0, 4	; XTMP1 = W[-15]
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+		movdqa	XTMP2, XTMP1	; XTMP2 = W[-15]
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
+		movdqa	XTMP3, XTMP1	; XTMP3 = W[-15]
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		pslld	XTMP1, (32-7)
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		psrld	XTMP2, 7
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+		por	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+		movdqa	XTMP2, XTMP3	; XTMP2 = W[-15]
+	mov	y0, e		; y0 = e
+	mov	y1, a		; y1 = a
+		movdqa	XTMP4, XTMP3	; XTMP4 = W[-15]
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	mov	y2, f		; y2 = f
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+		pslld	XTMP3, (32-18)
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor	y2, g		; y2 = f^g
+		psrld	XTMP2, 18
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+		pxor	XTMP1, XTMP3
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+		psrld	XTMP4, 3	; XTMP4 = W[-15] >> 3
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+		pxor	XTMP1, XTMP2	; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		pxor	XTMP1, XTMP4	; XTMP1 = s0
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		;; compute low s1
+		pshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+		paddd	XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+		movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
+	mov	y0, e		; y0 = e
+	mov	y1, a		; y1 = a
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+		movdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+	mov	y2, f		; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+		psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xBxA}
+	xor	y2, g		; y2 = f^g
+		psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xBxA}
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+		psrld	XTMP4, 10	; XTMP4 = W[-2] >> 10 {BBAA}
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+		pxor	XTMP2, XTMP3
+	add	y2, y0		; y2 = S1 + CH
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
+		pxor	XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		pshufb	XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		paddd	XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+		;; compute high s1
+		pshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+		movdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
+	mov	y0, e		; y0 = e
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+	mov	y1, a		; y1 = a
+		movdqa	X0,    XTMP2	; X0    = W[-2] {DDCC}
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	mov	y2, f		; y2 = f
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+		psrlq	XTMP2, 17	; XTMP2 = W[-2] ror 17 {xDxC}
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	xor	y2, g		; y2 = f^g
+		psrlq	XTMP3, 19	; XTMP3 = W[-2] ror 19 {xDxC}
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and	y2, e		; y2 = (f^g)&e
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+		psrld	X0,    10	; X0 = W[-2] >> 10 {DDCC}
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+		pxor	XTMP2, XTMP3
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, y0		; y2 = S1 + CH
+	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
+		pxor	X0, XTMP2	; X0 = s1 {xDxC}
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+		pshufb	X0, SHUF_DC00	; X0 = s1 {DC00}
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+		paddd	X0, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; input is [rsp + _XFER + %1 * 4]
+%macro DO_ROUND 1
+	mov	y0, e		; y0 = e
+	ror	y0, (25-11)	; y0 = e >> (25-11)
+	mov	y1, a		; y1 = a
+	xor	y0, e		; y0 = e ^ (e >> (25-11))
+	ror	y1, (22-13)	; y1 = a >> (22-13)
+	mov	y2, f		; y2 = f
+	xor	y1, a		; y1 = a ^ (a >> (22-13)
+	ror	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor	y2, g		; y2 = f^g
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	ror	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
+	and	y2, e		; y2 = (f^g)&e
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor	y2, g		; y2 = CH = ((f^g)&e)^g
+	add	y2, y0		; y2 = S1 + CH
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
+	mov	y0, a		; y0 = a
+	add	h, y2		; h = h + S1 + CH + k + w
+	mov	y2, a		; y2 = a
+	or	y0, c		; y0 = a|c
+	add	d, h		; d = d + h + S1 + CH + k + w
+	and	y2, c		; y2 = a&c
+	and	y0, b		; y0 = (a|c)&b
+	add	h, y1		; h = h + S1 + CH + k + w + S0
+	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
+	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+;; arg 3 : Num blocks
+section .text
+global sha256_sse4
+align 32
+sha256_sse4:
+	push	rbx
+%ifndef LINUX
+	push	rsi
+	push	rdi
+%endif
+	push	rbp
+	push	r13
+	push	r14
+	push	r15
+
+	sub	rsp,STACK_SIZE
+%ifndef LINUX
+	movdqa	[rsp + _XMM_SAVE + 0*16],xmm6	
+	movdqa	[rsp + _XMM_SAVE + 1*16],xmm7
+	movdqa	[rsp + _XMM_SAVE + 2*16],xmm8	
+	movdqa	[rsp + _XMM_SAVE + 3*16],xmm9	
+	movdqa	[rsp + _XMM_SAVE + 4*16],xmm10
+	movdqa	[rsp + _XMM_SAVE + 5*16],xmm11
+	movdqa	[rsp + _XMM_SAVE + 6*16],xmm12
+%endif
+
+	shl	NUM_BLKS, 6	; convert to bytes
+	jz	done_hash
+	add	NUM_BLKS, INP	; pointer to end of data
+	mov	[rsp + _INP_END], NUM_BLKS
+
+	;; load initial digest
+	mov	a,[4*0 + CTX]
+	mov	b,[4*1 + CTX]
+	mov	c,[4*2 + CTX]
+	mov	d,[4*3 + CTX]
+	mov	e,[4*4 + CTX]
+	mov	f,[4*5 + CTX]
+	mov	g,[4*6 + CTX]
+	mov	h,[4*7 + CTX]
+
+	movdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+	movdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
+	movdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
+
+loop0:
+	lea	TBL,[K256 wrt rip]
+
+	;; byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
+	
+	mov	[rsp + _INP], INP
+
+	;; schedule 48 input dwords, by doing 3 rounds of 16 each
+	mov	SRND, 3
+align 16
+loop1:
+	movdqa	XFER, [TBL + 0*16]
+	paddd	XFER, X0
+	movdqa	[rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa	XFER, [TBL + 1*16]
+	paddd	XFER, X0
+	movdqa	[rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa	XFER, [TBL + 2*16]
+	paddd	XFER, X0
+	movdqa	[rsp + _XFER], XFER
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa	XFER, [TBL + 3*16]
+	paddd	XFER, X0
+	movdqa	[rsp + _XFER], XFER
+	add	TBL, 4*16
+	FOUR_ROUNDS_AND_SCHED
+
+	sub	SRND, 1
+	jne	loop1
+
+	mov	SRND, 2
+loop2:
+	paddd	X0, [TBL + 0*16]
+	movdqa	[rsp + _XFER], X0
+	DO_ROUND	0
+	DO_ROUND	1
+	DO_ROUND	2
+	DO_ROUND	3
+	paddd	X1, [TBL + 1*16]
+	movdqa	[rsp + _XFER], X1
+	add	TBL, 2*16
+	DO_ROUND	0
+	DO_ROUND	1
+	DO_ROUND	2
+	DO_ROUND	3
+
+	movdqa	X0, X2
+	movdqa	X1, X3
+
+	sub	SRND, 1
+	jne	loop2
+
+	addm	[4*0 + CTX],a
+	addm	[4*1 + CTX],b
+	addm	[4*2 + CTX],c
+	addm	[4*3 + CTX],d
+	addm	[4*4 + CTX],e
+	addm	[4*5 + CTX],f
+	addm	[4*6 + CTX],g
+	addm	[4*7 + CTX],h
+
+	mov	INP, [rsp + _INP]
+	add	INP, 64
+	cmp	INP, [rsp + _INP_END]
+	jne	loop0
+
+done_hash:
+%ifndef LINUX
+	movdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
+	movdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
+	movdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
+	movdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
+	movdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
+	movdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
+	movdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
+%endif
+
+	add	rsp, STACK_SIZE
+
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	rbp
+%ifndef LINUX
+	pop	rdi
+	pop	rsi
+%endif
+	pop	rbx
+
+	ret	
+	
+
+section .data
+align 64
+K256:
+	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+; shuffle xDxC -> DC00
+_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
--- a/crypto/sha2/sha256.c
+++ b/crypto/sha2/sha256.c
@ -0,0 +1,210 @@
+/*-
+ * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
+ * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#else
+# if HAVE_STDINT_H
+#  include <stdint.h>
+# endif
+#endif
+
+#include <pthread.h>
+#include <string.h>
+#include <utils.h>
+#include <sha256.h>
+
+#ifdef WORDS_BIGENDIAN
+
+#define BYTESWAP(x) (x)
+#define BYTESWAP64(x) (x)
+
+#else /* WORDS_BIGENDIAN */
+
+#define BYTESWAP(x) htonl(x)
+#define BYTESWAP64(x) htonll(x)
+
+#endif /* WORDS_BIGENDIAN */
+typedef void (*update_func_ptr)(void *input_data, uint32_t digest[8], uint64_t num_blks);
+
+static uint8_t padding[64] = {
+  0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const uint32_t iv256[SHA256_HASH_WORDS] = {
+  0x6a09e667L,
+  0xbb67ae85L,
+  0x3c6ef372L,
+  0xa54ff53aL,
+  0x510e527fL,
+  0x9b05688cL,
+  0x1f83d9abL,
+  0x5be0cd19L
+};
+
+static update_func_ptr sha_update_func;
+
+int
+APS_NAMESPACE(Init_SHA) (processor_info_t *pc)
+{
+	if (pc->proc_type == PROC_X64_INTEL) {
+		if (pc->avx_level > 0) {
+			sha_update_func = sha256_avx;
+			
+		} else if (pc->sse_level >= 4) {
+			sha_update_func = sha256_sse4;
+			
+		} else {
+			return (1);
+		}
+		return (0);
+	}
+	return (1);
+}
+
+static void
+_init (SHA256_Context *sc, const uint32_t iv[SHA256_HASH_WORDS])
+{
+	int i;
+	
+	/*
+	 * SHA256_HASH_WORDS is 8, must be 8, cannot be anything but 8!
+	 * So we unroll a loop here.
+	 */
+	sc->hash[0] = iv[0];
+	sc->hash[1] = iv[1];
+	sc->hash[2] = iv[2];
+	sc->hash[3] = iv[3];
+	sc->hash[4] = iv[4];
+	sc->hash[5] = iv[5];
+	sc->hash[6] = iv[6];
+	sc->hash[7] = iv[7];
+	
+	sc->totalLength = 0LL;
+	sc->bufferLength = 0L;
+}
+
+void
+APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc)
+{
+	_init (sc, iv256);
+}
+
+void
+APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, void *vdata, size_t len)
+{
+	const uint8_t *data = vdata;
+	uint32_t bufferBytesLeft;
+	size_t bytesToCopy;
+	int rem;
+	
+	if (sc->bufferLength) {
+		do {
+			bufferBytesLeft = 64L - sc->bufferLength;
+			bytesToCopy = bufferBytesLeft;
+			if (bytesToCopy > len)
+				bytesToCopy = len;
+			
+			memcpy (&sc->buffer.bytes[sc->bufferLength], data, bytesToCopy);
+			sc->totalLength += bytesToCopy * 8L;
+			sc->bufferLength += bytesToCopy;
+			data += bytesToCopy;
+			len -= bytesToCopy;
+			
+			if (sc->bufferLength == 64L) {
+				sc->blocks = 1;
+				sha_update_func(sc->buffer.words, sc->hash, sc->blocks);
+				sc->bufferLength = 0L;
+			} else {
+				return;
+			}
+		} while (len > 0 && len <= 64L);
+		if (!len) return;
+	}
+	
+	sc->blocks = len >> 6;
+	rem = len - (sc->blocks << 6);
+	len = sc->blocks << 6;
+	sc->totalLength += rem * 8L;
+	
+	if (len) {
+		sc->totalLength += len * 8L;
+		sha_update_func((uint32_t *)data, sc->hash, sc->blocks);
+	}
+	if (rem) {
+		memcpy (&sc->buffer.bytes[0], data + len, rem);
+		sc->bufferLength = rem;
+	}
+}
+
+static void
+_final (SHA256_Context *sc, uint8_t *hash, int hashWords)
+{
+	uint32_t bytesToPad;
+	uint64_t lengthPad;
+	int i;
+	
+	bytesToPad = 120L - sc->bufferLength;
+	if (bytesToPad > 64L)
+		bytesToPad -= 64L;
+	
+	lengthPad = BYTESWAP64(sc->totalLength);
+	
+	APS_NAMESPACE(SHA256_Update) (sc, padding, bytesToPad);
+	APS_NAMESPACE(SHA256_Update) (sc, &lengthPad, 8L);
+	
+	if (hash) {
+		for (i = 0; i < hashWords; i++) {
+			hash[0] = (uint8_t) (sc->hash[i] >> 24);
+			hash[1] = (uint8_t) (sc->hash[i] >> 16);
+			hash[2] = (uint8_t) (sc->hash[i] >> 8);
+			hash[3] = (uint8_t) sc->hash[i];
+			hash += 4;
+		}
+	}
+}
+
+void
+APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE])
+{
+	_final (sc, hash, SHA256_HASH_WORDS);
+}
--- a/crypto/sha2/sha256.h
+++ b/crypto/sha2/sha256.h
@ -0,0 +1,81 @@
+/*-
+ * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
+ * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _APS_SHA256_H
+#define _APS_SHA256_H
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#else
+# if HAVE_STDINT_H
+#  include <stdint.h>
+# endif
+#endif
+
+#include <utils.h>
+
+#define SHA256_HASH_SIZE 32
+
+/* Hash size in 32-bit words */
+#define SHA256_HASH_WORDS 8
+
+typedef struct _SHA256_Context {
+	uint64_t totalLength, blocks;
+	uint32_t hash[SHA256_HASH_WORDS];
+	uint32_t bufferLength;
+	union {
+		uint32_t words[16];
+		uint8_t bytes[64];
+	} buffer;
+} SHA256_Context;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef APS_NAMESPACE
+#define APS_NAMESPACE(name) opt_##name
+#endif /* !APS_NAMESPACE */
+
+void APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc);
+void APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, void *data, size_t len);
+void APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE]);
+int  APS_NAMESPACE(Init_SHA) (processor_info_t *pc);
+
+/*
+ * Intel's optimized SHA256 core routines. These routines are described in an
+ * Intel White-Paper:
+ * "Fast SHA-256 Implementations on Intel Architecture Processors"
+ */
+extern void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);
+extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !_APS_SHA256_H */
--- a/utils/cpuid.c
+++ b/utils/cpuid.c
@ -0,0 +1,134 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include "utils.h"
+#include "cpuid.h"
+
+#ifdef	__x86_64__
+void
+exec_cpuid(uint32_t *regs)
+{
+#ifdef __GNUC__
+	__asm __volatile(
+		"	push	%%rbx\n"
+		"	push	%%rcx\n"
+		"	push	%%rdx\n"
+		"	push	%%rdi\n"
+		
+		"	mov	%0,	%%rdi\n"
+		
+		"	mov	(%%rdi),	%%eax\n"
+		"	mov	4(%%rdi),	%%ebx\n"
+		"	mov	8(%%rdi),	%%ecx\n"
+		"	mov	12(%%rdi),	%%edx\n"
+		
+		"	cpuid\n"
+		
+		"	movl	%%eax,	(%%rdi)\n"
+		"	movl	%%ebx,	4(%%rdi)\n"
+		"	movl	%%ecx,	8(%%rdi)\n"
+		"	movl	%%edx,	12(%%rdi)\n"
+		"	pop	%%rdi\n"
+		"	pop	%%rdx\n"
+		"	pop	%%rcx\n"
+		"	pop	%%rbx\n"
+		:
+		:"rdi"(regs)
+		:"memory", "eax"
+	);
+#else
+#error	"Unsupported compiler"
+#endif
+}
+
+static void
+cpu_exec_cpuid(uint32_t eax, uint32_t* regs)
+{
+	regs[0] = eax;
+	regs[1] = regs[2] = regs[3] = 0;
+	exec_cpuid(regs);
+}
+
+static void
+cpu_exec_cpuid_ext(uint32_t* regs)
+{
+	exec_cpuid(regs);
+}
+
+void
+cpuid_get_raw_data(struct cpu_raw_data_t* data)
+{
+	unsigned i;
+	for (i = 0; i < 32; i++)
+		cpu_exec_cpuid(i, data->basic_cpuid[i]);
+	for (i = 0; i < 32; i++)
+		cpu_exec_cpuid(0x80000000 + i, data->ext_cpuid[i]);
+	for (i = 0; i < 4; i++) {
+		memset(data->intel_fn4[i], 0, sizeof(data->intel_fn4[i]));
+		data->intel_fn4[i][0] = 4;
+		data->intel_fn4[i][2] = i;
+		cpu_exec_cpuid_ext(data->intel_fn4[i]);
+	}
+}
+
+void
+cpuid_basic_identify(processor_info_t *pc)
+{
+	struct cpu_raw_data_t raw;
+	cpuid_get_raw_data(&raw);
+
+	memcpy(raw.vendor_str + 0, &raw.basic_cpuid[0][1], 4);
+	memcpy(raw.vendor_str + 4, &raw.basic_cpuid[0][3], 4);
+	memcpy(raw.vendor_str + 8, &raw.basic_cpuid[0][2], 4);
+	raw.vendor_str[12] = 0;
+	pc->avx_level = 0;
+	pc->sse_level = 0;
+
+	if (strcmp(raw.vendor_str, "GenuineIntel") == 0) {
+		pc->proc_type = PROC_X64_INTEL;
+
+		pc->sse_level = 2;
+		if (raw.basic_cpuid[0][0] >= 1) {
+			// ECX has SSE 4.2 and AVX flags
+			// Bit 20 is SSE 4.2 and bit 28 indicates AVX
+			if (raw.basic_cpuid[1][2] & (1 << 20)) {
+				pc->sse_level = 4;
+			} else {
+				pc->sse_level = 3;
+			}
+			pc->avx_level = 0;
+			if (raw.basic_cpuid[1][2] & (1 << 28)) {
+				pc->avx_level = 1;
+			}
+		}
+	} else if (strcmp(raw.vendor_str, "AuthenticAMD") == 0) {
+		pc->proc_type = PROC_X64_AMD;
+		pc->sse_level = 2;
+	}
+}
+
+#endif
--- a/utils/cpuid.h
+++ b/utils/cpuid.h
@ -0,0 +1,61 @@
+/*
+ * Copyright 2008  Veselin Georgiev,
+ * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __CPUID_H__
+#define __CPUID_H__
+
+#ifdef	__x86_64__
+#define VENDOR_STR_MAX          16
+#define BRAND_STR_MAX           64
+#define CPU_FLAGS_MAX           128
+#define MAX_CPUID_LEVEL         32
+#define MAX_EXT_CPUID_LEVEL     32
+#define MAX_INTELFN4_LEVEL      4
+
+/**
+ * This contains only the most basic CPU data, required to do identification
+ * and feature recognition. Every processor should be identifiable using this
+ * data only.
+ */
+struct cpu_raw_data_t {
+	/** contains results of CPUID for eax = 0, 1, ...*/
+	uint32_t basic_cpuid[MAX_CPUID_LEVEL][4];
+
+	/** contains results of CPUID for eax = 0x80000000, 0x80000001, ...*/
+	uint32_t ext_cpuid[MAX_EXT_CPUID_LEVEL][4];
+
+	/** when the CPU is intel and it supports deterministic cache
+	    information: this contains the results of CPUID for eax = 4
+	    and ecx = 0, 1, ... */
+	uint32_t intel_fn4[MAX_INTELFN4_LEVEL][4];
+	char vendor_str[VENDOR_STR_MAX];
+};
+
+void exec_cpuid(uint32_t *regs);
+void cpuid_get_raw_data(struct cpu_raw_data_t* data);
+#endif /* __x86_64__ */
+
+#endif /* __CPUID_H__ */
+
--- a/utils/utils.c
+++ b/utils/utils.c
@ -34,24 +34,36 @@
 #include <rabin_dedup.h>
 #include <skein.h>
 #include <openssl/sha.h>
+#include <sha256.h>

 #include "utils.h"
+#include "cpuid.h"
+
+#define	PROVIDER_OPENSSL	0
+#define	PROVIDER_X64_OPT	1
+
+static void init_sha256(void);

 /*
 * Checksum properties
 */
+typedef void (*ckinit_func_ptr)(void);
 static struct {
 	char	*name;
 	cksum_t	cksum_id;
 	int	bytes;
+	ckinit_func_ptr init_func;
 } cksum_props[] = {
-	{"CRC64",	CKSUM_CRC64,	8},
-	{"SKEIN256",	CKSUM_SKEIN256,	32},
-	{"SKEIN512",	CKSUM_SKEIN512,	64},
-	{"SHA256",	CKSUM_SHA256,	32},
-	{"SHA512",	CKSUM_SHA512,	64}
+	{"CRC64",	CKSUM_CRC64,	8,	NULL},
+	{"SKEIN256",	CKSUM_SKEIN256,	32,	NULL},
+	{"SKEIN512",	CKSUM_SKEIN512,	64,	NULL},
+	{"SHA256",	CKSUM_SHA256,	32,	init_sha256},
+	{"SHA512",	CKSUM_SHA512,	64,	NULL}
 };

+
+static int cksum_provider = PROVIDER_OPENSSL;
+
 extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
 extern uint64_t lzma_crc64_8bchk(const uint8_t *buf, size_t size,
 	uint64_t crc, uint64_t *cnt);
@ -339,12 +351,19 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, ssize_t bytes)
 		Skein_512_Final(&ctx, cksum_buf);

 	} else if (cksum == CKSUM_SHA256) {
-		SHA256_CTX ctx;
+		if (cksum_provider == PROVIDER_OPENSSL) {
+			SHA256_CTX ctx;

-		SHA256_Init(&ctx);
-		SHA256_Update(&ctx, buf, bytes);
-		SHA256_Final(cksum_buf, &ctx);
+			SHA256_Init(&ctx);
+			SHA256_Update(&ctx, buf, bytes);
+			SHA256_Final(cksum_buf, &ctx);
+		} else {
+			SHA256_Context ctx;

+			opt_SHA256_Init(&ctx);
+			opt_SHA256_Update(&ctx, buf, bytes);
+			opt_SHA256_Final(&ctx, cksum_buf);
+		}
 	} else if (cksum == CKSUM_SHA512) {
 		SHA512_CTX ctx;

@ -357,6 +376,26 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, ssize_t bytes)
 	return (0);
 }

+static void
+init_sha256(void)
+{
+#ifdef	WORDS_BIGENDIAN
+	cksum_provider = PROVIDER_OPENSSL;
+#else
+#ifdef	__x86_64__
+	processor_info_t pc;
+
+	cksum_provider = PROVIDER_OPENSSL;
+	cpuid_basic_identify(&pc);
+	if (pc.proc_type == PROC_X64_INTEL || pc.proc_type == PROC_X64_AMD) {
+		if (opt_Init_SHA(&pc) == 0) {
+			cksum_provider = PROVIDER_X64_OPT;
+		}
+	}
+#endif
+#endif
+}
+
 /*
 * Check is either the given checksum name or id is valid and
 * return it's properties.
@ -371,6 +410,8 @@ get_checksum_props(char *name, int *cksum, int *cksum_bytes)
 		    (*cksum != 0 && *cksum == cksum_props[i].cksum_id)) {
 			*cksum = cksum_props[i].cksum_id;
 			*cksum_bytes = cksum_props[i].bytes;
+			if (cksum_props[i].init_func)
+				cksum_props[i].init_func();
 			return (0);
 		}
 	}
--- a/utils/utils.h
+++ b/utils/utils.h
@ -133,6 +133,19 @@ typedef enum {
 	DECOMPRESS_THREADS
 } algo_threads_type_t;

+typedef enum {
+	PROC_BIGENDIAN_GENERIC = 1,
+	PROC_LITENDIAN_GENERIC,
+	PROC_X64_INTEL,
+	PROC_X64_AMD
+} proc_type_t;
+
+typedef struct {
+	int sse_level;
+	int avx_level;
+	proc_type_t proc_type;
+} processor_info_t;
+
 extern void err_exit(int show_errno, const char *format, ...);
 extern const char *get_execname(const char *);
 extern int parse_numeric(ssize_t *val, const char *str);