Incorporate SSE/AVX optimized Intel SHA-256 implementation.
Add support for runtime cpuid detection.

parent c880b73d26
commit 21cbef6d60
11 changed files with 1753 additions and 14 deletions
Makefile.in (28 lines changed)

@@ -24,8 +24,9 @@
 PROG= pcompress
 MAINSRCS = main.c utils/utils.c allocator.c zlib_compress.c bzip2_compress.c \
 	lzma_compress.c ppmd_compress.c adaptive_compress.c lzfx_compress.c \
-	lz4_compress.c none_compress.c utils/xxhash.c utils/heapq.c
-MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heapq.h
+	lz4_compress.c none_compress.c utils/xxhash.c utils/heapq.c utils/cpuid.c
+MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heapq.h \
+	utils/cpuid.h
 MAINOBJS = $(MAINSRCS:.c=.o)

 RABINSRCS = rabin/rabin_dedup.c
@@ -72,6 +73,14 @@ SKEINHDRS = crypto/skein/brg_endian.h crypto/skein/SHA3api_ref.h \
 	crypto/skein/skein_debug.h crypto/skein/skein_iv.h
 SKEINOBJS = $(SKEINSRCS:.c=.o)

+SHA256_SRCS = crypto/sha2/sha256.c
+SHA256_HDRS = crypto/sha2/sha256.h
+SHA256ASM_SRCS = crypto/sha2/intel/sha256_avx1.asm \
+	crypto/sha2/intel/sha256_sse4.asm
+SHA256ASM_OBJS = $(SHA256ASM_SRCS:.asm=.o)
+SHA256_OBJS = $(SHA256_SRCS:.c=.o)
+
+YASM = @YASM@ -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX
 LIBBSCWRAP = libbsc_compress.c
 LIBBSCWRAPOBJ = libbsc_compress.o
 LIBBSCDIR = @LIBBSCDIR@
@@ -80,17 +89,20 @@ LIBBSCLIB = @LIBBSCLIB@
 LIBBSCGEN_OPT = -fopenmp
 LIBBSCCPPFLAGS = -I$(LIBBSCDIR)/libbsc -DENABLE_PC_LIBBSC

-BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~
+BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~ crypto/sha2/*~ \
+	crypto/sha2/intel/*~

 RM = rm -f
 COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \
 	-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
-	-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I@OPENSSL_INCDIR@
+	-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I@OPENSSL_INCDIR@ \
+	-I./crypto/sha2
 COMMON_VEC_FLAGS = -ftree-vectorize
 COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
 LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm @LIBBSCLFLAGS@ -L@OPENSSL_LIBDIR@ -lcrypto
 OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
-	$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) $(SKEIN_BLOCK_OBJ)
+	$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) $(SKEIN_BLOCK_OBJ) \
+	@SHA256ASM_OBJS@ @SHA256_OBJS@

 DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@
 DEBUG_COMPILE = gcc -m64 -g -msse3 -c
@@ -155,6 +167,12 @@ $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
 $(SKEINOBJS): $(SKEINSRCS) $(SKEINHDRS)
 	$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@

+$(SHA256_OBJS): $(SHA256_SRCS) $(SHA256_HDRS)
+	$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
+
+$(SHA256ASM_OBJS): $(SHA256ASM_SRCS)
+	$(YASM) -o $@ $(@:.o=.asm)
+
 $(LIBBSCLIB):
 	(cd $(LIBBSCDIR); make)
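For reference (my illustration, not part of the commit): once config substitutes @YASM@, the $(SHA256ASM_OBJS) rule above expands to a command of the form

	yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o crypto/sha2/intel/sha256_sse4.o crypto/sha2/intel/sha256_sse4.asm

which matches the example Linux command line quoted in the headers of the .asm files below.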
config (37 lines changed)

@@ -36,6 +36,9 @@ libbsccppflags=
 openssl_prefix=
 openssl_libdir=
 openssl_incdir=
+sha256asmobjs=
+sha256objs=
+yasm=yasm

 while [ "${arg1}" != "" ]
 do
@@ -98,6 +101,33 @@ echo $plat | egrep 'x86_64|amd64' > /dev/null
 if [ $? -eq 0 ]
 then
 	skeinblock='\$\(SKEIN_BLOCK_ASM\)'
+	yasm=
+
+	#
+	# Detect Yasm
+	#
+	for bindir in /bin /usr/bin /usr/local/bin
+	do
+		if [ -x ${bindir}/yasm ]
+		then
+			# Get yasm version
+			yver=`${bindir}/yasm --version | head -1 | awk '{print $2}'`
+			_OIFS=$IFS; IFS="."; set -- ${yver}; IFS="$_OIFS"
+			major=$1
+			minor=$2
+
+			# Minimum yasm version 1.1
+			[ $major -lt 1 -o $minor -lt 1 ] && continue
+			yasm=${bindir}/yasm
+			sha256asmobjs='\$\(SHA256ASM_OBJS\)'
+			sha256objs='\$\(SHA256_OBJS\)'
+		fi
+	done
+	if [ "x${yasm}" = "x" ]
+	then
+		echo "Yasm version 1.1 or later is required to build on x64 platforms"
+		exit 1
+	fi
 fi

 # Detect OpenSSL library
@@ -152,6 +182,7 @@ then
 	exit 1
 fi

+
 linkvar="LINK"
 compilevar="COMPILE"
 compilecppvar="COMPILE_cpp"
@@ -171,6 +202,9 @@ libbsclflagsvar="LIBBSCLFLAGS"
 libbscwrapobjvar="LIBBSCWRAPOBJ"
 libbscgenoptvar="LIBBSCGEN_OPT"
 libbsccppflagsvar="LIBBSCCPPFLAGS"
+sha256asmobjsvar="SHA256ASM_OBJS"
+sha256objsvar="SHA256_OBJS"
+yasmvar="YASM"

 openssllibdirvar="OPENSSL_LIBDIR"
 opensslincdirvar="OPENSSL_INCDIR"
@@ -202,5 +236,8 @@ s#@${libbsccppflagsvar}@#${libbsccppflags}#g
 s#@${skeinblockvar}@#${skeinblock}#g
 s#@${openssllibdirvar}@#${openssl_libdir}#g
 s#@${opensslincdirvar}@#${openssl_incdir}#g
+s#@${sha256asmobjsvar}@#${sha256asmobjs}#g
+s#@${sha256objsvar}@#${sha256objs}#g
+s#@${yasmvar}@#${yasm}#g
 " > Makefile
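As a sketch of the effect (example values, not from the patch): if the probe loop finds /usr/local/bin/yasm, the sed script emitted above gains lines like

	s#@YASM@#/usr/local/bin/yasm#g
	s#@SHA256ASM_OBJS@#$(SHA256ASM_OBJS)#g
	s#@SHA256_OBJS@#$(SHA256_OBJS)#g

so the new @...@ placeholders in Makefile.in resolve to concrete values in the generated Makefile; on non-x64 platforms the two OBJS placeholders substitute to empty strings and the assembly sources are simply not built.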
crypto/sha2/intel/open_software_license.txt (new file, 32 lines)

@@ -0,0 +1,32 @@
Copyright (c) 2012, Intel Corporation

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the
  distribution.

* Neither the name of the Intel Corporation nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.


THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
crypto/sha2/intel/sha256_avx1.asm (new file, 577 lines)

@@ -0,0 +1,577 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright 2012 Intel Corporation All Rights Reserved.
;
; The source code contained or described herein and all documents
; related to the source code ("Material") are owned by Intel Corporation
; or its suppliers or licensors. Title to the Material remains with
; Intel Corporation or its suppliers and licensors. The Material may
; contain trade secrets and proprietary and confidential information of
; Intel Corporation and its suppliers and licensors, and is protected by
; worldwide copyright and trade secret laws and treaty provisions. No
; part of the Material may be used, copied, reproduced, modified,
; published, uploaded, posted, transmitted, distributed, or disclosed in
; any way without Intel's prior express written permission.
;
; No license under any patent, copyright, trade secret or other
; intellectual property right is granted to or conferred upon you by
; disclosure or delivery of the Materials, either expressly, by
; implication, inducement, estoppel or otherwise. Any license under such
; intellectual property rights must be express and approved by Intel in
; writing.
;
; Unless otherwise agreed by Intel in writing, you may not remove or
; alter this notice or any other notice embedded in Materials by Intel
; or Intel's suppliers or licensors in any way.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
; Linux:   yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
	add %2, %1
	mov %1, %2
%endm

%macro MY_ROR 2
	shld %1,%1,(32-(%2))
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
	VMOVDQ %1, %2
	vpshufb %1, %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9
%define XTMP5 xmm11

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13

%ifdef LINUX
%define NUM_BLKS rdx ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND rdi ; clobbers INP
%define c ecx
%define d r8d
%define e edx
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg

%define SRND rcx ; clobbers INP
%define c edi
%define d esi
%define e r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d

_INP_END_SIZE equ 8
_INP_SIZE equ 8
_XFER_SIZE equ 8
%ifdef LINUX
_XMM_SAVE_SIZE equ 0
%else
_XMM_SAVE_SIZE equ 8*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE equ 8

_INP_END equ 0
_INP equ _INP_END + _INP_END_SIZE
_XFER equ _INP + _INP_SIZE
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

%macro FOUR_ROUNDS_AND_SCHED 0
	;; compute s0 four at a time and s1 two at a time
	;; compute W[-16] + W[-7] 4 at a time
	;vmovdqa XTMP0, X3
	mov y0, e ; y0 = e
	MY_ROR y0, (25-11) ; y0 = e >> (25-11)
	mov y1, a ; y1 = a
	vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
	MY_ROR y1, (22-13) ; y1 = a >> (22-13)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	mov y2, f ; y2 = f
	MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	;vmovdqa XTMP1, X1
	xor y1, a ; y1 = a ^ (a >> (22-13)
	xor y2, g ; y2 = f^g
	vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	;; compute s0
	vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	xor y2, g ; y2 = CH = ((f^g)&e)^g

	MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, y0 ; y2 = S1 + CH
	add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH

	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a

	vpsrld XTMP2, XTMP1, 7

	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c

	vpslld XTMP3, XTMP1, (32-7)

	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0

	vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7

	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS

	mov y0, e ; y0 = e
	mov y1, a ; y1 = a

	MY_ROR y0, (25-11) ; y0 = e >> (25-11)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	mov y2, f ; y2 = f
	MY_ROR y1, (22-13) ; y1 = a >> (22-13)

	vpsrld XTMP2, XTMP1,18

	xor y1, a ; y1 = a ^ (a >> (22-13)
	MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor y2, g ; y2 = f^g

	vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3

	MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)

	vpslld XTMP1, XTMP1, (32-18)

	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor y2, g ; y2 = CH = ((f^g)&e)^g

	vpxor XTMP3, XTMP3, XTMP1

	add y2, y0 ; y2 = S1 + CH
	add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
	MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)

	vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18

	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a

	vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0

	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c
	;; compute low s1
	vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}

	mov y0, e ; y0 = e
	mov y1, a ; y1 = a
	MY_ROR y0, (25-11) ; y0 = e >> (25-11)

	;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}

	xor y0, e ; y0 = e ^ (e >> (25-11))
	MY_ROR y1, (22-13) ; y1 = a >> (22-13)
	mov y2, f ; y2 = f
	xor y1, a ; y1 = a ^ (a >> (22-13)
	MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))

	vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}

	xor y2, g ; y2 = f^g

	vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}

	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e

	vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}

	MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	vpxor XTMP2, XTMP2, XTMP3
	add y2, y0 ; y2 = S1 + CH
	MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
	vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a
	vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c
	vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	;; compute high s1
	vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
	mov y0, e ; y0 = e
	MY_ROR y0, (25-11) ; y0 = e >> (25-11)
	mov y1, a ; y1 = a
	;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
	MY_ROR y1, (22-13) ; y1 = a >> (22-13)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	mov y2, f ; y2 = f
	MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))

	vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}

	xor y1, a ; y1 = a ^ (a >> (22-13)
	xor y2, g ; y2 = f^g

	vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}

	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))

	vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}

	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	xor y2, g ; y2 = CH = ((f^g)&e)^g

	vpxor XTMP2, XTMP2, XTMP3

	MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, y0 ; y2 = S1 + CH
	add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
	vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a
	vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c
	vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
%endm

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
	mov y0, e ; y0 = e
	MY_ROR y0, (25-11) ; y0 = e >> (25-11)
	mov y1, a ; y1 = a
	xor y0, e ; y0 = e ^ (e >> (25-11))
	MY_ROR y1, (22-13) ; y1 = a >> (22-13)
	mov y2, f ; y2 = f
	xor y1, a ; y1 = a ^ (a >> (22-13)
	MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor y2, g ; y2 = f^g
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	and y2, e ; y2 = (f^g)&e
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	add y2, y0 ; y2 = S1 + CH
	MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a
	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
section .text
global sha256_avx
align 32
sha256_avx:
	push rbx
%ifndef LINUX
	push rsi
	push rdi
%endif
	push rbp
	push r13
	push r14
	push r15

	sub rsp,STACK_SIZE
%ifndef LINUX
	vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
	vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
	vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
	vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
	vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
	vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
	vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
	vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
%endif

	shl NUM_BLKS, 6 ; convert to bytes
	jz done_hash
	add NUM_BLKS, INP ; pointer to end of data
	mov [rsp + _INP_END], NUM_BLKS

	;; load initial digest
	mov a,[4*0 + CTX]
	mov b,[4*1 + CTX]
	mov c,[4*2 + CTX]
	mov d,[4*3 + CTX]
	mov e,[4*4 + CTX]
	mov f,[4*5 + CTX]
	mov g,[4*6 + CTX]
	mov h,[4*7 + CTX]

	vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
	vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
	vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

loop0:
	lea TBL,[K256 wrt rip]

	;; byte swap first 16 dwords
	COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK

	mov [rsp + _INP], INP

	;; schedule 48 input dwords, by doing 3 rounds of 16 each
	mov SRND, 3
align 16
loop1:
	vpaddd XFER, X0, [TBL + 0*16]
	vmovdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd XFER, X0, [TBL + 1*16]
	vmovdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd XFER, X0, [TBL + 2*16]
	vmovdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd XFER, X0, [TBL + 3*16]
	vmovdqa [rsp + _XFER], XFER
	add TBL, 4*16
	FOUR_ROUNDS_AND_SCHED

	sub SRND, 1
	jne loop1

	mov SRND, 2
loop2:
	vpaddd XFER, X0, [TBL + 0*16]
	vmovdqa [rsp + _XFER], XFER
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vpaddd XFER, X1, [TBL + 1*16]
	vmovdqa [rsp + _XFER], XFER
	add TBL, 2*16
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vmovdqa X0, X2
	vmovdqa X1, X3

	sub SRND, 1
	jne loop2

	addm [4*0 + CTX],a
	addm [4*1 + CTX],b
	addm [4*2 + CTX],c
	addm [4*3 + CTX],d
	addm [4*4 + CTX],e
	addm [4*5 + CTX],f
	addm [4*6 + CTX],g
	addm [4*7 + CTX],h

	mov INP, [rsp + _INP]
	add INP, 64
	cmp INP, [rsp + _INP_END]
	jne loop0

done_hash:
%ifndef LINUX
	vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
	vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
	vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
	vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
	vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
	vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
	vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
	vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
%endif

	add rsp, STACK_SIZE

	pop r15
	pop r14
	pop r13
	pop rbp
%ifndef LINUX
	pop rdi
	pop rsi
%endif
	pop rbx

	ret

section .data
align 64
K256:
	dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
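A minimal driver sketch for the routine above (my own, not part of the commit; assumes an AVX-capable CPU and a Linux/ELF build linked against sha256_avx1.o). sha256_avx() only compresses whole pre-padded 64-byte blocks — padding and length encoding live in sha256.c — so hashing the empty message means feeding the single canonical padding block:

	#include <stdint.h>
	#include <string.h>

	/* Declaration matches crypto/sha2/sha256.h */
	extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);

	int
	main(void)
	{
		/* SHA-256 initial hash values, same constants as iv256[] in sha256.c */
		uint32_t digest[8] = {
			0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
			0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
		};
		uint8_t block[64];

		/* Pre-padded block for the empty message: 0x80 then zeros,
		 * including the 8-byte zero bit-length at the end. */
		memset(block, 0, sizeof (block));
		block[0] = 0x80;

		sha256_avx(block, digest, 1);
		/* digest[0..7] now holds SHA-256("") as big-endian words,
		 * i.e. digest[0] == 0xe3b0c442. */
		return (0);
	}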
crypto/sha2/intel/sha256_sse4.asm (new file, 535 lines)

@@ -0,0 +1,535 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright 2012 Intel Corporation All Rights Reserved.
;
; The source code contained or described herein and all documents
; related to the source code ("Material") are owned by Intel Corporation
; or its suppliers or licensors. Title to the Material remains with
; Intel Corporation or its suppliers and licensors. The Material may
; contain trade secrets and proprietary and confidential information of
; Intel Corporation and its suppliers and licensors, and is protected by
; worldwide copyright and trade secret laws and treaty provisions. No
; part of the Material may be used, copied, reproduced, modified,
; published, uploaded, posted, transmitted, distributed, or disclosed in
; any way without Intel's prior express written permission.
;
; No license under any patent, copyright, trade secret or other
; intellectual property right is granted to or conferred upon you by
; disclosure or delivery of the Materials, either expressly, by
; implication, inducement, estoppel or otherwise. Any license under such
; intellectual property rights must be express and approved by Intel in
; writing.
;
; Unless otherwise agreed by Intel in writing, you may not remove or
; alter this notice or any other notice embedded in Materials by Intel
; or Intel's suppliers or licensors in any way.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
; Linux:   yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define MOVDQ movdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
	add %2, %1
	mov %1, %2
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
	MOVDQ %1, %2
	pshufb %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm12

%ifdef LINUX
%define NUM_BLKS rdx ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND rdi ; clobbers INP
%define c ecx
%define d r8d
%define e edx
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg

%define SRND rcx ; clobbers INP
%define c edi
%define d esi
%define e r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d

_INP_END_SIZE equ 8
_INP_SIZE equ 8
_XFER_SIZE equ 8
%ifdef LINUX
_XMM_SAVE_SIZE equ 0
%else
_XMM_SAVE_SIZE equ 7*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE equ 8

_INP_END equ 0
_INP equ _INP_END + _INP_END_SIZE
_XFER equ _INP + _INP_SIZE
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

%macro FOUR_ROUNDS_AND_SCHED 0
	;; compute s0 four at a time and s1 two at a time
	;; compute W[-16] + W[-7] 4 at a time
	movdqa XTMP0, X3
	mov y0, e ; y0 = e
	ror y0, (25-11) ; y0 = e >> (25-11)
	mov y1, a ; y1 = a
	palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
	ror y1, (22-13) ; y1 = a >> (22-13)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	mov y2, f ; y2 = f
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa XTMP1, X1
	xor y1, a ; y1 = a ^ (a >> (22-13)
	xor y2, g ; y2 = f^g
	paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	;; compute s0
	palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, y0 ; y2 = S1 + CH
	add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
	movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a
	pslld XTMP1, (32-7)
	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c
	psrld XTMP2, 7
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
	mov y0, e ; y0 = e
	mov y1, a ; y1 = a
	movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
	ror y0, (25-11) ; y0 = e >> (25-11)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	mov y2, f ; y2 = f
	ror y1, (22-13) ; y1 = a >> (22-13)
	pslld XTMP3, (32-18)
	xor y1, a ; y1 = a ^ (a >> (22-13)
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor y2, g ; y2 = f^g
	psrld XTMP2, 18
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	pxor XTMP1, XTMP3
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
	add y2, y0 ; y2 = S1 + CH
	add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a
	pxor XTMP1, XTMP4 ; XTMP1 = s0
	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c
	;; compute low s1
	pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
	mov y0, e ; y0 = e
	mov y1, a ; y1 = a
	ror y0, (25-11) ; y0 = e >> (25-11)
	movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
	xor y0, e ; y0 = e ^ (e >> (25-11))
	ror y1, (22-13) ; y1 = a >> (22-13)
	mov y2, f ; y2 = f
	xor y1, a ; y1 = a ^ (a >> (22-13)
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
	xor y2, g ; y2 = f^g
	psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	pxor XTMP2, XTMP3
	add y2, y0 ; y2 = S1 + CH
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
	pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a
	pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c
	paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	;; compute high s1
	pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
	mov y0, e ; y0 = e
	ror y0, (25-11) ; y0 = e >> (25-11)
	mov y1, a ; y1 = a
	movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
	ror y1, (22-13) ; y1 = a >> (22-13)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	mov y2, f ; y2 = f
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
	xor y1, a ; y1 = a ^ (a >> (22-13)
	xor y2, g ; y2 = f^g
	psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	pxor XTMP2, XTMP3
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, y0 ; y2 = S1 + CH
	add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
	pxor X0, XTMP2 ; X0 = s1 {xDxC}
	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a
	pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c
	paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
%endm

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
	mov y0, e ; y0 = e
	ror y0, (25-11) ; y0 = e >> (25-11)
	mov y1, a ; y1 = a
	xor y0, e ; y0 = e ^ (e >> (25-11))
	ror y1, (22-13) ; y1 = a >> (22-13)
	mov y2, f ; y2 = f
	xor y1, a ; y1 = a ^ (a >> (22-13)
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor y2, g ; y2 = f^g
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	and y2, e ; y2 = (f^g)&e
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	add y2, y0 ; y2 = S1 + CH
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
	mov y0, a ; y0 = a
	add h, y2 ; h = h + S1 + CH + k + w
	mov y2, a ; y2 = a
	or y0, c ; y0 = a|c
	add d, h ; d = d + h + S1 + CH + k + w
	and y2, c ; y2 = a&c
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
section .text
global sha256_sse4
align 32
sha256_sse4:
	push rbx
%ifndef LINUX
	push rsi
	push rdi
%endif
	push rbp
	push r13
	push r14
	push r15

	sub rsp,STACK_SIZE
%ifndef LINUX
	movdqa [rsp + _XMM_SAVE + 0*16],xmm6
	movdqa [rsp + _XMM_SAVE + 1*16],xmm7
	movdqa [rsp + _XMM_SAVE + 2*16],xmm8
	movdqa [rsp + _XMM_SAVE + 3*16],xmm9
	movdqa [rsp + _XMM_SAVE + 4*16],xmm10
	movdqa [rsp + _XMM_SAVE + 5*16],xmm11
	movdqa [rsp + _XMM_SAVE + 6*16],xmm12
%endif

	shl NUM_BLKS, 6 ; convert to bytes
	jz done_hash
	add NUM_BLKS, INP ; pointer to end of data
	mov [rsp + _INP_END], NUM_BLKS

	;; load initial digest
	mov a,[4*0 + CTX]
	mov b,[4*1 + CTX]
	mov c,[4*2 + CTX]
	mov d,[4*3 + CTX]
	mov e,[4*4 + CTX]
	mov f,[4*5 + CTX]
	mov g,[4*6 + CTX]
	mov h,[4*7 + CTX]

	movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
	movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
	movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

loop0:
	lea TBL,[K256 wrt rip]

	;; byte swap first 16 dwords
	COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK

	mov [rsp + _INP], INP

	;; schedule 48 input dwords, by doing 3 rounds of 16 each
	mov SRND, 3
align 16
loop1:
	movdqa XFER, [TBL + 0*16]
	paddd XFER, X0
	movdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	movdqa XFER, [TBL + 1*16]
	paddd XFER, X0
	movdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	movdqa XFER, [TBL + 2*16]
	paddd XFER, X0
	movdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	movdqa XFER, [TBL + 3*16]
	paddd XFER, X0
	movdqa [rsp + _XFER], XFER
	add TBL, 4*16
	FOUR_ROUNDS_AND_SCHED

	sub SRND, 1
	jne loop1

	mov SRND, 2
loop2:
	paddd X0, [TBL + 0*16]
	movdqa [rsp + _XFER], X0
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3
	paddd X1, [TBL + 1*16]
	movdqa [rsp + _XFER], X1
	add TBL, 2*16
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	movdqa X0, X2
	movdqa X1, X3

	sub SRND, 1
	jne loop2

	addm [4*0 + CTX],a
	addm [4*1 + CTX],b
	addm [4*2 + CTX],c
	addm [4*3 + CTX],d
	addm [4*4 + CTX],e
	addm [4*5 + CTX],f
	addm [4*6 + CTX],g
	addm [4*7 + CTX],h

	mov INP, [rsp + _INP]
	add INP, 64
	cmp INP, [rsp + _INP_END]
	jne loop0

done_hash:
%ifndef LINUX
	movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
	movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
	movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
	movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
	movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
	movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
	movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
%endif

	add rsp, STACK_SIZE

	pop r15
	pop r14
	pop r13
	pop rbp
%ifndef LINUX
	pop rdi
	pop rsi
%endif
	pop rbx

	ret

section .data
align 64
K256:
	dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
crypto/sha2/sha256.c (new file, 210 lines)

@@ -0,0 +1,210 @@
/*-
 * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
 * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Define WORDS_BIGENDIAN if compiling on a big-endian architecture.
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */

#if HAVE_INTTYPES_H
# include <inttypes.h>
#else
# if HAVE_STDINT_H
#  include <stdint.h>
# endif
#endif

#include <pthread.h>
#include <string.h>
#include <utils.h>
#include <sha256.h>

#ifdef WORDS_BIGENDIAN

#define BYTESWAP(x) (x)
#define BYTESWAP64(x) (x)

#else /* WORDS_BIGENDIAN */

#define BYTESWAP(x) htonl(x)
#define BYTESWAP64(x) htonll(x)

#endif /* WORDS_BIGENDIAN */

typedef void (*update_func_ptr)(void *input_data, uint32_t digest[8], uint64_t num_blks);

static uint8_t padding[64] = {
	0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

static const uint32_t iv256[SHA256_HASH_WORDS] = {
	0x6a09e667L,
	0xbb67ae85L,
	0x3c6ef372L,
	0xa54ff53aL,
	0x510e527fL,
	0x9b05688cL,
	0x1f83d9abL,
	0x5be0cd19L
};

static update_func_ptr sha_update_func;

int
APS_NAMESPACE(Init_SHA) (processor_info_t *pc)
{
	if (pc->proc_type == PROC_X64_INTEL) {
		if (pc->avx_level > 0) {
			sha_update_func = sha256_avx;

		} else if (pc->sse_level >= 4) {
			sha_update_func = sha256_sse4;

		} else {
			return (1);
		}
		return (0);
	}
	return (1);
}

static void
_init (SHA256_Context *sc, const uint32_t iv[SHA256_HASH_WORDS])
{
	int i;

	/*
	 * SHA256_HASH_WORDS is 8, must be 8, cannot be anything but 8!
	 * So we unroll a loop here.
	 */
	sc->hash[0] = iv[0];
	sc->hash[1] = iv[1];
	sc->hash[2] = iv[2];
	sc->hash[3] = iv[3];
	sc->hash[4] = iv[4];
	sc->hash[5] = iv[5];
	sc->hash[6] = iv[6];
	sc->hash[7] = iv[7];

	sc->totalLength = 0LL;
	sc->bufferLength = 0L;
}

void
APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc)
{
	_init (sc, iv256);
}

void
APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, void *vdata, size_t len)
{
	const uint8_t *data = vdata;
	uint32_t bufferBytesLeft;
	size_t bytesToCopy;
	int rem;

	if (sc->bufferLength) {
		do {
			bufferBytesLeft = 64L - sc->bufferLength;
			bytesToCopy = bufferBytesLeft;
			if (bytesToCopy > len)
				bytesToCopy = len;

			memcpy (&sc->buffer.bytes[sc->bufferLength], data, bytesToCopy);
			sc->totalLength += bytesToCopy * 8L;
			sc->bufferLength += bytesToCopy;
			data += bytesToCopy;
			len -= bytesToCopy;

			if (sc->bufferLength == 64L) {
				sc->blocks = 1;
				sha_update_func(sc->buffer.words, sc->hash, sc->blocks);
				sc->bufferLength = 0L;
			} else {
				return;
			}
		} while (len > 0 && len <= 64L);
		if (!len) return;
	}

	sc->blocks = len >> 6;
	rem = len - (sc->blocks << 6);
	len = sc->blocks << 6;
	sc->totalLength += rem * 8L;

	if (len) {
		sc->totalLength += len * 8L;
		sha_update_func((uint32_t *)data, sc->hash, sc->blocks);
	}
	if (rem) {
		memcpy (&sc->buffer.bytes[0], data + len, rem);
		sc->bufferLength = rem;
	}
}

static void
_final (SHA256_Context *sc, uint8_t *hash, int hashWords)
{
	uint32_t bytesToPad;
	uint64_t lengthPad;
	int i;

	bytesToPad = 120L - sc->bufferLength;
	if (bytesToPad > 64L)
		bytesToPad -= 64L;

	lengthPad = BYTESWAP64(sc->totalLength);

	APS_NAMESPACE(SHA256_Update) (sc, padding, bytesToPad);
	APS_NAMESPACE(SHA256_Update) (sc, &lengthPad, 8L);

	if (hash) {
		for (i = 0; i < hashWords; i++) {
			hash[0] = (uint8_t) (sc->hash[i] >> 24);
			hash[1] = (uint8_t) (sc->hash[i] >> 16);
			hash[2] = (uint8_t) (sc->hash[i] >> 8);
			hash[3] = (uint8_t) sc->hash[i];
			hash += 4;
		}
	}
}

void
APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE])
{
	_final (sc, hash, SHA256_HASH_WORDS);
}
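A quick check of the padding arithmetic in _final() above: the target is to leave the buffer at 56 bytes mod 64 so the 8-byte big-endian bit length completes a block. bytesToPad is 120 - bufferLength, minus 64 when that exceeds 64: bufferLength 0 gives 120, reduced to 56 (56 + 8 = one 64-byte block), while bufferLength 56 gives exactly 64, kept as-is, so the length spills into a fresh block (56 + 64 + 8 = 128, two blocks).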
crypto/sha2/sha256.h (new file, 81 lines)

@@ -0,0 +1,81 @@
/*-
 * Copyright (c) 2001-2003 Allan Saddi <allan@saddi.com>
 * Copyright (c) 2012 Moinak Ghosh moinakg <at1> gm0il <dot> com
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _APS_SHA256_H
#define _APS_SHA256_H

#if HAVE_INTTYPES_H
# include <inttypes.h>
#else
# if HAVE_STDINT_H
#  include <stdint.h>
# endif
#endif

#include <utils.h>

#define SHA256_HASH_SIZE 32

/* Hash size in 32-bit words */
#define SHA256_HASH_WORDS 8

typedef struct _SHA256_Context {
	uint64_t totalLength, blocks;
	uint32_t hash[SHA256_HASH_WORDS];
	uint32_t bufferLength;
	union {
		uint32_t words[16];
		uint8_t bytes[64];
	} buffer;
} SHA256_Context;

#ifdef __cplusplus
extern "C" {
#endif

#ifndef APS_NAMESPACE
#define APS_NAMESPACE(name) opt_##name
#endif /* !APS_NAMESPACE */

void APS_NAMESPACE(SHA256_Init) (SHA256_Context *sc);
void APS_NAMESPACE(SHA256_Update) (SHA256_Context *sc, void *data, size_t len);
void APS_NAMESPACE(SHA256_Final) (SHA256_Context *sc, uint8_t hash[SHA256_HASH_SIZE]);
int APS_NAMESPACE(Init_SHA) (processor_info_t *pc);

/*
 * Intel's optimized SHA256 core routines. These routines are described in an
 * Intel White-Paper:
 * "Fast SHA-256 Implementations on Intel Architecture Processors"
 */
extern void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);
extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);

#ifdef __cplusplus
}
#endif

#endif /* !_APS_SHA256_H */
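A usage sketch for the API above (my own, not in the commit). APS_NAMESPACE(name) expands to opt_##name, so callers see opt_SHA256_Init() and friends; the cpuid_basic_identify() prototype is assumed to be among the declarations in utils/cpuid.h (shown truncated below):

	#include <stdio.h>
	#include <utils.h>
	#include <sha256.h>
	#include <cpuid.h>

	int
	main(void)
	{
		processor_info_t pc;
		SHA256_Context ctx;
		uint8_t hash[SHA256_HASH_SIZE];
		char msg[] = "hello";
		int i;

		cpuid_basic_identify(&pc);	/* fills proc_type, sse_level, avx_level */
		if (opt_Init_SHA(&pc) != 0) {
			fprintf(stderr, "no SSE4/AVX SHA-256 path on this CPU\n");
			return (1);
		}

		opt_SHA256_Init(&ctx);
		opt_SHA256_Update(&ctx, msg, sizeof (msg) - 1);
		opt_SHA256_Final(&ctx, hash);

		for (i = 0; i < SHA256_HASH_SIZE; i++)
			printf("%02x", hash[i]);
		printf("\n");
		return (0);
	}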
utils/cpuid.c (new file, 134 lines)

@@ -0,0 +1,134 @@
/*
 * Copyright 2008 Veselin Georgiev,
 * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <string.h>
#include "utils.h"
#include "cpuid.h"

#ifdef __x86_64__
/* exec_cpuid(): regs[0..3] carry eax, ebx, ecx, edx both in and out. */
void
exec_cpuid(uint32_t *regs)
{
#ifdef __GNUC__
	__asm __volatile(
	    " push %%rbx\n"
	    " push %%rcx\n"
	    " push %%rdx\n"
	    " push %%rdi\n"

	    " mov %0, %%rdi\n"

	    " mov (%%rdi), %%eax\n"
	    " mov 4(%%rdi), %%ebx\n"
	    " mov 8(%%rdi), %%ecx\n"
	    " mov 12(%%rdi), %%edx\n"

	    " cpuid\n"

	    " movl %%eax, (%%rdi)\n"
	    " movl %%ebx, 4(%%rdi)\n"
	    " movl %%ecx, 8(%%rdi)\n"
	    " movl %%edx, 12(%%rdi)\n"
	    " pop %%rdi\n"
	    " pop %%rdx\n"
	    " pop %%rcx\n"
	    " pop %%rbx\n"
	    :
	    :"rdi"(regs)	/* note: "rdi" is the constraint set r/d/i, not the %rdi register */
	    :"memory", "eax"
	);
#else
#error "Unsupported compiler"
#endif
}

static void
cpu_exec_cpuid(uint32_t eax, uint32_t* regs)
{
	regs[0] = eax;
	regs[1] = regs[2] = regs[3] = 0;
	exec_cpuid(regs);
}

static void
cpu_exec_cpuid_ext(uint32_t* regs)
{
	exec_cpuid(regs);
}

void
cpuid_get_raw_data(struct cpu_raw_data_t* data)
{
	unsigned i;
	for (i = 0; i < 32; i++)
		cpu_exec_cpuid(i, data->basic_cpuid[i]);
	for (i = 0; i < 32; i++)
		cpu_exec_cpuid(0x80000000 + i, data->ext_cpuid[i]);
	for (i = 0; i < 4; i++) {
		memset(data->intel_fn4[i], 0, sizeof(data->intel_fn4[i]));
		data->intel_fn4[i][0] = 4;
		data->intel_fn4[i][2] = i;
		cpu_exec_cpuid_ext(data->intel_fn4[i]);
	}
}

void
cpuid_basic_identify(processor_info_t *pc)
{
	struct cpu_raw_data_t raw;
	cpuid_get_raw_data(&raw);

	/* Leaf 0 returns the vendor string split across ebx, edx, ecx. */
	memcpy(raw.vendor_str + 0, &raw.basic_cpuid[0][1], 4);
	memcpy(raw.vendor_str + 4, &raw.basic_cpuid[0][3], 4);
	memcpy(raw.vendor_str + 8, &raw.basic_cpuid[0][2], 4);
	raw.vendor_str[12] = 0;
	pc->avx_level = 0;
	pc->sse_level = 0;

	if (strcmp(raw.vendor_str, "GenuineIntel") == 0) {
		pc->proc_type = PROC_X64_INTEL;

		pc->sse_level = 2;
		if (raw.basic_cpuid[0][0] >= 1) {
			// ECX has the SSE 4.2 and AVX flags:
			// bit 20 is SSE 4.2 and bit 28 indicates AVX
			if (raw.basic_cpuid[1][2] & (1 << 20)) {
				pc->sse_level = 4;
			} else {
				pc->sse_level = 3;
			}
			pc->avx_level = 0;
			if (raw.basic_cpuid[1][2] & (1 << 28)) {
				pc->avx_level = 1;
			}
		}
	} else if (strcmp(raw.vendor_str, "AuthenticAMD") == 0) {
		pc->proc_type = PROC_X64_AMD;
		pc->sse_level = 2;
	}
}

#endif
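For comparison only: modern GCC and Clang expose the same leaf-1 feature query through their <cpuid.h> builtin header, which this commit does not use. A minimal standalone probe (illustrative, not part of the commit):

#include <stdio.h>
#include <cpuid.h>	/* GCC/Clang builtin wrapper; assumed available */

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 1 feature flags: ECX bit 20 = SSE4.2, bit 28 = AVX. */
	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
		printf("SSE4.2: %s\n", (ecx & (1U << 20)) ? "yes" : "no");
		printf("AVX:    %s\n", (ecx & (1U << 28)) ? "yes" : "no");
	}
	return (0);
}

Strictly speaking, CPUID bit 28 only advertises the AVX instruction set; a fully robust probe would also check the OSXSAVE bit (ECX bit 27) and the XGETBV-reported state before running sha256_avx, a verification the detection above omits.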

61
utils/cpuid.h
Normal file

@ -0,0 +1,61 @@
/*
 * Copyright 2008 Veselin Georgiev,
 * anrieffNOSPAM @ mgail_DOT.com (convert to gmail)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef __CPUID_H__
#define __CPUID_H__

#ifdef __x86_64__
#define VENDOR_STR_MAX      16
#define BRAND_STR_MAX       64
#define CPU_FLAGS_MAX       128
#define MAX_CPUID_LEVEL     32
#define MAX_EXT_CPUID_LEVEL 32
#define MAX_INTELFN4_LEVEL  4

/**
 * This contains only the most basic CPU data, required to do identification
 * and feature recognition. Every processor should be identifiable using this
 * data only.
 */
struct cpu_raw_data_t {
	/** contains results of CPUID for eax = 0, 1, ... */
	uint32_t basic_cpuid[MAX_CPUID_LEVEL][4];

	/** contains results of CPUID for eax = 0x80000000, 0x80000001, ... */
	uint32_t ext_cpuid[MAX_EXT_CPUID_LEVEL][4];

	/** when the CPU is Intel and it supports deterministic cache
	    information: this contains the results of CPUID for eax = 4
	    and ecx = 0, 1, ... */
	uint32_t intel_fn4[MAX_INTELFN4_LEVEL][4];
	char vendor_str[VENDOR_STR_MAX];
};

void exec_cpuid(uint32_t *regs);
void cpuid_get_raw_data(struct cpu_raw_data_t* data);
void cpuid_basic_identify(processor_info_t *pc);
#endif /* __x86_64__ */

#endif /* __CPUID_H__ */
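As a standalone illustration (not from the commit), the raw-data API can be exercised directly; leaf 0 returns the vendor string split across ebx, edx, ecx, which is the copy order cpuid_basic_identify relies on to recover "GenuineIntel" or "AuthenticAMD":

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "utils.h"	/* processor_info_t */
#include "cpuid.h"

int
main(void)
{
	struct cpu_raw_data_t raw;
	char vendor[VENDOR_STR_MAX];

	cpuid_get_raw_data(&raw);
	memcpy(vendor + 0, &raw.basic_cpuid[0][1], 4);	/* ebx */
	memcpy(vendor + 4, &raw.basic_cpuid[0][3], 4);	/* edx */
	memcpy(vendor + 8, &raw.basic_cpuid[0][2], 4);	/* ecx */
	vendor[12] = '\0';
	printf("CPU vendor: %s\n", vendor);
	return (0);
}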
|
||||
|
|
@ -34,24 +34,36 @@
|
|||
#include <rabin_dedup.h>
#include <skein.h>
#include <openssl/sha.h>
#include <sha256.h>

#include "utils.h"
#include "cpuid.h"

#define PROVIDER_OPENSSL 0
#define PROVIDER_X64_OPT 1

static void init_sha256(void);

/*
 * Checksum properties
 */
typedef void (*ckinit_func_ptr)(void);
static struct {
	char *name;
	cksum_t cksum_id;
	int bytes;
	ckinit_func_ptr init_func;
} cksum_props[] = {
	{"CRC64", CKSUM_CRC64, 8},
	{"SKEIN256", CKSUM_SKEIN256, 32},
	{"SKEIN512", CKSUM_SKEIN512, 64},
	{"SHA256", CKSUM_SHA256, 32},
	{"SHA512", CKSUM_SHA512, 64}
	{"CRC64", CKSUM_CRC64, 8, NULL},
	{"SKEIN256", CKSUM_SKEIN256, 32, NULL},
	{"SKEIN512", CKSUM_SKEIN512, 64, NULL},
	{"SHA256", CKSUM_SHA256, 32, init_sha256},
	{"SHA512", CKSUM_SHA512, 64, NULL}
};

static int cksum_provider = PROVIDER_OPENSSL;

extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
extern uint64_t lzma_crc64_8bchk(const uint8_t *buf, size_t size,
	uint64_t crc, uint64_t *cnt);
@ -339,12 +351,19 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, ssize_t bytes)
		Skein_512_Final(&ctx, cksum_buf);

	} else if (cksum == CKSUM_SHA256) {
		SHA256_CTX ctx;
		if (cksum_provider == PROVIDER_OPENSSL) {
			SHA256_CTX ctx;

		SHA256_Init(&ctx);
		SHA256_Update(&ctx, buf, bytes);
		SHA256_Final(cksum_buf, &ctx);
			SHA256_Init(&ctx);
			SHA256_Update(&ctx, buf, bytes);
			SHA256_Final(cksum_buf, &ctx);
		} else {
			SHA256_Context ctx;

			opt_SHA256_Init(&ctx);
			opt_SHA256_Update(&ctx, buf, bytes);
			opt_SHA256_Final(&ctx, cksum_buf);
		}
	} else if (cksum == CKSUM_SHA512) {
		SHA512_CTX ctx;
@ -357,6 +376,26 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, ssize_t bytes)
	return (0);
}

static void
init_sha256(void)
{
#ifdef WORDS_BIGENDIAN
	cksum_provider = PROVIDER_OPENSSL;
#else
#ifdef __x86_64__
	processor_info_t pc;

	cksum_provider = PROVIDER_OPENSSL;
	cpuid_basic_identify(&pc);
	if (pc.proc_type == PROC_X64_INTEL || pc.proc_type == PROC_X64_AMD) {
		if (opt_Init_SHA(&pc) == 0) {
			cksum_provider = PROVIDER_X64_OPT;
		}
	}
#endif
#endif
}
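The body of opt_Init_SHA is not part of these hunks. A plausible sketch of the dispatch it performs, assuming an internal function pointer bound to one of the two Intel routines declared in sha256.h; the names sha256_transform and opt_Init_SHA_sketch are illustrative:

/* Illustrative only: how the optimized block routine might be chosen. */
typedef void (*sha256_block_fn)(void *input_data, uint32_t digest[8],
    uint64_t num_blks);
static sha256_block_fn sha256_transform;

int
opt_Init_SHA_sketch(processor_info_t *pc)
{
	if (pc->avx_level >= 1) {
		sha256_transform = sha256_avx;	/* AVX1 code path */
		return (0);
	}
	if (pc->sse_level >= 4) {
		sha256_transform = sha256_sse4;	/* SSE4 code path */
		return (0);
	}
	return (1);	/* no usable extension; caller stays on OpenSSL */
}

A non-zero return maps onto init_sha256's fallback above: cksum_provider stays PROVIDER_OPENSSL unless the probe succeeds.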

/*
 * Check if either the given checksum name or id is valid and
 * return its properties.

@ -371,6 +410,8 @@ get_checksum_props(char *name, int *cksum, int *cksum_bytes)
		    (*cksum != 0 && *cksum == cksum_props[i].cksum_id)) {
			*cksum = cksum_props[i].cksum_id;
			*cksum_bytes = cksum_props[i].bytes;
			if (cksum_props[i].init_func)
				cksum_props[i].init_func();
			return (0);
		}
	}

@ -133,6 +133,19 @@ typedef enum {
	DECOMPRESS_THREADS
} algo_threads_type_t;

typedef enum {
	PROC_BIGENDIAN_GENERIC = 1,
	PROC_LITENDIAN_GENERIC,
	PROC_X64_INTEL,
	PROC_X64_AMD
} proc_type_t;

typedef struct {
	int sse_level;
	int avx_level;
	proc_type_t proc_type;
} processor_info_t;

extern void err_exit(int show_errno, const char *format, ...);
extern const char *get_execname(const char *);
extern int parse_numeric(ssize_t *val, const char *str);