Add optimized BLAKE2 implementations with runtime detection of CPU capability (SSE/AVX).

Minor cleanups.
This commit is contained in:
Moinak Ghosh 2013-01-26 15:39:10 +05:30
parent 43af97042a
commit d08b5ea399
25 changed files with 18504 additions and 101 deletions

View file

@ -43,6 +43,22 @@ XXHASH_SSE2_SRCS = utils/xxhash_sse2.c
XXHASH_OBJS = utils/xxhash_sse4.o utils/xxhash_sse2.o XXHASH_OBJS = utils/xxhash_sse4.o utils/xxhash_sse2.o
XXHASH_HDRS = utils/xxhash.h XXHASH_HDRS = utils/xxhash.h
BLAKE2b_SSE2 = crypto/blake2/blake2b_sse2.c
BLAKE2b_SSE3 = crypto/blake2/blake2b_ssse3.c
BLAKE2b_SSE4 = crypto/blake2/blake2b_sse41.c
BLAKE2b_AVX = crypto/blake2/blake2b_avx.c
BLAKE2bp_SSE2 = crypto/blake2/blake2bp_sse2.c
BLAKE2bp_SSE3 = crypto/blake2/blake2bp_ssse3.c
BLAKE2bp_SSE4 = crypto/blake2/blake2bp_sse41.c
BLAKE2bp_AVX = crypto/blake2/blake2bp_avx.c
BLAKE2_BASE_SRCS = crypto/blake2/blake2b.c crypto/blake2/blake2bp.c
BLAKE2_HDRS = crypto/blake2/blake2.h crypto/blake2/blake2-impl.h crypto/blake2/blake2-config.h \
crypto/blake2/blake2-kat.h crypto/blake2/blake2b-round.h crypto/blake2/blake2b-load-sse2.h \
crypto/blake2/blake2b-load-sse41.h
BLAKE2_SRCS = $(BLAKE2b_SSE2) $(BLAKE2b_SSE3) $(BLAKE2b_SSE4) $(BLAKE2b_AVX) \
$(BLAKE2bp_SSE2) $(BLAKE2bp_SSE3) $(BLAKE2bp_SSE4) $(BLAKE2bp_AVX)
BLAKE2_OBJS = $(BLAKE2_SRCS:.c=.o)
ZLIB_SRCS = zlib_compress.c ZLIB_SRCS = zlib_compress.c
ZLIB_HDRS = $(MAINHDRS) ZLIB_HDRS = $(MAINHDRS)
ZLIB_OBJS = $(ZLIB_SRCS:.c=.o) ZLIB_OBJS = $(ZLIB_SRCS:.c=.o)
@ -152,7 +168,7 @@ COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT
-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \ -DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I./crypto/sha2 \ -I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I./crypto/sha2 \
-I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \ -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \
-I./crypto/keccak -I./transpose $(EXTRA_CPPFLAGS) -pedantic -Wall -std=gnu99 \ -I./crypto/keccak -I./transpose -I./crypto/blake2 $(EXTRA_CPPFLAGS) -pedantic -Wall -std=gnu99 \
-fno-strict-aliasing -Wno-unused-but-set-variable -Wno-enum-compare -fno-strict-aliasing -Wno-unused-but-set-variable -Wno-enum-compare
COMMON_VEC_FLAGS = -ftree-vectorize COMMON_VEC_FLAGS = -ftree-vectorize
COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
@ -161,24 +177,24 @@ LDLIBS = -ldl -L./buildtmp -Wl,-R@LIBBZ2_DIR@ -lbz2 -L./buildtmp -Wl,-R@LIBZ_DIR
OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \ OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
$(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \ $(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
$(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) $(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) $(BLAKE2_OBJS)
DEBUG_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ DEBUG_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ -fopenmp
DEBUG_COMPILE = gcc -g -c @EXTRA_OPT_FLAGS@ DEBUG_COMPILE = gcc -g -c @EXTRA_OPT_FLAGS@
DEBUG_COMPILE_cpp = g++ -g -c @EXTRA_OPT_FLAGS@ DEBUG_COMPILE_cpp = g++ -g -c @EXTRA_OPT_FLAGS@
DEBUG_VEC_FLAGS = DEBUG_VEC_FLAGS =
DEBUG_LOOP_OPTFLAGS = DEBUG_LOOP_OPTFLAGS =
DEBUG_GEN_OPT = -O -fno-omit-frame-pointer @LIBBSCGEN_OPT@ DEBUG_GEN_OPT = -O -fno-omit-frame-pointer @LIBBSCGEN_OPT@ -fopenmp
DEBUG_CPPFLAGS = $(COMMON_CPPFLAGS) DEBUG_CPPFLAGS = $(COMMON_CPPFLAGS)
DEBUG_FPTR_FLAG = DEBUG_FPTR_FLAG =
RELEASE_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ RELEASE_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ -fopenmp
RELEASE_COMPILE = gcc -c @EXTRA_OPT_FLAGS@ RELEASE_COMPILE = gcc -c @EXTRA_OPT_FLAGS@
RELEASE_COMPILE_cpp = g++ -c @EXTRA_OPT_FLAGS@ RELEASE_COMPILE_cpp = g++ -c @EXTRA_OPT_FLAGS@
RELEASE_VEC_FLAGS = $(COMMON_VEC_FLAGS) RELEASE_VEC_FLAGS = $(COMMON_VEC_FLAGS)
RELEASE_LOOP_OPTFLAGS = $(COMMON_LOOP_OPTFLAGS) RELEASE_LOOP_OPTFLAGS = $(COMMON_LOOP_OPTFLAGS)
RELEASE_CPPFLAGS = $(COMMON_CPPFLAGS) -DNDEBUG RELEASE_CPPFLAGS = $(COMMON_CPPFLAGS) -DNDEBUG
RELEASE_GEN_OPT = -O3 @LIBBSCGEN_OPT@ RELEASE_GEN_OPT = -O3 @LIBBSCGEN_OPT@ -fopenmp
RELEASE_FPTR_FLAG = -fomit-frame-pointer RELEASE_FPTR_FLAG = -fomit-frame-pointer
NO_SLAB_CPPFLAGS = -DDEBUG_NO_SLAB NO_SLAB_CPPFLAGS = -DDEBUG_NO_SLAB
@ -193,6 +209,7 @@ CPPFLAGS = @CPPFLAGS@ @NO_SLAB_CPPFLAGS@ @DEBUG_STATS_CPPFLAGS@
GEN_OPT = @GEN_OPT@ @SSE_OPT_FLAGS@ GEN_OPT = @GEN_OPT@ @SSE_OPT_FLAGS@
BASE_OPT = @GEN_OPT@ BASE_OPT = @GEN_OPT@
PREFIX=@PREFIX@ PREFIX=@PREFIX@
AVX_OPT_FLAG = -mavx
SSE4_OPT_FLAG = -msse4.2 SSE4_OPT_FLAG = -msse4.2
SSE3_OPT_FLAG = -mssse3 SSE3_OPT_FLAG = -mssse3
SSE2_OPT_FLAG = -msse2 SSE2_OPT_FLAG = -msse2
@ -270,6 +287,16 @@ $(XXHASH_OBJS): $(XXHASH_SSE4_SRCS) $(XXHASH_SSE2_SRCS) $(XXHASH_HDRS) $(XXHASH_
$(COMPILE) $(BASE_OPT) $(SSE4_OPT_FLAG) $(CPPFLAGS) $(XXHASH_SSE4_SRCS) -o $(XXHASH_SSE4_SRCS:.c=.o) $(COMPILE) $(BASE_OPT) $(SSE4_OPT_FLAG) $(CPPFLAGS) $(XXHASH_SSE4_SRCS) -o $(XXHASH_SSE4_SRCS:.c=.o)
$(COMPILE) $(BASE_OPT) $(SSE2_OPT_FLAG) $(CPPFLAGS) $(XXHASH_SSE2_SRCS) -o $(XXHASH_SSE2_SRCS:.c=.o) $(COMPILE) $(BASE_OPT) $(SSE2_OPT_FLAG) $(CPPFLAGS) $(XXHASH_SSE2_SRCS) -o $(XXHASH_SSE2_SRCS:.c=.o)
$(BLAKE2_OBJS): $(BLAKE2_SRCS) $(BLAKE2_BASE_SRCS) $(BLAKE2_HDRS)
$(COMPILE) $(BASE_OPT) $(SSE2_OPT_FLAG) $(CPPFLAGS) $(BLAKE2b_SSE2) -o $(BLAKE2b_SSE2:.c=.o)
$(COMPILE) $(BASE_OPT) $(SSE3_OPT_FLAG) $(CPPFLAGS) $(BLAKE2b_SSE3) -o $(BLAKE2b_SSE3:.c=.o)
$(COMPILE) $(BASE_OPT) $(SSE4_OPT_FLAG) $(CPPFLAGS) $(BLAKE2b_SSE4) -o $(BLAKE2b_SSE4:.c=.o)
$(COMPILE) $(BASE_OPT) $(AVX_OPT_FLAG) $(CPPFLAGS) $(BLAKE2b_AVX) -o $(BLAKE2b_AVX:.c=.o)
$(COMPILE) $(BASE_OPT) $(SSE2_OPT_FLAG) $(CPPFLAGS) $(BLAKE2bp_SSE2) -o $(BLAKE2bp_SSE2:.c=.o)
$(COMPILE) $(BASE_OPT) $(SSE3_OPT_FLAG) $(CPPFLAGS) $(BLAKE2bp_SSE3) -o $(BLAKE2bp_SSE3:.c=.o)
$(COMPILE) $(BASE_OPT) $(SSE4_OPT_FLAG) $(CPPFLAGS) $(BLAKE2bp_SSE4) -o $(BLAKE2bp_SSE4:.c=.o)
$(COMPILE) $(BASE_OPT) $(AVX_OPT_FLAG) $(CPPFLAGS) $(BLAKE2bp_AVX) -o $(BLAKE2bp_AVX:.c=.o)
$(MAINOBJS): $(MAINSRCS) $(MAINHDRS) $(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
$(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ $(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@

View file

@ -0,0 +1,72 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2_CONFIG_H__
#define __BLAKE2_CONFIG_H__
// These don't work everywhere
#if defined(__SSE2__)
#define HAVE_SSE2
#endif
#if defined(__SSSE3__)
#define HAVE_SSSE3
#endif
#if defined(__SSE4_1__)
#define HAVE_SSE41
#endif
#if defined(__AVX__)
#define HAVE_AVX
#endif
#if defined(__XOP__)
#define HAVE_XOP
#endif
#ifdef HAVE_AVX2
#ifndef HAVE_AVX
#define HAVE_AVX
#endif
#endif
#ifdef HAVE_XOP
#ifndef HAVE_AVX
#define HAVE_AVX
#endif
#endif
#ifdef HAVE_AVX
#ifndef HAVE_SSE41
#define HAVE_SSE41
#endif
#endif
#ifdef HAVE_SSE41
#ifndef HAVE_SSSE3
#define HAVE_SSSE3
#endif
#endif
#ifdef HAVE_SSSE3
#define HAVE_SSE2
#endif
#if !defined(HAVE_SSE2)
#error "This code requires at least SSE2."
#endif
#endif

133
crypto/blake2/blake2-impl.h Normal file
View file

@ -0,0 +1,133 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2_IMPL_H__
#define __BLAKE2_IMPL_H__
#include <stdint.h>
static inline uint32_t load32( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
return *( uint32_t * )( src );
#else
const uint8_t *p = ( uint8_t * )src;
uint32_t w = *p++;
w |= ( uint32_t )( *p++ ) << 8;
w |= ( uint32_t )( *p++ ) << 16;
w |= ( uint32_t )( *p++ ) << 24;
return w;
#endif
}
static inline uint64_t load64( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
return *( uint64_t * )( src );
#else
const uint8_t *p = ( uint8_t * )src;
uint64_t w = *p++;
w |= ( uint64_t )( *p++ ) << 8;
w |= ( uint64_t )( *p++ ) << 16;
w |= ( uint64_t )( *p++ ) << 24;
w |= ( uint64_t )( *p++ ) << 32;
w |= ( uint64_t )( *p++ ) << 40;
w |= ( uint64_t )( *p++ ) << 48;
w |= ( uint64_t )( *p++ ) << 56;
return w;
#endif
}
static inline void store32( void *dst, uint32_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
*( uint32_t * )( dst ) = w;
#else
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
#endif
}
static inline void store64( void *dst, uint64_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
*( uint64_t * )( dst ) = w;
#else
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
#endif
}
static inline uint64_t load48( const void *src )
{
const uint8_t *p = ( const uint8_t * )src;
uint64_t w = *p++;
w |= ( uint64_t )( *p++ ) << 8;
w |= ( uint64_t )( *p++ ) << 16;
w |= ( uint64_t )( *p++ ) << 24;
w |= ( uint64_t )( *p++ ) << 32;
w |= ( uint64_t )( *p++ ) << 40;
return w;
}
static inline void store48( void *dst, uint64_t w )
{
uint8_t *p = ( uint8_t * )dst;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w; w >>= 8;
*p++ = ( uint8_t )w;
}
static inline uint32_t rotl32( const uint32_t w, const unsigned c )
{
return ( w << c ) | ( w >> ( 32 - c ) );
}
static inline uint64_t rotl64( const uint64_t w, const unsigned c )
{
return ( w << c ) | ( w >> ( 64 - c ) );
}
static inline uint32_t rotr32( const uint32_t w, const unsigned c )
{
return ( w >> c ) | ( w << ( 32 - c ) );
}
static inline uint64_t rotr64( const uint64_t w, const unsigned c )
{
return ( w >> c ) | ( w << ( 64 - c ) );
}
/* prevents compiler optimizing out memset() */
static inline void secure_zero_memory( void *v, size_t n )
{
volatile uint8_t *p = ( volatile uint8_t * )v;
while( n-- ) *p++ = 0;
}
#endif

16467
crypto/blake2/blake2-kat.h Normal file

File diff suppressed because it is too large Load diff

133
crypto/blake2/blake2.h Normal file
View file

@ -0,0 +1,133 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2_H__
#define __BLAKE2_H__
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER)
#define BLAKE_ALIGN(x) __declspec(align(x))
#else
#define BLAKE_ALIGN(x) __attribute__ ((__aligned__(x)))
#endif
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2b_constant
{
BLAKE2B_BLOCKBYTES = 128,
BLAKE2B_OUTBYTES = 64,
BLAKE2B_KEYBYTES = 64,
BLAKE2B_SALTBYTES = 16,
BLAKE2B_PERSONALBYTES = 16
};
#pragma pack(push, 1)
typedef struct __blake2b_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint64_t node_offset; // 16
uint8_t node_depth; // 17
uint8_t inner_length; // 18
uint8_t reserved[14]; // 32
uint8_t salt[BLAKE2B_SALTBYTES]; // 48
uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64
} blake2b_param;
BLAKE_ALIGN( 64 ) typedef struct __blake2b_state
{
uint64_t h[8];
uint64_t t[2];
uint64_t f[2];
uint8_t buf[2 * BLAKE2B_BLOCKBYTES];
size_t buflen;
uint8_t last_node;
} blake2b_state;
BLAKE_ALIGN( 64 ) typedef struct __blake2bp_state
{
blake2b_state S[4][1];
blake2b_state R[1];
uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
size_t buflen;
} blake2bp_state;
#pragma pack(pop)
// Streaming API
int blake2b_init_sse2( blake2b_state *S, const uint8_t outlen );
int blake2b_init_key_sse2( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2b_init_param_sse2( blake2b_state *S, const blake2b_param *P );
int blake2b_update_sse2( blake2b_state *S, const uint8_t *in, uint64_t inlen );
int blake2b_final_sse2( blake2b_state *S, uint8_t *out, uint8_t outlen );
int blake2bp_init_sse2( blake2bp_state *S, const uint8_t outlen );
int blake2bp_init_key_sse2( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2bp_update_sse2( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
int blake2bp_final_sse2( blake2bp_state *S, uint8_t *out, uint8_t outlen );
int blake2b_init_ssse3( blake2b_state *S, const uint8_t outlen );
int blake2b_init_key_ssse3( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2b_init_param_ssse3( blake2b_state *S, const blake2b_param *P );
int blake2b_update_ssse3( blake2b_state *S, const uint8_t *in, uint64_t inlen );
int blake2b_final_ssse3( blake2b_state *S, uint8_t *out, uint8_t outlen );
int blake2bp_init_ssse3( blake2bp_state *S, const uint8_t outlen );
int blake2bp_init_key_ssse3( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2bp_update_ssse3( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
int blake2bp_final_ssse3( blake2bp_state *S, uint8_t *out, uint8_t outlen );
int blake2b_init_sse41( blake2b_state *S, const uint8_t outlen );
int blake2b_init_key_sse41( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2b_init_param_sse41( blake2b_state *S, const blake2b_param *P );
int blake2b_update_sse41( blake2b_state *S, const uint8_t *in, uint64_t inlen );
int blake2b_final_sse41( blake2b_state *S, uint8_t *out, uint8_t outlen );
int blake2bp_init_sse41( blake2bp_state *S, const uint8_t outlen );
int blake2bp_init_key_sse41( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2bp_update_sse41( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
int blake2bp_final_sse41( blake2bp_state *S, uint8_t *out, uint8_t outlen );
int blake2b_init_avx( blake2b_state *S, const uint8_t outlen );
int blake2b_init_key_avx( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2b_init_param_avx( blake2b_state *S, const blake2b_param *P );
int blake2b_update_avx( blake2b_state *S, const uint8_t *in, uint64_t inlen );
int blake2b_final_avx( blake2b_state *S, uint8_t *out, uint8_t outlen );
int blake2bp_init_avx( blake2bp_state *S, const uint8_t outlen );
int blake2bp_init_key_avx( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
int blake2bp_update_avx( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
int blake2bp_final_avx( blake2bp_state *S, uint8_t *out, uint8_t outlen );
// Simple API
int blake2b_sse2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2bp_sse2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2b_ssse3( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2bp_ssse3( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2b_sse41( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2bp_sse41( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2b_avx( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
int blake2bp_avx( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
#if defined(__cplusplus)
}
#endif
#endif

View file

@ -0,0 +1,101 @@
#ifndef __BLAKE2_DIGEST_H__
#define __BLAKE2_DIGEST_H__
#include "blake2.h"
#include <cpuid.h>
#if defined(__cplusplus)
extern "C" {
#endif
typedef int (*blake2b_init_funcptr)( blake2b_state *S, const uint8_t outlen );
typedef int (*blake2b_init_key_funcptr)( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
typedef int (*blake2b_init_param_funcptr)( blake2b_state *S, const blake2b_param *P );
typedef int (*blake2b_update_funcptr)( blake2b_state *S, const uint8_t *in, uint64_t inlen );
typedef int (*blake2b_final_funcptr)( blake2b_state *S, uint8_t *out, uint8_t outlen );
typedef int (*blake2bp_init_funcptr)( blake2bp_state *S, const uint8_t outlen );
typedef int (*blake2bp_init_key_funcptr)( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
typedef int (*blake2bp_update_funcptr)( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
typedef int (*blake2bp_final_funcptr)( blake2bp_state *S, uint8_t *out, uint8_t outlen );
typedef int (*blake2b_funcptr)( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
typedef int (*blake2bp_funcptr)( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
/*
* BLAKE2 function pointers. These are set to the optimized routines
* based on CPU capabilities.
*/
struct blake2_dispatch {
blake2b_init_funcptr blake2b_init;
blake2b_init_key_funcptr blake2b_init_key;
blake2b_init_param_funcptr blake2b_init_param;
blake2b_update_funcptr blake2b_update;
blake2b_final_funcptr blake2b_final;
blake2bp_init_funcptr blake2bp_init;
blake2bp_init_key_funcptr blake2bp_init_key;
blake2bp_update_funcptr blake2bp_update;
blake2bp_final_funcptr blake2bp_final;
blake2b_funcptr blake2b;
blake2bp_funcptr blake2bp;
};
static void blake2_module_init(struct blake2_dispatch *dsp, processor_info_t *pc)
{
dsp->blake2b_init = blake2b_init_sse2;
dsp->blake2b_init_key = blake2b_init_key_sse2;
dsp->blake2b_init_param = blake2b_init_param_sse2;
dsp->blake2b_update = blake2b_update_sse2;
dsp->blake2b_final = blake2b_final_sse2;
dsp->blake2bp_init = blake2bp_init_sse2;
dsp->blake2bp_init_key = blake2bp_init_key_sse2;
dsp->blake2bp_update = blake2bp_update_sse2;
dsp->blake2bp_final = blake2bp_final_sse2;
dsp->blake2b = blake2b_sse2;
dsp->blake2bp = blake2bp_sse2;
if (pc->sse_level == 3 && pc->sse_sub_level == 1) {
dsp->blake2b_init = blake2b_init_ssse3;
dsp->blake2b_init_key = blake2b_init_key_ssse3;
dsp->blake2b_init_param = blake2b_init_param_ssse3;
dsp->blake2b_update = blake2b_update_ssse3;
dsp->blake2b_final = blake2b_final_ssse3;
dsp->blake2bp_init = blake2bp_init_ssse3;
dsp->blake2bp_init_key = blake2bp_init_key_ssse3;
dsp->blake2bp_update = blake2bp_update_ssse3;
dsp->blake2bp_final = blake2bp_final_ssse3;
dsp->blake2b = blake2b_ssse3;
dsp->blake2bp = blake2bp_ssse3;
} else if (pc->sse_level == 4 && pc->sse_sub_level >= 1) {
dsp->blake2b_init = blake2b_init_sse41;
dsp->blake2b_init_key = blake2b_init_key_sse41;
dsp->blake2b_init_param = blake2b_init_param_sse41;
dsp->blake2b_update = blake2b_update_sse41;
dsp->blake2b_final = blake2b_final_sse41;
dsp->blake2bp_init = blake2bp_init_sse41;
dsp->blake2bp_init_key = blake2bp_init_key_sse41;
dsp->blake2bp_update = blake2bp_update_sse41;
dsp->blake2bp_final = blake2bp_final_sse41;
dsp->blake2b = blake2b_sse41;
dsp->blake2bp = blake2bp_sse41;
}
if (pc->avx_level >= 1) {
dsp->blake2b_init = blake2b_init_avx;
dsp->blake2b_init_key = blake2b_init_key_avx;
dsp->blake2b_init_param = blake2b_init_param_avx;
dsp->blake2b_update = blake2b_update_avx;
dsp->blake2b_final = blake2b_final_avx;
dsp->blake2bp_init = blake2bp_init_avx;
dsp->blake2bp_init_key = blake2bp_init_key_avx;
dsp->blake2bp_update = blake2bp_update_avx;
dsp->blake2bp_final = blake2bp_final_avx;
dsp->blake2b = blake2b_avx;
dsp->blake2bp = blake2bp_avx;
}
}
#if defined(__cplusplus)
}
#endif
#endif

View file

@ -0,0 +1,68 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE2_H__
#define __BLAKE2B_LOAD_SSE2_H__
#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
#endif

View file

@ -0,0 +1,402 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE41_H__
#define __BLAKE2B_LOAD_SSE41_H__
#define LOAD_MSG_0_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_0_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_1_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_1_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_1_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_1_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#define LOAD_MSG_2_1(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while(0)
#define LOAD_MSG_2_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while(0)
#define LOAD_MSG_2_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while(0)
#define LOAD_MSG_2_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while(0)
#define LOAD_MSG_3_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while(0)
#define LOAD_MSG_3_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_3_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_3_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_4_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while(0)
#define LOAD_MSG_4_2(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_4_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while(0)
#define LOAD_MSG_4_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while(0)
#define LOAD_MSG_5_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_5_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while(0)
#define LOAD_MSG_5_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while(0)
#define LOAD_MSG_5_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while(0)
#define LOAD_MSG_6_1(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while(0)
#define LOAD_MSG_6_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while(0)
#define LOAD_MSG_6_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
} while(0)
#define LOAD_MSG_6_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while(0)
#define LOAD_MSG_7_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while(0)
#define LOAD_MSG_7_2(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_7_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while(0)
#define LOAD_MSG_7_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while(0)
#define LOAD_MSG_8_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while(0)
#define LOAD_MSG_8_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while(0)
#define LOAD_MSG_8_3(b0, b1) \
do \
{ \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while(0)
#define LOAD_MSG_8_4(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while(0)
#define LOAD_MSG_9_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while(0)
#define LOAD_MSG_9_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while(0)
#define LOAD_MSG_9_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while(0)
#define LOAD_MSG_9_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while(0)
#define LOAD_MSG_10_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_10_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_11_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_11_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_11_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_11_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#endif

View file

@ -0,0 +1,160 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_ROUND_H__
#define __BLAKE2B_ROUND_H__
#define LOAD(p) _mm_load_si128( (__m128i *)(p) )
#define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
#define LOADU(p) _mm_loadu_si128( (__m128i *)(p) )
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))
#define LIKELY(x) __builtin_expect((x),1)
/* Microarchitecture-specific macros */
#ifndef HAVE_XOP
#ifdef HAVE_SSSE3
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#else
#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) ))
#endif
#else
/* ... */
#endif
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24); \
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63); \
#if defined(HAVE_SSSE3)
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#else
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = row4l;\
t1 = row2l;\
row4l = row3l;\
row3l = row3h;\
row3h = row4l;\
row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = row3l;\
row3l = row3h;\
row3h = t0;\
t0 = row2l;\
t1 = row4l;\
row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
#endif
#if defined(HAVE_SSE41)
#include "blake2b-load-sse41.h"
#else
#include "blake2b-load-sse2.h"
#endif
#define ROUND(r) \
LOAD_MSG_ ##r ##_1(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_2(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
LOAD_MSG_ ##r ##_3(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_4(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
#endif

435
crypto/blake2/blake2b.c Normal file
View file

@ -0,0 +1,435 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "blake2.h"
#include "blake2-impl.h"
#include "blake2-config.h"
#include <emmintrin.h>
#if defined(HAVE_SSSE3)
#include <tmmintrin.h>
#endif
#if defined(HAVE_SSE41)
#include <smmintrin.h>
#endif
#if defined(__AVX__)
#include <immintrin.h>
#endif
#if defined(HAVE_XOP)
#include <x86intrin.h>
#endif
#include "blake2b-round.h"
#ifndef BLAKE_NAMESPACE
#define BLAKE_NAMESPACE(x) x
#endif
BLAKE_ALIGN( 64 ) static const uint64_t blake2b_IV[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
static const uint8_t blake2b_sigma[12][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
/* Some helper functions, not necessarily useful */
static inline int blake2b_set_lastnode( blake2b_state *S )
{
S->f[1] = ~0ULL;
return 0;
}
static inline int blake2b_clear_lastnode( blake2b_state *S )
{
S->f[1] = 0ULL;
return 0;
}
static inline int blake2b_set_lastblock( blake2b_state *S )
{
if( S->last_node ) blake2b_set_lastnode( S );
S->f[0] = ~0ULL;
return 0;
}
static inline int blake2b_clear_lastblock( blake2b_state *S )
{
if( S->last_node ) blake2b_clear_lastnode( S );
S->f[0] = 0ULL;
return 0;
}
static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
{
#if __x86_64__
// ADD/ADC chain
__uint128_t t = ( ( __uint128_t )S->t[1] << 64 ) | S->t[0];
t += inc;
S->t[0] = ( uint64_t )( t >> 0 );
S->t[1] = ( uint64_t )( t >> 64 );
#else
S->t[0] += inc;
S->t[1] += ( S->t[0] < inc );
#endif
return 0;
}
// Parameter-related functions
static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length )
{
P->digest_length = digest_length;
return 0;
}
static inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout )
{
P->fanout = fanout;
return 0;
}
static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth )
{
P->depth = depth;
return 0;
}
static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length )
{
P->leaf_length = leaf_length;
return 0;
}
static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset )
{
P->node_offset = node_offset;
return 0;
}
static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth )
{
P->node_depth = node_depth;
return 0;
}
static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length )
{
P->inner_length = inner_length;
return 0;
}
static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] )
{
memcpy( P->salt, salt, BLAKE2B_SALTBYTES );
return 0;
}
static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] )
{
memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES );
return 0;
}
static inline int blake2b_init0( blake2b_state *S )
{
memset( S, 0, sizeof( blake2b_state ) );
for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];
return 0;
}
/* init xors IV with input parameter block */
int BLAKE_NAMESPACE(blake2b_init_param) ( blake2b_state *S, const blake2b_param *P )
{
uint8_t *p, *h, *v;
//blake2b_init0( S );
v = ( uint8_t * )( blake2b_IV );
h = ( uint8_t * )( S->h );
p = ( uint8_t * )( P );
/* IV XOR ParamBlock */
memset( S, 0, sizeof( blake2b_state ) );
for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
return 0;
}
/* Some sort of default parameter block initialization, for sequential blake2b */
int BLAKE_NAMESPACE(blake2b_init) ( blake2b_state *S, const uint8_t outlen )
{
if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
const blake2b_param P =
{
outlen,
0,
1,
1,
0,
0,
0,
0,
{0},
{0},
{0}
};
return BLAKE_NAMESPACE(blake2b_init_param) ( S, &P );
}
int BLAKE_NAMESPACE(blake2b_init_key) ( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
{
if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
const blake2b_param P =
{
outlen,
keylen,
1,
1,
0,
0,
0,
0,
{0},
{0},
{0}
};
if( BLAKE_NAMESPACE(blake2b_init_param) ( S, &P ) < 0 )
return 0;
{
uint8_t block[BLAKE2B_BLOCKBYTES];
memset( block, 0, BLAKE2B_BLOCKBYTES );
memcpy( block, key, keylen );
BLAKE_NAMESPACE(blake2b_update) ( S, block, BLAKE2B_BLOCKBYTES );
secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
}
return 0;
}
static inline int BLAKE_NAMESPACE(blake2b_compress) ( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
__m128i row1l, row1h;
__m128i row2l, row2h;
__m128i row3l, row3h;
__m128i row4l, row4h;
__m128i b0, b1;
__m128i t0, t1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(HAVE_SSE41)
const __m128i m0 = LOADU( block + 00 );
const __m128i m1 = LOADU( block + 16 );
const __m128i m2 = LOADU( block + 32 );
const __m128i m3 = LOADU( block + 48 );
const __m128i m4 = LOADU( block + 64 );
const __m128i m5 = LOADU( block + 80 );
const __m128i m6 = LOADU( block + 96 );
const __m128i m7 = LOADU( block + 112 );
#else
const uint64_t m0 = ( ( uint64_t * )block )[ 0];
const uint64_t m1 = ( ( uint64_t * )block )[ 1];
const uint64_t m2 = ( ( uint64_t * )block )[ 2];
const uint64_t m3 = ( ( uint64_t * )block )[ 3];
const uint64_t m4 = ( ( uint64_t * )block )[ 4];
const uint64_t m5 = ( ( uint64_t * )block )[ 5];
const uint64_t m6 = ( ( uint64_t * )block )[ 6];
const uint64_t m7 = ( ( uint64_t * )block )[ 7];
const uint64_t m8 = ( ( uint64_t * )block )[ 8];
const uint64_t m9 = ( ( uint64_t * )block )[ 9];
const uint64_t m10 = ( ( uint64_t * )block )[10];
const uint64_t m11 = ( ( uint64_t * )block )[11];
const uint64_t m12 = ( ( uint64_t * )block )[12];
const uint64_t m13 = ( ( uint64_t * )block )[13];
const uint64_t m14 = ( ( uint64_t * )block )[14];
const uint64_t m15 = ( ( uint64_t * )block )[15];
#endif
row1l = LOAD( &S->h[0] );
row1h = LOAD( &S->h[2] );
row2l = LOAD( &S->h[4] );
row2h = LOAD( &S->h[6] );
row3l = LOAD( &blake2b_IV[0] );
row3h = LOAD( &blake2b_IV[2] );
row4l = _mm_xor_si128( LOAD( &blake2b_IV[4] ), LOAD( &S->t[0] ) );
row4h = _mm_xor_si128( LOAD( &blake2b_IV[6] ), LOAD( &S->f[0] ) );
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
ROUND( 10 );
ROUND( 11 );
row1l = _mm_xor_si128( row3l, row1l );
row1h = _mm_xor_si128( row3h, row1h );
STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) );
STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) );
row2l = _mm_xor_si128( row4l, row2l );
row2h = _mm_xor_si128( row4h, row2h );
STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) );
STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) );
return 0;
}
int BLAKE_NAMESPACE(blake2b_update) ( blake2b_state *S, const uint8_t *in, uint64_t inlen )
{
while( inlen > 0 )
{
size_t left = S->buflen;
size_t fill = 2 * BLAKE2B_BLOCKBYTES - left;
if( inlen > fill )
{
memcpy( S->buf + left, in, fill ); // Fill buffer
S->buflen += fill;
blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
BLAKE_NAMESPACE(blake2b_compress) ( S, S->buf ); // Compress
memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left
S->buflen -= BLAKE2B_BLOCKBYTES;
in += fill;
inlen -= fill;
}
else // inlen <= fill
{
memcpy( S->buf + left, in, inlen );
S->buflen += inlen; // Be lazy, do not compress
in += inlen;
inlen -= inlen;
}
}
return 0;
}
int BLAKE_NAMESPACE(blake2b_final) ( blake2b_state *S, uint8_t *out, uint8_t outlen )
{
if( S->buflen > BLAKE2B_BLOCKBYTES )
{
blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
BLAKE_NAMESPACE(blake2b_compress) ( S, S->buf );
S->buflen -= BLAKE2B_BLOCKBYTES;
memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
}
blake2b_increment_counter( S, S->buflen );
blake2b_set_lastblock( S );
memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
BLAKE_NAMESPACE(blake2b_compress) ( S, S->buf );
memcpy( out, &S->h[0], outlen );
return 0;
}
int BLAKE_NAMESPACE(blake2b) ( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
{
blake2b_state S[1];
/* Verify parameters */
if ( NULL == in ) return -1;
if ( NULL == out ) return -1;
if( NULL == key ) keylen = 0;
if( keylen )
{
if( BLAKE_NAMESPACE(blake2b_init_key) ( S, outlen, key, keylen ) < 0 ) return -1;
}
else
{
if( BLAKE_NAMESPACE(blake2b_init) ( S, outlen ) < 0 ) return -1;
}
BLAKE_NAMESPACE(blake2b_update) ( S, ( uint8_t * )in, inlen );
BLAKE_NAMESPACE(blake2b_final) ( S, out, outlen );
return 0;
}
#if defined(SUPERCOP)
int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
{
return BLAKE_NAMESPACE(blake2b) ( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
}
#endif
#if defined(BLAKE2B_SELFTEST)
#include <string.h>
#include "blake2-kat.h"
int main( int argc, char **argv )
{
uint8_t key[BLAKE2B_KEYBYTES];
uint8_t buf[KAT_LENGTH];
for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i )
key[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
buf[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
{
uint8_t hash[BLAKE2B_OUTBYTES];
BLAKE_NAMESPACE(blake2b) ( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
{
puts( "error" );
return -1;
}
}
puts( "ok" );
return 0;
}
#endif

View file

@ -0,0 +1,4 @@
#define HAVE_AVX
#define BLAKE_NAMESPACE(x) x##_avx
#include "blake2b.c"

View file

@ -0,0 +1,4 @@
#define HAVE_SSE2
#define BLAKE_NAMESPACE(x) x##_sse2
#include "blake2b.c"

View file

@ -0,0 +1,4 @@
#define HAVE_SSE41
#define BLAKE_NAMESPACE(x) x##_sse41
#include "blake2b.c"

View file

@ -0,0 +1,4 @@
#define HAVE_SSSE3
#define BLAKE_NAMESPACE(x) x##_ssse3
#include "blake2b.c"

301
crypto/blake2/blake2bp.c Normal file
View file

@ -0,0 +1,301 @@
/*
BLAKE2 reference source code package - optimized C implementations
Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along with
this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#if defined(_OPENMP)
#include <omp.h>
#endif
#include "blake2.h"
#include "blake2-impl.h"
#define PARALLELISM_DEGREE 4
#ifndef BLAKE_NAMESPACE
#define BLAKE_NAMESPACE(x) x
#endif
static inline int blake2bp_init_leaf( blake2b_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
{
blake2b_param P[1];
P->digest_length = outlen;
P->key_length = keylen;
P->fanout = PARALLELISM_DEGREE;
P->depth = 2;
P->leaf_length = 0;
P->node_offset = offset;
P->node_depth = 0;
P->inner_length = outlen;
memset( P->reserved, 0, sizeof( P->reserved ) );
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
return BLAKE_NAMESPACE(blake2b_init_param) ( S, P );
}
static inline int blake2bp_init_root( blake2b_state *S, uint8_t outlen, uint8_t keylen )
{
blake2b_param P[1];
P->digest_length = outlen;
P->key_length = keylen;
P->fanout = PARALLELISM_DEGREE;
P->depth = 2;
P->leaf_length = 0;
P->node_offset = 0;
P->node_depth = 1;
P->inner_length = outlen;
memset( P->reserved, 0, sizeof( P->reserved ) );
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
return BLAKE_NAMESPACE(blake2b_init_param) ( S, P );
}
int BLAKE_NAMESPACE(blake2bp_init) ( blake2bp_state *S, const uint8_t outlen )
{
if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
memset( S->buf, 0, sizeof( S->buf ) );
S->buflen = 0;
if( blake2bp_init_root( S->R, outlen, 0 ) < 0 )
return -1;
for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
if( blake2bp_init_leaf( S->S[i], outlen, 0, i ) < 0 ) return -1;
S->R->last_node = 1;
S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
return 0;
}
int BLAKE_NAMESPACE(blake2bp_init_key) ( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
{
if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
if( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1;
memset( S->buf, 0, sizeof( S->buf ) );
S->buflen = 0;
if( blake2bp_init_root( S->R, outlen, keylen ) < 0 )
return -1;
for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
if( blake2bp_init_leaf( S->S[i], outlen, keylen, i ) < 0 ) return -1;
S->R->last_node = 1;
S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
{
uint8_t block[BLAKE2B_BLOCKBYTES];
memset( block, 0, BLAKE2B_BLOCKBYTES );
memcpy( block, key, keylen );
for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
BLAKE_NAMESPACE(blake2b_update) ( S->S[i], block, BLAKE2B_BLOCKBYTES );
secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
}
return 0;
}
int BLAKE_NAMESPACE(blake2bp_update) ( blake2bp_state *S, const uint8_t *in, uint64_t inlen )
{
size_t left = S->buflen;
size_t fill = sizeof( S->buf ) - left;
if( left && inlen >= fill )
{
memcpy( S->buf + left, in, fill );
for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
BLAKE_NAMESPACE(blake2b_update) ( S->S[i], S->buf + i * BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES );
in += fill;
inlen -= fill;
left = 0;
}
#if defined(_OPENMP)
#pragma omp parallel shared(S), num_threads(PARALLELISM_DEGREE)
#else
for( size_t id__ = 0; id__ < PARALLELISM_DEGREE; ++id__ )
#endif
{
#if defined(_OPENMP)
size_t id__ = omp_get_thread_num();
#endif
uint64_t inlen__ = inlen;
const uint8_t *in__ = ( const uint8_t * )in;
in__ += id__ * BLAKE2B_BLOCKBYTES;
while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES )
{
BLAKE_NAMESPACE(blake2b_update) ( S->S[id__], in__, BLAKE2B_BLOCKBYTES );
in__ += PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
inlen__ -= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
}
}
in += inlen - inlen % ( PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES );
inlen %= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
if( inlen > 0 )
memcpy( S->buf + left, in, inlen );
S->buflen = left + inlen;
return 0;
}
int BLAKE_NAMESPACE(blake2bp_final) ( blake2bp_state *S, uint8_t *out, const uint8_t outlen )
{
uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES];
for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
{
if( S->buflen > i * BLAKE2B_BLOCKBYTES )
{
size_t left = S->buflen - i * BLAKE2B_BLOCKBYTES;
if( left > BLAKE2B_BLOCKBYTES ) left = BLAKE2B_BLOCKBYTES;
BLAKE_NAMESPACE(blake2b_update) ( S->S[i], S->buf + i * BLAKE2B_BLOCKBYTES, left );
}
BLAKE_NAMESPACE(blake2b_final) ( S->S[i], hash[i], BLAKE2B_OUTBYTES );
}
for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
BLAKE_NAMESPACE(blake2b_update) ( S->R, hash[i], BLAKE2B_OUTBYTES );
BLAKE_NAMESPACE(blake2b_final) ( S->R, out, outlen );
return 0;
}
int BLAKE_NAMESPACE(blake2bp) ( uint8_t *out, const void *in, const void *key, uint8_t outlen, uint64_t inlen, uint8_t keylen )
{
uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES];
blake2b_state S[PARALLELISM_DEGREE][1];
blake2b_state FS[1];
/* Verify parameters */
if ( NULL == in ) return -1;
if ( NULL == out ) return -1;
if ( NULL == key ) keylen = 0;
for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
if( blake2bp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;
S[PARALLELISM_DEGREE - 1]->last_node = 1; // mark last node
if( keylen > 0 )
{
uint8_t block[BLAKE2B_BLOCKBYTES];
memset( block, 0, BLAKE2B_BLOCKBYTES );
memcpy( block, key, keylen );
for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
BLAKE_NAMESPACE(blake2b_update) ( S[i], block, BLAKE2B_BLOCKBYTES );
secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
}
#if defined(_OPENMP)
#pragma omp parallel shared(S,hash), num_threads(PARALLELISM_DEGREE)
#else
for( size_t id__ = 0; id__ < PARALLELISM_DEGREE; ++id__ )
#endif
{
#if defined(_OPENMP)
size_t id__ = omp_get_thread_num();
#endif
uint64_t inlen__ = inlen;
const uint8_t *in__ = ( const uint8_t * )in;
in__ += id__ * BLAKE2B_BLOCKBYTES;
while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES )
{
BLAKE_NAMESPACE(blake2b_update) ( S[id__], in__, BLAKE2B_BLOCKBYTES );
in__ += PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
inlen__ -= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
}
if( inlen__ > id__ * BLAKE2B_BLOCKBYTES )
{
const size_t left = inlen__ - id__ * BLAKE2B_BLOCKBYTES;
const size_t len = left <= BLAKE2B_BLOCKBYTES ? left : BLAKE2B_BLOCKBYTES;
BLAKE_NAMESPACE(blake2b_update) ( S[id__], in__, len );
}
BLAKE_NAMESPACE(blake2b_final) ( S[id__], hash[id__], BLAKE2B_OUTBYTES );
}
if( blake2bp_init_root( FS, outlen, keylen ) < 0 )
return -1;
FS->last_node = 1; // Mark as last node
for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
BLAKE_NAMESPACE(blake2b_update) ( FS, hash[i], BLAKE2B_OUTBYTES );
BLAKE_NAMESPACE(blake2b_final) ( FS, out, outlen );
return 0;
}
#if defined(BLAKE2BP_SELFTEST)
#include <string.h>
#include "blake2-kat.h"
int main( int argc, char **argv )
{
uint8_t key[BLAKE2B_KEYBYTES];
uint8_t buf[KAT_LENGTH];
for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i )
key[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
buf[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
{
uint8_t hash[BLAKE2B_OUTBYTES];
//blake2bp( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
blake2bp_state S[1];
BLAKE_NAMESPACE(blake2bp_init_key) ( S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES );
BLAKE_NAMESPACE(blake2bp_update) ( S, buf, i );
BLAKE_NAMESPACE(blake2bp_final) ( S, hash, BLAKE2B_OUTBYTES );
if( 0 != memcmp( hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES ) )
{
puts( "error" );
return -1;
}
}
puts( "ok" );
return 0;
}
#endif

View file

@ -0,0 +1,4 @@
#define HAVE_AVX
#define BLAKE_NAMESPACE(x) x##_avx
#include "blake2bp.c"

View file

@ -0,0 +1,4 @@
#define HAVE_SSE2
#define BLAKE_NAMESPACE(x) x##_sse2
#include "blake2bp.c"

View file

@ -0,0 +1,4 @@
#define HAVE_SSE41
#define BLAKE_NAMESPACE(x) x##_sse41
#include "blake2bp.c"

View file

@ -0,0 +1,4 @@
#define HAVE_SSSE3
#define BLAKE_NAMESPACE(x) x##_ssse3
#include "blake2bp.c"

View file

@ -36,8 +36,8 @@
#include <openssl/rand.h> #include <openssl/rand.h>
#include <openssl/evp.h> #include <openssl/evp.h>
#include <openssl/hmac.h> #include <openssl/hmac.h>
//#include <sha256.h>
#include <sha512.h> #include <sha512.h>
#include <blake2_digest.h>
#include <crypto_aes.h> #include <crypto_aes.h>
#include <KeccakNISTInterface.h> #include <KeccakNISTInterface.h>
#include <utils.h> #include <utils.h>
@ -48,7 +48,10 @@
#define PROVIDER_X64_OPT 1 #define PROVIDER_X64_OPT 1
static void init_sha512(void); static void init_sha512(void);
static void init_blake2(void);
static int geturandom_bytes(uchar_t rbytes[32]); static int geturandom_bytes(uchar_t rbytes[32]);
static struct blake2_dispatch bdsp;
/* /*
* Checksum properties * Checksum properties
*/ */
@ -59,25 +62,26 @@ static struct {
cksum_t cksum_id; cksum_t cksum_id;
int bytes, mac_bytes; int bytes, mac_bytes;
ckinit_func_ptr init_func; ckinit_func_ptr init_func;
int compatible;
} cksum_props[] = { } cksum_props[] = {
{"CRC64", "Fast 64-bit CRC from LZMA SDK.", {"CRC64", "Extremely Fast 64-bit CRC from LZMA SDK.",
CKSUM_CRC64, 8, 32, NULL}, CKSUM_CRC64, 8, 32, NULL, 0},
{"SKEIN256", "256-bit SKEIN a NIST SHA3 runners-up (90% faster than Keccak).", {"SKEIN256", "256-bit SKEIN a NIST SHA3 runners-up (90% faster than Keccak).",
CKSUM_SKEIN256, 32, 32, NULL}, CKSUM_SKEIN256, 32, 32, NULL, 1},
{"SKEIN512", "512-bit SKEIN", {"SKEIN512", "512-bit SKEIN",
CKSUM_SKEIN512, 64, 64, NULL}, CKSUM_SKEIN512, 64, 64, NULL, 1},
{"SHA256", "Intel's optimized (SSE,AVX) 256-bit SHA2 implementation for x86.", {"SHA256", "SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86.",
CKSUM_SHA256, 32, 32, init_sha512}, CKSUM_SHA256, 32, 32, init_sha512, 0},
{"SHA512", "512-bit SHA2 from OpenSSL's crypto library.", {"SHA512", "SHA512 version of Intel's optimized (SSE,AVX) SHA2 for x86.",
CKSUM_SHA512, 64, 64, init_sha512}, CKSUM_SHA512, 64, 64, init_sha512, 0},
{"KECCAK256", "Official 256-bit NIST SHA3 optimized implementation.", {"KECCAK256", "Official 256-bit NIST SHA3 optimized implementation.",
CKSUM_KECCAK256, 32, 32, NULL}, CKSUM_KECCAK256, 32, 32, NULL, 0},
{"KECCAK512", "Official 512-bit NIST SHA3 optimized implementation.", {"KECCAK512", "Official 512-bit NIST SHA3 optimized implementation.",
CKSUM_KECCAK512, 64, 64, NULL}, CKSUM_KECCAK512, 64, 64, NULL, 0},
{"BLAKE256", "Very fast 256-bit BLAKE2, derived from the NIST SHA3 runner-up BLAKE.", {"BLAKE256", "Very fast 256-bit BLAKE2, derived from the NIST SHA3 runner-up BLAKE.",
CKSUM_BLAKE256, 32, 32, NULL}, CKSUM_BLAKE256, 32, 32, init_blake2, 0},
{"BLAKE512", "Very fast 256-bit BLAKE2, derived from the NIST SHA3 runner-up BLAKE.", {"BLAKE512", "Very fast 256-bit BLAKE2, derived from the NIST SHA3 runner-up BLAKE.",
CKSUM_BLAKE512, 64, 64, NULL} CKSUM_BLAKE512, 64, 64, init_blake2, 0}
}; };
static int cksum_provider = PROVIDER_OPENSSL; static int cksum_provider = PROVIDER_OPENSSL;
@ -169,6 +173,14 @@ compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes)
uint64_t *ck = (uint64_t *)cksum_buf; uint64_t *ck = (uint64_t *)cksum_buf;
*ck = lzma_crc64(buf, bytes, 0); *ck = lzma_crc64(buf, bytes, 0);
} else if (cksum == CKSUM_BLAKE256) {
if (bdsp.blake2b(cksum_buf, buf, NULL, 32, bytes, 0) != 0)
return (-1);
} else if (cksum == CKSUM_BLAKE512) {
if (bdsp.blake2b(cksum_buf, buf, NULL, 64, bytes, 0) != 0)
return (-1);
} else if (cksum == CKSUM_SKEIN256) { } else if (cksum == CKSUM_SKEIN256) {
Skein_512_Ctxt_t ctx; Skein_512_Ctxt_t ctx;
@ -244,12 +256,19 @@ init_sha512(void)
#endif #endif
} }
static void
init_blake2(void)
{
blake2_module_init(&bdsp, &proc_info);
}
void void
list_checksums(FILE *strm, char *pad) list_checksums(FILE *strm, char *pad)
{ {
int i; int i;
for (i=0; i<(sizeof (cksum_props)/sizeof (cksum_props[0])); i++) { for (i=0; i<(sizeof (cksum_props)/sizeof (cksum_props[0])); i++) {
fprintf(strm, "%s%10s - %s\n", pad, cksum_props[i].name, cksum_props[i].desc); if (!cksum_props[i].compatible)
fprintf(strm, "%s%10s - %s\n", pad, cksum_props[i].name, cksum_props[i].desc);
} }
} }
@ -258,13 +277,16 @@ list_checksums(FILE *strm, char *pad)
* return it's properties. * return it's properties.
*/ */
int int
get_checksum_props(const char *name, int *cksum, int *cksum_bytes, int *mac_bytes) get_checksum_props(const char *name, int *cksum, int *cksum_bytes,
int *mac_bytes, int accept_comptible)
{ {
int i; int i;
for (i=0; i<(sizeof (cksum_props)/sizeof (cksum_props[0])); i++) { for (i=0; i<(sizeof (cksum_props)/sizeof (cksum_props[0])); i++) {
if ((name != NULL && strcmp(name, cksum_props[i].name) == 0) || if ((name != NULL && strcmp(name, cksum_props[i].name) == 0) ||
(*cksum != 0 && *cksum == cksum_props[i].cksum_id)) { (*cksum != 0 && *cksum == cksum_props[i].cksum_id)) {
if (!accept_comptible && cksum_props[i].compatible)
break;
*cksum = cksum_props[i].cksum_id; *cksum = cksum_props[i].cksum_id;
*cksum_bytes = cksum_props[i].bytes; *cksum_bytes = cksum_props[i].bytes;
*mac_bytes = cksum_props[i].mac_bytes; *mac_bytes = cksum_props[i].mac_bytes;
@ -316,7 +338,35 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx)
aes_ctx_t *actx = (aes_ctx_t *)(cctx->crypto_ctx); aes_ctx_t *actx = (aes_ctx_t *)(cctx->crypto_ctx);
mctx->mac_cksum = cksum; mctx->mac_cksum = cksum;
if (cksum == CKSUM_SKEIN256) { if (cksum == CKSUM_BLAKE256) {
blake2b_state *ctx = (blake2b_state *)malloc(sizeof (blake2b_state));
if (!ctx) return (-1);
if (bdsp.blake2b_init_key(ctx, 32, actx->pkey, KEYLEN) != 0)
return (-1);
mctx->mac_ctx = ctx;
ctx = (blake2b_state *)malloc(sizeof (blake2b_state));
if (!ctx) {
free(mctx->mac_ctx);
return (-1);
}
memcpy(ctx, mctx->mac_ctx, sizeof (blake2b_state));
mctx->mac_ctx_reinit = ctx;
} else if (cksum == CKSUM_BLAKE512) {
blake2b_state *ctx = (blake2b_state *)malloc(sizeof (blake2b_state));
if (!ctx) return (-1);
if (bdsp.blake2b_init_key(ctx, 64, actx->pkey, KEYLEN) != 0)
return (-1);
mctx->mac_ctx = ctx;
ctx = (blake2b_state *)malloc(sizeof (blake2b_state));
if (!ctx) {
free(mctx->mac_ctx);
return (-1);
}
memcpy(ctx, mctx->mac_ctx, sizeof (blake2b_state));
mctx->mac_ctx_reinit = ctx;
} else if (cksum == CKSUM_SKEIN256) {
Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)malloc(sizeof (Skein_512_Ctxt_t)); Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)malloc(sizeof (Skein_512_Ctxt_t));
if (!ctx) return (-1); if (!ctx) return (-1);
Skein_512_InitExt(ctx, 256, SKEIN_CFG_TREE_INFO_SEQUENTIAL, Skein_512_InitExt(ctx, 256, SKEIN_CFG_TREE_INFO_SEQUENTIAL,
@ -364,19 +414,6 @@ hmac_init(mac_ctx_t *mctx, int cksum, crypto_ctx_t *cctx)
} }
mctx->mac_ctx_reinit = ctx; mctx->mac_ctx_reinit = ctx;
} else { } else {
/* HMAC_SHA256_Context *ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context));
if (!ctx) return (-1);
opt_HMAC_SHA256_Init(ctx, actx->pkey, KEYLEN);
mctx->mac_ctx = ctx;
ctx = (HMAC_SHA256_Context *)malloc(sizeof (HMAC_SHA256_Context));
if (!ctx) {
free(mctx->mac_ctx);
return (-1);
}
memcpy(ctx, mctx->mac_ctx, sizeof (HMAC_SHA256_Context));
mctx->mac_ctx_reinit = ctx;*/
HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context)); HMAC_SHA512_Context *ctx = (HMAC_SHA512_Context *)malloc(sizeof (HMAC_SHA512_Context));
if (!ctx) return (-1); if (!ctx) return (-1);
opt_HMAC_SHA512t256_Init(ctx, actx->pkey, KEYLEN); opt_HMAC_SHA512t256_Init(ctx, actx->pkey, KEYLEN);
@ -457,7 +494,10 @@ hmac_reinit(mac_ctx_t *mctx)
{ {
int cksum = mctx->mac_cksum; int cksum = mctx->mac_cksum;
if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) { if (cksum == CKSUM_BLAKE256 || cksum == CKSUM_BLAKE512) {
memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (blake2b_state));
} else if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) {
memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (Skein_512_Ctxt_t)); memcpy(mctx->mac_ctx, mctx->mac_ctx_reinit, sizeof (Skein_512_Ctxt_t));
} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) { } else if (cksum == CKSUM_SHA256 || cksum == CKSUM_SHA512 || cksum == CKSUM_CRC64) {
@ -480,7 +520,10 @@ hmac_update(mac_ctx_t *mctx, uchar_t *data, uint64_t len)
{ {
int cksum = mctx->mac_cksum; int cksum = mctx->mac_cksum;
if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) { if (cksum == CKSUM_BLAKE256 || cksum == CKSUM_BLAKE512) {
bdsp.blake2b_update((blake2b_state *)(mctx->mac_ctx), (uint8_t *)data, len);
} else if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) {
Skein_512_Update((Skein_512_Ctxt_t *)(mctx->mac_ctx), data, len); Skein_512_Update((Skein_512_Ctxt_t *)(mctx->mac_ctx), data, len);
} else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) { } else if (cksum == CKSUM_SHA256 || cksum == CKSUM_CRC64) {
@ -529,7 +572,15 @@ hmac_final(mac_ctx_t *mctx, uchar_t *hash, unsigned int *len)
{ {
int cksum = mctx->mac_cksum; int cksum = mctx->mac_cksum;
if (cksum == CKSUM_SKEIN256) { if (cksum == CKSUM_BLAKE256) {
bdsp.blake2b_final((blake2b_state *)(mctx->mac_ctx), hash, 32);
*len = 32;
} else if (cksum == CKSUM_BLAKE512) {
bdsp.blake2b_final((blake2b_state *)(mctx->mac_ctx), hash, 64);
*len = 64;
} else if (cksum == CKSUM_SKEIN256) {
Skein_512_Final((Skein_512_Ctxt_t *)(mctx->mac_ctx), hash); Skein_512_Final((Skein_512_Ctxt_t *)(mctx->mac_ctx), hash);
*len = 32; *len = 32;
@ -569,7 +620,11 @@ hmac_cleanup(mac_ctx_t *mctx)
{ {
int cksum = mctx->mac_cksum; int cksum = mctx->mac_cksum;
if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) { if (cksum == CKSUM_BLAKE256 || cksum == CKSUM_BLAKE512) {
memset(mctx->mac_ctx, 0, sizeof (blake2b_state));
memset(mctx->mac_ctx_reinit, 0, sizeof (blake2b_state));
} else if (cksum == CKSUM_SKEIN256 || cksum == CKSUM_SKEIN512) {
memset(mctx->mac_ctx, 0, sizeof (Skein_512_Ctxt_t)); memset(mctx->mac_ctx, 0, sizeof (Skein_512_Ctxt_t));
memset(mctx->mac_ctx_reinit, 0, sizeof (Skein_512_Ctxt_t)); memset(mctx->mac_ctx_reinit, 0, sizeof (Skein_512_Ctxt_t));

View file

@ -49,14 +49,19 @@ extern "C" {
*/ */
typedef enum { typedef enum {
CKSUM_CRC64 = 0x100, CKSUM_CRC64 = 0x100,
CKSUM_SKEIN256 = 0x200, CKSUM_BLAKE256 = 0x200,
CKSUM_SKEIN512 = 0x300, CKSUM_BLAKE512 = 0x300,
CKSUM_SHA256 = 0x400, CKSUM_SHA256 = 0x400,
CKSUM_SHA512 = 0x500, CKSUM_SHA512 = 0x500,
CKSUM_KECCAK256 = 0x600, CKSUM_KECCAK256 = 0x600,
CKSUM_KECCAK512 = 0x700, CKSUM_KECCAK512 = 0x700,
CKSUM_BLAKE256 = 0x800, /*
CKSUM_BLAKE512 = 0x900 * Backwards compatibility options. SKEIN in release 1.2 was replaced with
* Blake2 from 1.3 onwards (for sheer speed of Blake2). We want to be able
* to decode archives created with 1.2. New archives do not use SKEIN.
*/
CKSUM_SKEIN256 = 0x800,
CKSUM_SKEIN512 = 0x900
} cksum_t; } cksum_t;
typedef struct { typedef struct {
@ -78,7 +83,8 @@ typedef struct {
*/ */
int compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes); int compute_checksum(uchar_t *cksum_buf, int cksum, uchar_t *buf, uint64_t bytes);
void list_checksums(FILE *strm, char *pad); void list_checksums(FILE *strm, char *pad);
int get_checksum_props(const char *name, int *cksum, int *cksum_bytes, int *mac_bytes); int get_checksum_props(const char *name, int *cksum, int *cksum_bytes,
int *mac_bytes, int accept_compatible);
void serialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes); void serialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes);
void deserialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes); void deserialize_checksum(uchar_t *checksum, uchar_t *buf, int cksum_bytes);

View file

@ -125,25 +125,25 @@ APS_NAMESPACE(Init_SHA512) (processor_info_t *pc)
static void static void
_init (SHA512_Context *sc, const uint64_t iv[SHA512_HASH_WORDS]) _init (SHA512_Context *sc, const uint64_t iv[SHA512_HASH_WORDS])
{ {
int i; int i;
sc->totalLength[0] = 0LL; sc->totalLength[0] = 0LL;
sc->totalLength[1] = 0LL; sc->totalLength[1] = 0LL;
for (i = 0; i < SHA512_HASH_WORDS; i++) for (i = 0; i < SHA512_HASH_WORDS; i++)
sc->hash[i] = iv[i]; sc->hash[i] = iv[i];
sc->bufferLength = 0L; sc->bufferLength = 0L;
} }
void void
APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc) APS_NAMESPACE(SHA512_Init) (SHA512_Context *sc)
{ {
_init (sc, iv512); _init (sc, iv512);
} }
void void
APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc) APS_NAMESPACE(SHA512t256_Init) (SHA512_Context *sc)
{ {
_init (sc, iv256); _init (sc, iv256);
} }
void void
@ -207,50 +207,50 @@ APS_NAMESPACE(SHA512_Update) (SHA512_Context *sc, const void *vdata, size_t len)
void void
APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len) APS_NAMESPACE(SHA512t256_Update) (SHA512_Context *sc, const void *data, size_t len)
{ {
APS_NAMESPACE(SHA512_Update) (sc, data, len); APS_NAMESPACE(SHA512_Update) (sc, data, len);
} }
static void static void
_final (SHA512_Context *sc, uint8_t *hash, int hashWords, int halfWord) _final (SHA512_Context *sc, uint8_t *hash, int hashWords, int halfWord)
{ {
uint32_t bytesToPad; uint32_t bytesToPad;
uint64_t lengthPad[2]; uint64_t lengthPad[2];
int i; int i;
bytesToPad = 240L - sc->bufferLength; bytesToPad = 240L - sc->bufferLength;
if (bytesToPad > 128L) if (bytesToPad > 128L)
bytesToPad -= 128L; bytesToPad -= 128L;
lengthPad[0] = BYTESWAP64(sc->totalLength[0]); lengthPad[0] = BYTESWAP64(sc->totalLength[0]);
lengthPad[1] = BYTESWAP64(sc->totalLength[1]); lengthPad[1] = BYTESWAP64(sc->totalLength[1]);
APS_NAMESPACE(SHA512_Update) (sc, padding, bytesToPad); APS_NAMESPACE(SHA512_Update) (sc, padding, bytesToPad);
APS_NAMESPACE(SHA512_Update) (sc, lengthPad, 16L); APS_NAMESPACE(SHA512_Update) (sc, lengthPad, 16L);
if (hash) { if (hash) {
for (i = 0; i < hashWords; i++) { for (i = 0; i < hashWords; i++) {
*((uint64_t *) hash) = BYTESWAP64(sc->hash[i]); *((uint64_t *) hash) = BYTESWAP64(sc->hash[i]);
hash += 8; hash += 8;
} }
if (halfWord) { if (halfWord) {
hash[0] = (uint8_t) (sc->hash[i] >> 56); hash[0] = (uint8_t) (sc->hash[i] >> 56);
hash[1] = (uint8_t) (sc->hash[i] >> 48); hash[1] = (uint8_t) (sc->hash[i] >> 48);
hash[2] = (uint8_t) (sc->hash[i] >> 40); hash[2] = (uint8_t) (sc->hash[i] >> 40);
hash[3] = (uint8_t) (sc->hash[i] >> 32); hash[3] = (uint8_t) (sc->hash[i] >> 32);
} }
} }
} }
void void
APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE]) APS_NAMESPACE(SHA512_Final) (SHA512_Context *sc, uint8_t hash[SHA512_HASH_SIZE])
{ {
_final (sc, hash, SHA512_HASH_WORDS, 0); _final (sc, hash, SHA512_HASH_WORDS, 0);
} }
void void
APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE]) APS_NAMESPACE(SHA512t256_Final) (SHA512_Context *sc, uint8_t hash[SHA512t256_HASH_SIZE])
{ {
_final (sc, hash, SHA512t256_HASH_WORDS, 0); _final (sc, hash, SHA512t256_HASH_WORDS, 0);
} }
#define HASH_CONTEXT SHA512_Context #define HASH_CONTEXT SHA512_Context

18
main.c
View file

@ -745,11 +745,23 @@ start_decompress(const char *filename, const char *to_filename)
} }
cksum = flags & CKSUM_MASK; cksum = flags & CKSUM_MASK;
if (get_checksum_props(NULL, &cksum, &cksum_bytes, &mac_bytes) == -1) {
/*
* Backward compatibility check for SKEIN in archives version 5 or below.
* In newer versions BLAKE uses same IDs as SKEIN.
*/
if (version <= 5) {
if (cksum == CKSUM_BLAKE256) cksum = CKSUM_SKEIN256;
if (cksum == CKSUM_BLAKE512) cksum = CKSUM_SKEIN512;
}
if (get_checksum_props(NULL, &cksum, &cksum_bytes, &mac_bytes, 1) == -1) {
fprintf(stderr, "Invalid checksum algorithm code: %d. File corrupt ?\n", cksum); fprintf(stderr, "Invalid checksum algorithm code: %d. File corrupt ?\n", cksum);
UNCOMP_BAIL; UNCOMP_BAIL;
} }
/*
* Archives older than 5 did not support MACs.
*/
if (version < 5) if (version < 5)
mac_bytes = 0; mac_bytes = 0;
@ -2250,7 +2262,7 @@ main(int argc, char *argv[])
break; break;
case 'S': case 'S':
if (get_checksum_props(optarg, &cksum, &cksum_bytes, &mac_bytes) == -1) { if (get_checksum_props(optarg, &cksum, &cksum_bytes, &mac_bytes, 0) == -1) {
err_exit(0, "Invalid checksum type %s", optarg); err_exit(0, "Invalid checksum type %s", optarg);
} }
break; break;
@ -2344,7 +2356,7 @@ main(int argc, char *argv[])
main_cancel = 0; main_cancel = 0;
if (cksum == 0) if (cksum == 0)
get_checksum_props(DEFAULT_CKSUM, &cksum, &cksum_bytes, &mac_bytes); get_checksum_props(DEFAULT_CKSUM, &cksum, &cksum_bytes, &mac_bytes, 0);
if (!encrypt_type) { if (!encrypt_type) {
/* /*

View file

@ -26,8 +26,6 @@
#ifndef __CPUID_H__ #ifndef __CPUID_H__
#define __CPUID_H__ #define __CPUID_H__
#include "utils.h"
#ifdef __x86_64__ #ifdef __x86_64__
#define VENDOR_STR_MAX 16 #define VENDOR_STR_MAX 16
#define BRAND_STR_MAX 64 #define BRAND_STR_MAX 64
@ -36,6 +34,21 @@
#define MAX_EXT_CPUID_LEVEL 32 #define MAX_EXT_CPUID_LEVEL 32
#define MAX_INTELFN4_LEVEL 4 #define MAX_INTELFN4_LEVEL 4
typedef enum {
PROC_BIGENDIAN_GENERIC = 1,
PROC_LITENDIAN_GENERIC,
PROC_X64_INTEL,
PROC_X64_AMD
} proc_type_t;
typedef struct {
int sse_level;
int sse_sub_level;
int avx_level;
int xop_avail;
proc_type_t proc_type;
} processor_info_t;
/** /**
* This contains only the most basic CPU data, required to do identification * This contains only the most basic CPU data, required to do identification
* and feature recognition. Every processor should be identifiable using this * and feature recognition. Every processor should be identifiable using this

View file

@ -35,6 +35,7 @@
#include <inttypes.h> #include <inttypes.h>
#include <stdint.h> #include <stdint.h>
#include <assert.h> #include <assert.h>
#include <cpuid.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -141,21 +142,6 @@ typedef enum {
DECOMPRESS_THREADS DECOMPRESS_THREADS
} algo_threads_type_t; } algo_threads_type_t;
typedef enum {
PROC_BIGENDIAN_GENERIC = 1,
PROC_LITENDIAN_GENERIC,
PROC_X64_INTEL,
PROC_X64_AMD
} proc_type_t;
typedef struct {
int sse_level;
int sse_sub_level;
int avx_level;
int xop_avail;
proc_type_t proc_type;
} processor_info_t;
#ifndef _IN_UTILS_ #ifndef _IN_UTILS_
extern processor_info_t proc_info; extern processor_info_t proc_info;
#endif #endif