529 lines
15 KiB
C
Executable file
529 lines
15 KiB
C
Executable file
/*
|
|
* This file is a part of Pcompress, a chunked parallel multi-
|
|
* algorithm lossless compression and decompression program.
|
|
*
|
|
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
|
|
* Use is subject to license terms.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 3 of the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this program.
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*
|
|
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
|
*
|
|
*/
|
|
|
|
/*
|
|
The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
|
|
Michaël Peeters and Gilles Van Assche. For more information, feedback or
|
|
questions, please refer to our website: http://keccak.noekeon.org/
|
|
|
|
Implementation by the designers,
|
|
hereby denoted as "the implementer".
|
|
|
|
To the extent possible under law, the implementer has waived all copyright
|
|
and related or neighboring rights to the source code in this file.
|
|
http://creativecommons.org/publicdomain/zero/1.0/
|
|
*/
|
|
|
|
#include <string.h>
|
|
#include "brg_endian.h"
|
|
#include "KeccakF-1600-opt64-settings.h"
|
|
#include "KeccakF-1600-interface.h"
|
|
|
|
typedef unsigned char UINT8;
|
|
typedef unsigned long long int UINT64;
|
|
|
|
#if defined(__GNUC__)
|
|
#define ALIGN __attribute__ ((aligned(32)))
|
|
#elif defined(_MSC_VER)
|
|
#define ALIGN __declspec(align(32))
|
|
#else
|
|
#define ALIGN
|
|
#endif
|
|
|
|
#if defined(UseSSE)
|
|
#include <x86intrin.h>
|
|
typedef __m128i V64;
|
|
typedef __m128i V128;
|
|
typedef union {
|
|
V128 v128;
|
|
UINT64 v64[2];
|
|
} V6464;
|
|
|
|
#define ANDnu64(a, b) _mm_andnot_si128(a, b)
|
|
#define LOAD64(a) _mm_loadl_epi64((const V64 *)&(a))
|
|
#define CONST64(a) _mm_loadl_epi64((const V64 *)&(a))
|
|
#define ROL64(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
|
|
#define STORE64(a, b) _mm_storel_epi64((V64 *)&(a), b)
|
|
#define XOR64(a, b) _mm_xor_si128(a, b)
|
|
#define XOReq64(a, b) a = _mm_xor_si128(a, b)
|
|
#define SHUFFLEBYTES128(a, b) _mm_shuffle_epi8(a, b)
|
|
|
|
#define ANDnu128(a, b) _mm_andnot_si128(a, b)
|
|
#define LOAD6464(a, b) _mm_set_epi64((__m64)(a), (__m64)(b))
|
|
#define CONST128(a) _mm_load_si128((const V128 *)&(a))
|
|
#define LOAD128(a) _mm_load_si128((const V128 *)&(a))
|
|
#define LOAD128u(a) _mm_loadu_si128((const V128 *)&(a))
|
|
#define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
|
|
#define STORE128(a, b) _mm_store_si128((V128 *)&(a), b)
|
|
#define XOR128(a, b) _mm_xor_si128(a, b)
|
|
#define XOReq128(a, b) a = _mm_xor_si128(a, b)
|
|
#define GET64LOLO(a, b) _mm_unpacklo_epi64(a, b)
|
|
#define GET64HIHI(a, b) _mm_unpackhi_epi64(a, b)
|
|
#define COPY64HI2LO(a) _mm_shuffle_epi32(a, 0xEE)
|
|
#define COPY64LO2HI(a) _mm_shuffle_epi32(a, 0x44)
|
|
#define ZERO128() _mm_setzero_si128()
|
|
|
|
#ifdef UseOnlySIMD64
|
|
#include "KeccakF-1600-simd64.macros"
|
|
#else
|
|
ALIGN const UINT64 rho8_56[2] = {0x0605040302010007, 0x080F0E0D0C0B0A09};
|
|
#include "KeccakF-1600-simd128.macros"
|
|
#endif
|
|
|
|
#ifdef UseBebigokimisa
|
|
#error "UseBebigokimisa cannot be used in combination with UseSSE"
|
|
#endif
|
|
#elif defined(UseXOP)
|
|
#include <x86intrin.h>
|
|
typedef __m128i V64;
|
|
typedef __m128i V128;
|
|
|
|
#define LOAD64(a) _mm_loadl_epi64((const V64 *)&(a))
|
|
#define CONST64(a) _mm_loadl_epi64((const V64 *)&(a))
|
|
#define STORE64(a, b) _mm_storel_epi64((V64 *)&(a), b)
|
|
#define XOR64(a, b) _mm_xor_si128(a, b)
|
|
#define XOReq64(a, b) a = _mm_xor_si128(a, b)
|
|
|
|
#define ANDnu128(a, b) _mm_andnot_si128(a, b)
|
|
#define LOAD6464(a, b) _mm_set_epi64((__m64)(a), (__m64)(b))
|
|
#define CONST128(a) _mm_load_si128((const V128 *)&(a))
|
|
#define LOAD128(a) _mm_load_si128((const V128 *)&(a))
|
|
#define LOAD128u(a) _mm_loadu_si128((const V128 *)&(a))
|
|
#define STORE128(a, b) _mm_store_si128((V128 *)&(a), b)
|
|
#define XOR128(a, b) _mm_xor_si128(a, b)
|
|
#define XOReq128(a, b) a = _mm_xor_si128(a, b)
|
|
#define ZERO128() _mm_setzero_si128()
|
|
|
|
#define SWAP64(a) _mm_shuffle_epi32(a, 0x4E)
|
|
#define GET64LOLO(a, b) _mm_unpacklo_epi64(a, b)
|
|
#define GET64HIHI(a, b) _mm_unpackhi_epi64(a, b)
|
|
#define GET64LOHI(a, b) ((__m128i)_mm_blend_pd((__m128d)a, (__m128d)b, 2))
|
|
#define GET64HILO(a, b) SWAP64(GET64LOHI(b, a))
|
|
#define COPY64HI2LO(a) _mm_shuffle_epi32(a, 0xEE)
|
|
#define COPY64LO2HI(a) _mm_shuffle_epi32(a, 0x44)
|
|
|
|
#define ROL6464same(a, o) _mm_roti_epi64(a, o)
|
|
#define ROL6464(a, r1, r2) _mm_rot_epi64(a, CONST128( rot_##r1##_##r2 ))
|
|
ALIGN const UINT64 rot_0_20[2] = { 0, 20};
|
|
ALIGN const UINT64 rot_44_3[2] = {44, 3};
|
|
ALIGN const UINT64 rot_43_45[2] = {43, 45};
|
|
ALIGN const UINT64 rot_21_61[2] = {21, 61};
|
|
ALIGN const UINT64 rot_14_28[2] = {14, 28};
|
|
ALIGN const UINT64 rot_1_36[2] = { 1, 36};
|
|
ALIGN const UINT64 rot_6_10[2] = { 6, 10};
|
|
ALIGN const UINT64 rot_25_15[2] = {25, 15};
|
|
ALIGN const UINT64 rot_8_56[2] = { 8, 56};
|
|
ALIGN const UINT64 rot_18_27[2] = {18, 27};
|
|
ALIGN const UINT64 rot_62_55[2] = {62, 55};
|
|
ALIGN const UINT64 rot_39_41[2] = {39, 41};
|
|
|
|
#if defined(UseSimulatedXOP)
|
|
// For debugging purposes, when XOP is not available
|
|
#undef ROL6464
|
|
#undef ROL6464same
|
|
#define ROL6464same(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
|
|
V128 ROL6464(V128 a, int r0, int r1)
|
|
{
|
|
V128 a0 = ROL64(a, r0);
|
|
V128 a1 = COPY64HI2LO(ROL64(a, r1));
|
|
return GET64LOLO(a0, a1);
|
|
}
|
|
#endif
|
|
|
|
#include "KeccakF-1600-xop.macros"
|
|
|
|
#ifdef UseBebigokimisa
|
|
#error "UseBebigokimisa cannot be used in combination with UseXOP"
|
|
#endif
|
|
#elif defined(UseMMX)
|
|
#include <mmintrin.h>
|
|
typedef __m64 V64;
|
|
#define ANDnu64(a, b) _mm_andnot_si64(a, b)
|
|
|
|
#if (defined(_MSC_VER) || defined (__INTEL_COMPILER))
|
|
#define LOAD64(a) *(V64*)&(a)
|
|
#define CONST64(a) *(V64*)&(a)
|
|
#define STORE64(a, b) *(V64*)&(a) = b
|
|
#else
|
|
#define LOAD64(a) (V64)a
|
|
#define CONST64(a) (V64)a
|
|
#define STORE64(a, b) a = (UINT64)b
|
|
#endif
|
|
#define ROL64(a, o) _mm_or_si64(_mm_slli_si64(a, o), _mm_srli_si64(a, 64-(o)))
|
|
#define XOR64(a, b) _mm_xor_si64(a, b)
|
|
#define XOReq64(a, b) a = _mm_xor_si64(a, b)
|
|
|
|
#include "KeccakF-1600-simd64.macros"
|
|
|
|
#ifdef UseBebigokimisa
|
|
#error "UseBebigokimisa cannot be used in combination with UseMMX"
|
|
#endif
|
|
#else
|
|
#if defined(_MSC_VER)
|
|
#define ROL64(a, offset) _rotl64(a, offset)
|
|
#elif defined(UseSHLD)
|
|
#define ROL64(x,N) ({ \
|
|
register UINT64 __out; \
|
|
register UINT64 __in = x; \
|
|
__asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \
|
|
__out; \
|
|
})
|
|
#else
|
|
#define ROL64(a, offset) ((((UINT64)a) << offset) ^ (((UINT64)a) >> (64-offset)))
|
|
#endif
|
|
|
|
#include "KeccakF-1600-64.macros"
|
|
#endif
|
|
|
|
#include "KeccakF-1600-unrolling.macros"
|
|
|
|
void KeccakPermutationOnWords(UINT64 *state)
|
|
{
|
|
declareABCDE
|
|
#if (Unrolling != 24)
|
|
unsigned int i;
|
|
#endif
|
|
|
|
copyFromState(A, state)
|
|
rounds
|
|
#if defined(UseMMX)
|
|
_mm_empty();
|
|
#endif
|
|
}
|
|
|
|
void KeccakPermutationOnWordsAfterXoring(UINT64 *state, const UINT64 *input, unsigned int laneCount)
|
|
{
|
|
declareABCDE
|
|
#if (Unrolling != 24)
|
|
unsigned int i;
|
|
#endif
|
|
unsigned int j;
|
|
|
|
for(j=0; j<laneCount; j++)
|
|
state[j] ^= input[j];
|
|
copyFromState(A, state)
|
|
rounds
|
|
#if defined(UseMMX)
|
|
_mm_empty();
|
|
#endif
|
|
}
|
|
|
|
#ifdef ProvideFast576
|
|
void KeccakPermutationOnWordsAfterXoring576bits(UINT64 *state, const UINT64 *input)
|
|
{
|
|
declareABCDE
|
|
#if (Unrolling != 24)
|
|
unsigned int i;
|
|
#endif
|
|
|
|
copyFromStateAndXor576bits(A, state, input)
|
|
rounds
|
|
#if defined(UseMMX)
|
|
_mm_empty();
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast832
|
|
void KeccakPermutationOnWordsAfterXoring832bits(UINT64 *state, const UINT64 *input)
|
|
{
|
|
declareABCDE
|
|
#if (Unrolling != 24)
|
|
unsigned int i;
|
|
#endif
|
|
|
|
copyFromStateAndXor832bits(A, state, input)
|
|
rounds
|
|
#if defined(UseMMX)
|
|
_mm_empty();
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast1024
|
|
void KeccakPermutationOnWordsAfterXoring1024bits(UINT64 *state, const UINT64 *input)
|
|
{
|
|
declareABCDE
|
|
#if (Unrolling != 24)
|
|
unsigned int i;
|
|
#endif
|
|
|
|
copyFromStateAndXor1024bits(A, state, input)
|
|
rounds
|
|
#if defined(UseMMX)
|
|
_mm_empty();
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast1088
|
|
void KeccakPermutationOnWordsAfterXoring1088bits(UINT64 *state, const UINT64 *input)
|
|
{
|
|
declareABCDE
|
|
#if (Unrolling != 24)
|
|
unsigned int i;
|
|
#endif
|
|
|
|
copyFromStateAndXor1088bits(A, state, input)
|
|
rounds
|
|
#if defined(UseMMX)
|
|
_mm_empty();
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast1152
|
|
void KeccakPermutationOnWordsAfterXoring1152bits(UINT64 *state, const UINT64 *input)
|
|
{
|
|
declareABCDE
|
|
#if (Unrolling != 24)
|
|
unsigned int i;
|
|
#endif
|
|
|
|
copyFromStateAndXor1152bits(A, state, input)
|
|
rounds
|
|
#if defined(UseMMX)
|
|
_mm_empty();
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast1344
|
|
void KeccakPermutationOnWordsAfterXoring1344bits(UINT64 *state, const UINT64 *input)
|
|
{
|
|
declareABCDE
|
|
#if (Unrolling != 24)
|
|
unsigned int i;
|
|
#endif
|
|
|
|
copyFromStateAndXor1344bits(A, state, input)
|
|
rounds
|
|
#if defined(UseMMX)
|
|
_mm_empty();
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
void KeccakInitialize()
|
|
{
|
|
}
|
|
|
|
void KeccakInitializeState(unsigned char *state)
|
|
{
|
|
memset(state, 0, 200);
|
|
#ifdef UseBebigokimisa
|
|
((UINT64*)state)[ 1] = ~(UINT64)0;
|
|
((UINT64*)state)[ 2] = ~(UINT64)0;
|
|
((UINT64*)state)[ 8] = ~(UINT64)0;
|
|
((UINT64*)state)[12] = ~(UINT64)0;
|
|
((UINT64*)state)[17] = ~(UINT64)0;
|
|
((UINT64*)state)[20] = ~(UINT64)0;
|
|
#endif
|
|
}
|
|
|
|
void KeccakPermutation(unsigned char *state)
|
|
{
|
|
// We assume the state is always stored as words
|
|
KeccakPermutationOnWords((UINT64*)state);
|
|
}
|
|
|
|
void fromBytesToWord(UINT64 *word, const UINT8 *bytes)
|
|
{
|
|
unsigned int i;
|
|
|
|
*word = 0;
|
|
for(i=0; i<(64/8); i++)
|
|
*word |= (UINT64)(bytes[i]) << (8*i);
|
|
}
|
|
|
|
#ifdef ProvideFast576
|
|
void KeccakAbsorb576bits(unsigned char *state, const unsigned char *data)
|
|
{
|
|
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
|
KeccakPermutationOnWordsAfterXoring576bits((UINT64*)state, (const UINT64*)data);
|
|
#else
|
|
UINT64 dataAsWords[9];
|
|
unsigned int i;
|
|
|
|
for(i=0; i<9; i++)
|
|
fromBytesToWord(dataAsWords+i, data+(i*8));
|
|
KeccakPermutationOnWordsAfterXoring576bits((UINT64*)state, dataAsWords);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast832
|
|
void KeccakAbsorb832bits(unsigned char *state, const unsigned char *data)
|
|
{
|
|
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
|
KeccakPermutationOnWordsAfterXoring832bits((UINT64*)state, (const UINT64*)data);
|
|
#else
|
|
UINT64 dataAsWords[13];
|
|
unsigned int i;
|
|
|
|
for(i=0; i<13; i++)
|
|
fromBytesToWord(dataAsWords+i, data+(i*8));
|
|
KeccakPermutationOnWordsAfterXoring832bits((UINT64*)state, dataAsWords);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast1024
|
|
void KeccakAbsorb1024bits(unsigned char *state, const unsigned char *data)
|
|
{
|
|
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
|
KeccakPermutationOnWordsAfterXoring1024bits((UINT64*)state, (const UINT64*)data);
|
|
#else
|
|
UINT64 dataAsWords[16];
|
|
unsigned int i;
|
|
|
|
for(i=0; i<16; i++)
|
|
fromBytesToWord(dataAsWords+i, data+(i*8));
|
|
KeccakPermutationOnWordsAfterXoring1024bits((UINT64*)state, dataAsWords);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast1088
|
|
void KeccakAbsorb1088bits(unsigned char *state, const unsigned char *data)
|
|
{
|
|
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
|
KeccakPermutationOnWordsAfterXoring1088bits((UINT64*)state, (const UINT64*)data);
|
|
#else
|
|
UINT64 dataAsWords[17];
|
|
unsigned int i;
|
|
|
|
for(i=0; i<17; i++)
|
|
fromBytesToWord(dataAsWords+i, data+(i*8));
|
|
KeccakPermutationOnWordsAfterXoring1088bits((UINT64*)state, dataAsWords);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast1152
|
|
void KeccakAbsorb1152bits(unsigned char *state, const unsigned char *data)
|
|
{
|
|
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
|
KeccakPermutationOnWordsAfterXoring1152bits((UINT64*)state, (const UINT64*)data);
|
|
#else
|
|
UINT64 dataAsWords[18];
|
|
unsigned int i;
|
|
|
|
for(i=0; i<18; i++)
|
|
fromBytesToWord(dataAsWords+i, data+(i*8));
|
|
KeccakPermutationOnWordsAfterXoring1152bits((UINT64*)state, dataAsWords);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef ProvideFast1344
|
|
void KeccakAbsorb1344bits(unsigned char *state, const unsigned char *data)
|
|
{
|
|
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
|
KeccakPermutationOnWordsAfterXoring1344bits((UINT64*)state, (const UINT64*)data);
|
|
#else
|
|
UINT64 dataAsWords[21];
|
|
unsigned int i;
|
|
|
|
for(i=0; i<21; i++)
|
|
fromBytesToWord(dataAsWords+i, data+(i*8));
|
|
KeccakPermutationOnWordsAfterXoring1344bits((UINT64*)state, dataAsWords);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
void KeccakAbsorb(unsigned char *state, const unsigned char *data, unsigned int laneCount)
|
|
{
|
|
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
|
KeccakPermutationOnWordsAfterXoring((UINT64*)state, (const UINT64*)data, laneCount);
|
|
#else
|
|
UINT64 dataAsWords[25];
|
|
unsigned int i;
|
|
|
|
for(i=0; i<laneCount; i++)
|
|
fromBytesToWord(dataAsWords+i, data+(i*8));
|
|
KeccakPermutationOnWordsAfterXoring((UINT64*)state, dataAsWords, laneCount);
|
|
#endif
|
|
}
|
|
|
|
void fromWordToBytes(UINT8 *bytes, const UINT64 word)
|
|
{
|
|
unsigned int i;
|
|
|
|
for(i=0; i<(64/8); i++)
|
|
bytes[i] = (word >> (8*i)) & 0xFF;
|
|
}
|
|
|
|
#ifdef ProvideFast1024
|
|
void KeccakExtract1024bits(const unsigned char *state, unsigned char *data)
|
|
{
|
|
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
|
memcpy(data, state, 128);
|
|
#else
|
|
unsigned int i;
|
|
|
|
for(i=0; i<16; i++)
|
|
fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]);
|
|
#endif
|
|
#ifdef UseBebigokimisa
|
|
((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1];
|
|
((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2];
|
|
((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8];
|
|
((UINT64*)data)[12] = ~((UINT64*)data)[12];
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
void KeccakExtract(const unsigned char *state, unsigned char *data, unsigned int laneCount)
|
|
{
|
|
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
|
memcpy(data, state, laneCount*8);
|
|
#else
|
|
unsigned int i;
|
|
|
|
for(i=0; i<laneCount; i++)
|
|
fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]);
|
|
#endif
|
|
#ifdef UseBebigokimisa
|
|
if (laneCount > 1) {
|
|
((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1];
|
|
if (laneCount > 2) {
|
|
((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2];
|
|
if (laneCount > 8) {
|
|
((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8];
|
|
if (laneCount > 12) {
|
|
((UINT64*)data)[12] = ~((UINT64*)data)[12];
|
|
if (laneCount > 17) {
|
|
((UINT64*)data)[17] = ~((UINT64*)data)[17];
|
|
if (laneCount > 20) {
|
|
((UINT64*)data)[20] = ~((UINT64*)data)[20];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
}
|