pcompress/crypto/blake2/blake2b-round.h

/*
 * This file is a part of Pcompress, a chunked parallel multi-
 * algorithm lossless compression and decompression program.
 *
 * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
 * Use is subject to license terms.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.
 * If not, see <http://www.gnu.org/licenses/>.
 *
 * moinakg@belenix.org, http://moinakg.wordpress.com/
 *      
 */

/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_ROUND_H__
#define __BLAKE2B_ROUND_H__

/*
 * Thin wrappers around the SSE2 128-bit load/store intrinsics.
 *
 *   LOAD / STORE    use the aligned intrinsics (_mm_load_si128 /
 *                   _mm_store_si128).
 *   LOADU / STOREU  use the unaligned intrinsics (_mm_loadu_si128 /
 *                   _mm_storeu_si128).
 *
 * The load macros cast to a pointer-to-const so that a const-qualified
 * source pointer does not silently lose its qualifier. Every macro argument
 * is parenthesized so that arbitrary expressions can be passed safely.
 */
#define LOAD(p)     _mm_load_si128((const __m128i *)(p))
#define STORE(p,r)  _mm_store_si128((__m128i *)(p), (r))

#define LOADU(p)    _mm_loadu_si128((const __m128i *)(p))
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), (r))

/* Bit-pattern preserving casts between integer and float vector types. */
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))

/*
 * Branch-prediction hint: evaluates to x and tells the compiler that x is
 * expected to equal 1. Compilers without __builtin_expect just get x.
 */
#if defined(__GNUC__) || defined(__clang__)
#define LIKELY(x) __builtin_expect((x),1)
#else
#define LIKELY(x) (x)
#endif


/* Microarchitecture-specific macros */

/*
 * _mm_roti_epi64(x, c) -- rotate each 64-bit lane of x.
 *
 * The callers in this file pass a negative count (-32, -24, -16, -63); the
 * fallbacks below rotate each lane RIGHT by -(c) bits.
 *
 * When HAVE_XOP is defined nothing is defined here; presumably the XOP
 * intrinsic of the same name is used instead (confirm against the build).
 *
 * SSSE3 variant: selects a cheaper instruction sequence per rotation amount.
 *   32 -> 32-bit word swap inside each 64-bit lane.
 *   24 -> byte shuffle using r24.
 *   16 -> byte shuffle using r16.
 *   63 -> (x >> 63) ^ (x + x), i.e. a rotate left by one.
 *   otherwise the generic shift/xor form.
 * r24 and r16 are NOT defined in this header: they must be in scope where
 * the macro is expanded (presumably byte-shuffle masks set up by the
 * caller -- confirm there).
 *
 * SSE2 variant: generic (x >> n) ^ (x << (64 - n)) with n = -(c).
 *
 * Both variants evaluate their arguments more than once, so arguments must
 * be free of side effects. The count and the whole expansion are fully
 * parenthesized so that an expression argument (e.g. -8-8) and use inside a
 * larger expression behave as expected.
 */
#ifndef HAVE_XOP
#ifdef HAVE_SSSE3
#define _mm_roti_epi64(x, c) \
    ((-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))))
#else
#define _mm_roti_epi64(r, c) \
    (_mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64-(-(c)))))
#endif
#else
/* ... */
#endif


/*
 * G1 -- first half of the BLAKE2b mixing step (rotations by 32 and 24),
 * applied to the low (…l) and high (…h) register of every row.
 *
 *   row1 += row2 + b        (b0 feeds the low half, b1 the high half)
 *   row4  = rot(row4 ^ row1, 32)
 *   row3 += row4
 *   row2  = rot(row2 ^ row3, 24)
 *
 * All row arguments are assigned to, so they must be lvalues. The expansion
 * is a plain statement sequence (not a single statement) that already ends
 * in a semicolon.
 */
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  /* row1 += row2 + message words */ \
  row1l = _mm_add_epi64(row2l, _mm_add_epi64(b0, row1l)); \
  row1h = _mm_add_epi64(row2h, _mm_add_epi64(b1, row1h)); \
  /* row4 = (row4 ^ row1) rotated by 32 */ \
  row4l = _mm_xor_si128(row1l, row4l); \
  row4h = _mm_xor_si128(row1h, row4h); \
  row4l = _mm_roti_epi64(row4l, -32); \
  row4h = _mm_roti_epi64(row4h, -32); \
  /* row3 += row4 */ \
  row3l = _mm_add_epi64(row4l, row3l); \
  row3h = _mm_add_epi64(row4h, row3h); \
  /* row2 = (row2 ^ row3) rotated by 24 */ \
  row2l = _mm_xor_si128(row3l, row2l); \
  row2h = _mm_xor_si128(row3h, row2h); \
  row2l = _mm_roti_epi64(row2l, -24); \
  row2h = _mm_roti_epi64(row2h, -24);

/*
 * G2 -- second half of the BLAKE2b mixing step (rotations by 16 and 63),
 * applied to the low (…l) and high (…h) register of every row.
 *
 *   row1 += row2 + b        (b0 feeds the low half, b1 the high half)
 *   row4  = rot(row4 ^ row1, 16)
 *   row3 += row4
 *   row2  = rot(row2 ^ row3, 63)
 *
 * All row arguments are assigned to, so they must be lvalues. The expansion
 * is a plain statement sequence (not a single statement) that already ends
 * in a semicolon.
 */
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  /* row1 += row2 + message words */ \
  row1l = _mm_add_epi64(row2l, _mm_add_epi64(b0, row1l)); \
  row1h = _mm_add_epi64(row2h, _mm_add_epi64(b1, row1h)); \
  /* row4 = (row4 ^ row1) rotated by 16 */ \
  row4l = _mm_xor_si128(row1l, row4l); \
  row4h = _mm_xor_si128(row1h, row4h); \
  row4l = _mm_roti_epi64(row4l, -16); \
  row4h = _mm_roti_epi64(row4h, -16); \
  /* row3 += row4 */ \
  row3l = _mm_add_epi64(row4l, row3l); \
  row3h = _mm_add_epi64(row4h, row3h); \
  /* row2 = (row2 ^ row3) rotated by 63 */ \
  row2l = _mm_xor_si128(row3l, row2l); \
  row2h = _mm_xor_si128(row3h, row2h); \
  row2l = _mm_roti_epi64(row2l, -63); \
  row2h = _mm_roti_epi64(row2h, -63);

/*
 * DIAGONALIZE / UNDIAGONALIZE
 *
 * Each row is held in two registers: rowNl carries words 0-1 and rowNh
 * words 2-3 (low 64-bit lane first). DIAGONALIZE permutes the words so that
 *   row1 is left untouched (row1l/row1h are accepted but never referenced),
 *   row2 word i becomes old row2 word (i+1) mod 4,
 *   row3 word i becomes old row3 word (i+2) mod 4 (the two halves swap),
 *   row4 word i becomes old row4 word (i+3) mod 4.
 * UNDIAGONALIZE applies the inverse permutation.
 *
 * Both macros use t0 and t1 as scratch; these are NOT parameters and must be
 * __m128i lvalues in scope at the expansion site. Row arguments are assigned
 * to and must be lvalues. Statement order matters: later statements read
 * values saved by earlier ones.
 *
 * Two implementations are provided and selected by HAVE_SSSE3. Note that the
 * SSSE3 expansions end with a semicolon while the SSE2 ones do not, so a
 * caller should always follow the macro invocation with ';'.
 */
#if defined(HAVE_SSSE3)
/* SSSE3: one _mm_alignr_epi8 per output register (8-byte shift of a register pair). */
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0;    \
  \
  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
  row4l = t1; \
  row4h = t0;

/* SSSE3 inverse: same structure with the alignr operand pairs exchanged. */
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0; \
  \
  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
  row4l = t1; \
  row4h = t0;
#else

/*
 * SSE2 fallback: builds each output register from _mm_unpacklo_epi64 /
 * _mm_unpackhi_epi64 pairs. The original row4l and row2l are saved in t0/t1
 * first; row4l is then reused as scratch for the row3 half swap before it
 * receives its final value.
 */
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = row4l;\
  t1 = row2l;\
  row4l = row3l;\
  row3l = row3h;\
  row3h = row4l;\
  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))

/*
 * SSE2 inverse: swaps the row3 halves through t0, then saves the incoming
 * row2l/row4l in t0/t1 because they are overwritten before their last use.
 */
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = row3l;\
  row3l = row3h;\
  row3h = t0;\
  t0 = row2l;\
  t1 = row4l;\
  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))

#endif

#if defined(HAVE_SSE41)
#include "blake2b-load-sse41.h"
#else
#include "blake2b-load-sse2.h"
#endif

/*
 * ROUND(r) -- one full round, with r selecting the message schedule.
 *
 * r is pasted into the names LOAD_MSG_<r>_1 .. LOAD_MSG_<r>_4, which come
 * from the blake2b-load-*.h header included above; each one fills b0/b1 for
 * the following G1 or G2 half step.
 *
 * Not hygienic: the expansion operates directly on variables named
 * row1l..row4l, row1h..row4h, b0 and b1 (plus whatever DIAGONALIZE and the
 * LOAD_MSG_* macros reference), all of which must exist at the call site.
 * The expansion is a statement sequence that already ends in a semicolon.
 */
#define ROUND(r) \
  /* first pass: G1 then G2 on the rows as they stand */ \
  LOAD_MSG_##r##_1(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_##r##_2(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  /* second pass: same two half steps between DIAGONALIZE / UNDIAGONALIZE */ \
  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
  LOAD_MSG_##r##_3(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_##r##_4(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);

#endif
Major License text cleanup. 2013-03-07 14:56:48 +00:00			`/*`
			`* This file is a part of Pcompress, a chunked parallel multi-`
			`* algorithm lossless compression and decompression program.`
			`*`
			`* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.`
			`* Use is subject to license terms.`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 3 of the License, or (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with this program.`
			`* If not, see <http://www.gnu.org/licenses/>.`
			`*`
			`* moinakg@belenix.org, http://moinakg.wordpress.com/`
			`*`
			`*/`

Add optimized BLAKE2 implementations with runtime detection of CPU capability (SSE/AVX). Minor cleanups. 2013-01-26 10:09:10 +00:00			`/*`
			`BLAKE2 reference source code package - optimized C implementations`

			`Written in 2012 by Samuel Neves <sneves@dei.uc.pt>`

			`To the extent possible under law, the author(s) have dedicated all copyright`
			`and related and neighboring rights to this software to the public domain`
			`worldwide. This software is distributed without any warranty.`

			`You should have received a copy of the CC0 Public Domain Dedication along with`
			`this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.`
			`*/`
			`#pragma once`
			`#ifndef __BLAKE2B_ROUND_H__`
			`#define __BLAKE2B_ROUND_H__`

			`#define LOAD(p) _mm_load_si128( (__m128i *)(p) )`
			`#define STORE(p,r) _mm_store_si128((__m128i *)(p), r)`

			`#define LOADU(p) _mm_loadu_si128( (__m128i *)(p) )`
			`#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)`

			`#define TOF(reg) _mm_castsi128_ps((reg))`
			`#define TOI(reg) _mm_castps_si128((reg))`

			`#define LIKELY(x) __builtin_expect((x),1)`


			`/* Microarchitecture-specific macros */`
			`#ifndef HAVE_XOP`
			`#ifdef HAVE_SSSE3`
			`#define _mm_roti_epi64(x, c) \`
			`(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \`
			`: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \`
			`: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \`
			`: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \`
			`: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))`
			`#else`
			`#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) ))`
			`#endif`
			`#else`
			`/* ... */`
			`#endif`



			`#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \`
			`row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \`
			`row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \`
			`\`
			`row4l = _mm_xor_si128(row4l, row1l); \`
			`row4h = _mm_xor_si128(row4h, row1h); \`
			`\`
			`row4l = _mm_roti_epi64(row4l, -32); \`
			`row4h = _mm_roti_epi64(row4h, -32); \`
			`\`
			`row3l = _mm_add_epi64(row3l, row4l); \`
			`row3h = _mm_add_epi64(row3h, row4h); \`
			`\`
			`row2l = _mm_xor_si128(row2l, row3l); \`
			`row2h = _mm_xor_si128(row2h, row3h); \`
			`\`
			`row2l = _mm_roti_epi64(row2l, -24); \`
			`row2h = _mm_roti_epi64(row2h, -24); \`

			`#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \`
			`row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \`
			`row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \`
			`\`
			`row4l = _mm_xor_si128(row4l, row1l); \`
			`row4h = _mm_xor_si128(row4h, row1h); \`
			`\`
			`row4l = _mm_roti_epi64(row4l, -16); \`
			`row4h = _mm_roti_epi64(row4h, -16); \`
			`\`
			`row3l = _mm_add_epi64(row3l, row4l); \`
			`row3h = _mm_add_epi64(row3h, row4h); \`
			`\`
			`row2l = _mm_xor_si128(row2l, row3l); \`
			`row2h = _mm_xor_si128(row2h, row3h); \`
			`\`
			`row2l = _mm_roti_epi64(row2l, -63); \`
			`row2h = _mm_roti_epi64(row2h, -63); \`

			`#if defined(HAVE_SSSE3)`
			`#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \`
			`t0 = _mm_alignr_epi8(row2h, row2l, 8); \`
			`t1 = _mm_alignr_epi8(row2l, row2h, 8); \`
			`row2l = t0; \`
			`row2h = t1; \`
			`\`
			`t0 = row3l; \`
			`row3l = row3h; \`
			`row3h = t0; \`
			`\`
			`t0 = _mm_alignr_epi8(row4h, row4l, 8); \`
			`t1 = _mm_alignr_epi8(row4l, row4h, 8); \`
			`row4l = t1; \`
			`row4h = t0;`

			`#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \`
			`t0 = _mm_alignr_epi8(row2l, row2h, 8); \`
			`t1 = _mm_alignr_epi8(row2h, row2l, 8); \`
			`row2l = t0; \`
			`row2h = t1; \`
			`\`
			`t0 = row3l; \`
			`row3l = row3h; \`
			`row3h = t0; \`
			`\`
			`t0 = _mm_alignr_epi8(row4l, row4h, 8); \`
			`t1 = _mm_alignr_epi8(row4h, row4l, 8); \`
			`row4l = t1; \`
			`row4h = t0;`
			`#else`

			`#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \`
			`t0 = row4l;\`
			`t1 = row2l;\`
			`row4l = row3l;\`
			`row3l = row3h;\`
			`row3h = row4l;\`
			`row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \`
			`row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \`
			`row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \`
			`row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))`

			`#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \`
			`t0 = row3l;\`
			`row3l = row3h;\`
			`row3h = t0;\`
			`t0 = row2l;\`
			`t1 = row4l;\`
			`row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \`
			`row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \`
			`row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \`
			`row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))`

			`#endif`

			`#if defined(HAVE_SSE41)`
			`#include "blake2b-load-sse41.h"`
			`#else`
			`#include "blake2b-load-sse2.h"`
			`#endif`

			`#define ROUND(r) \`
			`LOAD_MSG_ ##r ##_1(b0, b1); \`
			`G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \`
			`LOAD_MSG_ ##r ##_2(b0, b1); \`
			`G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \`
			`DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \`
			`LOAD_MSG_ ##r ##_3(b0, b1); \`
			`G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \`
			`LOAD_MSG_ ##r ##_4(b0, b1); \`
			`G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \`
			`UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);`

			`#endif`