Skip to content

Commit

Permalink
gosthash2012: Import SSE4.1 implementation
Browse files Browse the repository at this point in the history
Cannot detect presence of __SSE4_1__ anymore, since Clang applies
target() only to functions.

Link: https://github.com/adegtyarev/streebog
Signed-off-by: Vitaly Chikunov <[email protected]>
  • Loading branch information
vt-alt committed Nov 27, 2021
1 parent 3a60e6c commit ffa92dc
Show file tree
Hide file tree
Showing 5 changed files with 191 additions and 5 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ set(GOST_HASH_2012_SOURCE_FILES
gosthash2012_precalc.h
gosthash2012_ref.h
gosthash2012_sse2.h
gosthash2012_sse41.h
)

set(GOST_GRASSHOPPER_SOURCE_FILES
Expand Down
4 changes: 4 additions & 0 deletions gosthash2012.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
# endif
#endif

/*
 * Advertise the SSE4.1 implementation whenever the compiler targets
 * x86-64, i.e. whenever __x86_64__ is predefined.
 */
#if defined(__x86_64__)
# define __GOST3411_HAS_SSE41__
#endif

#ifdef __GOST3411_HAS_SSE2__
# if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
# undef __GOST3411_HAS_SSE2__
Expand Down
50 changes: 46 additions & 4 deletions gosthash2012_dispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
# undef STORE
# undef X128R
# undef X128M
# undef EXTRACT
# undef EXTRACT32
# undef EXTRACT64
# undef XLPS128M
# undef XLPS128R
# undef ROUND128
Expand All @@ -59,6 +62,37 @@
# endif
#endif

/*
 * Construct SSE4.1 implementation.
 *
 * The generic body in gosthash2012_g.h is compiled under the name
 * g_sse41 (via the temporary "#define g g_sse41"), using the helper
 * macros supplied by gosthash2012_sse41.h.  Both headers are included
 * between target("sse4.1") push/pop pragmas so that the functions they
 * define may use SSE4.1 instructions.
 *
 * Every helper macro is undefined afterwards so that it does not leak
 * into code following this section.  The list contains STORE, which is
 * the name gosthash2012_sse41.h actually defines.
 */
#ifdef __GOST3411_HAS_SSE41__
# define g g_sse41
# define __GOST3411_USE_SSE41__
# if defined(__clang__)
# pragma clang attribute push (__attribute__((target("sse4.1"))), apply_to = function)
# elif defined(__GNUC__)
# pragma GCC push_options
# pragma GCC target("sse4.1")
# endif
# include "gosthash2012_sse41.h"
# include "gosthash2012_g.h"
# if defined(__clang__)
# pragma clang attribute pop
# elif defined(__GNUC__)
# pragma GCC pop_options
# endif
# undef LOAD
# undef STORE
# undef X128R
# undef X128M
# undef EXTRACT
# undef EXTRACT32
# undef EXTRACT64
# undef XLPS128M
# undef XLPS128R
# undef ROUND128
# undef __GOST3411_USE_SSE41__
# undef g
#endif

#ifdef __GOST3411_HAS_REF__
/* Construct Reference implementation. */
# define g g_ref
Expand All @@ -74,21 +108,29 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
const union uint512_u * RESTRICT m)
{
#if __has_builtin(__builtin_cpu_supports)
# if defined __GOST3411_HAS_SSE41__
if (__builtin_cpu_supports("sse4.1"))
return g_sse41(h, N, m);
# endif
# if defined __GOST3411_HAS_SSE2__
if (__builtin_cpu_supports("sse2"))
return g_sse2(h, N, m);
# elif defined __GOST3411_HAS_REF__
# endif
# if defined __GOST3411_HAS_REF__
g_ref(h, N, m);
# else
# error "No implementation of g() is selected."
# endif
# if !defined __GOST3411_HAS_SSE41__ && \
!defined __GOST3411_HAS_SSE2__ && \
!defined __GOST3411_HAS_REF__
# error "No dynamic implementation of g() is selected."
# endif
#else /* No dynamic dispatcher. */
# if defined __GOST3411_HAS_SSE2__
g_sse2(h, N, m);
# elif defined __GOST3411_HAS_REF__
g_ref(h, N, m);
# else
# error "No implementation of g() is selected."
# error "No static implementation of g() is selected."
# endif
#endif
}
2 changes: 1 addition & 1 deletion gosthash2012_g.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
const union uint512_u * RESTRICT m)
{
#if defined __GOST3411_USE_SSE2__
#if defined __GOST3411_USE_SSE2__ || defined __GOST3411_USE_SSE41__
__m128i xmm0, xmm2, xmm4, xmm6; /* XMMR0-quadruple */
__m128i xmm1, xmm3, xmm5, xmm7; /* XMMR1-quadruple */
unsigned int i;
Expand Down
139 changes: 139 additions & 0 deletions gosthash2012_sse41.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
* Copyright (c) 2013, Alexey Degtyarev <[email protected]>.
* All rights reserved.
*
* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+
*/

#include <mmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

/*
 * EXTRACT resolves to the __m64-based EXTRACT32 when __i386__ is defined
 * and to the 64-bit integer EXTRACT64 on every other target.
 */
#ifndef __i386__
#define EXTRACT EXTRACT64
#else
#define EXTRACT EXTRACT32
#endif

/*
 * Unless the Intel compiler (__ICC) is in use, the 64-bit integer <-> __m64
 * conversions are replaced by plain casts.
 *
 * Both the argument and the whole expansion are parenthesized so that the
 * cast applies to the complete argument expression (e.g. "a ^ b") and the
 * result can be embedded safely in a larger expression.
 */
#ifndef __ICC
#define _mm_cvtsi64_m64(v) ((__m64) (v))
#define _mm_cvtm64_si64(v) ((long long) (v))
#endif

/*
 * LOAD: read four consecutive 128-bit words (64 bytes) starting at &P[0]
 * into xmm0..xmm3, in ascending address order.  Unaligned loads are used,
 * so the source needs no particular alignment.
 */
#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *gost_src128_ = (const __m128i *) &P[0]; \
    xmm0 = _mm_loadu_si128(gost_src128_ + 0); \
    xmm1 = _mm_loadu_si128(gost_src128_ + 1); \
    xmm2 = _mm_loadu_si128(gost_src128_ + 2); \
    xmm3 = _mm_loadu_si128(gost_src128_ + 3); \
}

/*
 * STORE: write xmm0..xmm3 as four consecutive 128-bit words (64 bytes)
 * starting at &(P)[0], in ascending address order.
 *
 * Unaligned stores are used, mirroring the unaligned loads in LOAD and
 * X128M: the aligned _mm_store_si128 faults when the destination is not
 * 16-byte aligned, and nothing in this header guarantees that alignment.
 */
#define STORE(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i *gost_dst128_ = (__m128i *) &(P)[0]; \
    _mm_storeu_si128(&gost_dst128_[0], xmm0); \
    _mm_storeu_si128(&gost_dst128_[1], xmm1); \
    _mm_storeu_si128(&gost_dst128_[2], xmm2); \
    _mm_storeu_si128(&gost_dst128_[3], xmm3); \
}

/*
 * X128R: register-to-register XOR of two 512-bit values.
 * xmm0..xmm3 are updated in place with xmm0^xmm4, xmm1^xmm5, xmm2^xmm6
 * and xmm3^xmm7; xmm4..xmm7 are only read.
 */
#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    xmm0 = _mm_xor_si128(xmm4, xmm0); \
    xmm1 = _mm_xor_si128(xmm5, xmm1); \
    xmm2 = _mm_xor_si128(xmm6, xmm2); \
    xmm3 = _mm_xor_si128(xmm7, xmm3); \
}

/*
 * X128M: XOR the 64 bytes found at &P[0] into xmm0..xmm3.
 * The memory operand is fetched with unaligned loads, 16 bytes per
 * register, in ascending address order.
 */
#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *gost_mem128_ = (const __m128i *) &P[0]; \
    xmm0 = _mm_xor_si128(xmm0, _mm_loadu_si128(gost_mem128_ + 0)); \
    xmm1 = _mm_xor_si128(xmm1, _mm_loadu_si128(gost_mem128_ + 1)); \
    xmm2 = _mm_xor_si128(xmm2, _mm_loadu_si128(gost_mem128_ + 2)); \
    xmm3 = _mm_xor_si128(xmm3, _mm_loadu_si128(gost_mem128_ + 3)); \
}

/*
 * _mm_xor_64: XOR the 64-bit integer value mm1 into the __m64 value mm0.
 * Arguments are parenthesized so that an expression argument is converted
 * as a whole.
 */
#define _mm_xor_64(mm0, mm1) _mm_xor_si64((mm0), _mm_cvtsi64_m64((mm1)))

/*
 * _mm_extract_char: byte number ndx of the 128-bit vector src, as an
 * unsigned char.  The expansion is fully parenthesized so the cast cannot
 * interact with surrounding operators.
 */
#define _mm_extract_char(src, ndx) ((unsigned char) _mm_extract_epi8((src), (ndx)))

/*
 * EXTRACT32: __m64-based variant, selected as EXTRACT when __i386__ is
 * defined.
 *
 * For k = 0..7 the k-th 64-bit lane of the state xmm0..xmm3 (two lanes per
 * register, low lane first) contributes one byte, which is used as an index
 * into the table Ax[k] (defined outside this header).  Byte "row" of every
 * lane produces the low 64 bits of xmm4 and byte "row + 1" the high 64 bits;
 * the eight looked-up values are XORed together in each case.
 *
 * row is used in the byte index of _mm_extract_epi8; callers in this header
 * pass the literals 0, 2, 4 and 6.
 */
#define EXTRACT32(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    __m64 gost_lo_, gost_hi_; \
    \
    /* low half: byte "row" of each of the eight lanes */ \
    gost_lo_ = _mm_cvtsi64_m64(Ax[0][_mm_extract_char(xmm0, row + 0)]); \
    gost_lo_ = _mm_xor_64(gost_lo_, Ax[1][_mm_extract_char(xmm0, row + 8)]); \
    gost_lo_ = _mm_xor_64(gost_lo_, Ax[2][_mm_extract_char(xmm1, row + 0)]); \
    gost_lo_ = _mm_xor_64(gost_lo_, Ax[3][_mm_extract_char(xmm1, row + 8)]); \
    gost_lo_ = _mm_xor_64(gost_lo_, Ax[4][_mm_extract_char(xmm2, row + 0)]); \
    gost_lo_ = _mm_xor_64(gost_lo_, Ax[5][_mm_extract_char(xmm2, row + 8)]); \
    gost_lo_ = _mm_xor_64(gost_lo_, Ax[6][_mm_extract_char(xmm3, row + 0)]); \
    gost_lo_ = _mm_xor_64(gost_lo_, Ax[7][_mm_extract_char(xmm3, row + 8)]); \
    \
    /* high half: byte "row + 1" of each of the eight lanes */ \
    gost_hi_ = _mm_cvtsi64_m64(Ax[0][_mm_extract_char(xmm0, row + 1)]); \
    gost_hi_ = _mm_xor_64(gost_hi_, Ax[1][_mm_extract_char(xmm0, row + 9)]); \
    gost_hi_ = _mm_xor_64(gost_hi_, Ax[2][_mm_extract_char(xmm1, row + 1)]); \
    gost_hi_ = _mm_xor_64(gost_hi_, Ax[3][_mm_extract_char(xmm1, row + 9)]); \
    gost_hi_ = _mm_xor_64(gost_hi_, Ax[4][_mm_extract_char(xmm2, row + 1)]); \
    gost_hi_ = _mm_xor_64(gost_hi_, Ax[5][_mm_extract_char(xmm2, row + 9)]); \
    gost_hi_ = _mm_xor_64(gost_hi_, Ax[6][_mm_extract_char(xmm3, row + 1)]); \
    gost_hi_ = _mm_xor_64(gost_hi_, Ax[7][_mm_extract_char(xmm3, row + 9)]); \
    \
    xmm4 = _mm_set_epi64(gost_hi_, gost_lo_); \
}

/*
 * EXTRACT64: 64-bit integer variant, selected as EXTRACT when __i386__ is
 * not defined.
 *
 * For k = 0..7 the k-th 64-bit lane of the state xmm0..xmm3 (two lanes per
 * register, low lane first) contributes one byte, which is used as an index
 * into the table Ax[k] (defined outside this header).  Byte "row" of every
 * lane produces the low 64 bits of xmm4 and byte "row + 1" the high 64 bits;
 * the eight looked-up values are XORed together in each case.
 *
 * row is used in the byte index of _mm_extract_epi8; callers in this header
 * pass the literals 0, 2, 4 and 6.
 */
#define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    unsigned long long gost_lo_, gost_hi_; \
    \
    /* low half: byte "row" of each of the eight lanes */ \
    gost_lo_ = Ax[0][_mm_extract_char(xmm0, row + 0)]; \
    gost_lo_ ^= Ax[1][_mm_extract_char(xmm0, row + 8)]; \
    gost_lo_ ^= Ax[2][_mm_extract_char(xmm1, row + 0)]; \
    gost_lo_ ^= Ax[3][_mm_extract_char(xmm1, row + 8)]; \
    gost_lo_ ^= Ax[4][_mm_extract_char(xmm2, row + 0)]; \
    gost_lo_ ^= Ax[5][_mm_extract_char(xmm2, row + 8)]; \
    gost_lo_ ^= Ax[6][_mm_extract_char(xmm3, row + 0)]; \
    gost_lo_ ^= Ax[7][_mm_extract_char(xmm3, row + 8)]; \
    \
    /* high half: byte "row + 1" of each of the eight lanes */ \
    gost_hi_ = Ax[0][_mm_extract_char(xmm0, row + 1)]; \
    gost_hi_ ^= Ax[1][_mm_extract_char(xmm0, row + 9)]; \
    gost_hi_ ^= Ax[2][_mm_extract_char(xmm1, row + 1)]; \
    gost_hi_ ^= Ax[3][_mm_extract_char(xmm1, row + 9)]; \
    gost_hi_ ^= Ax[4][_mm_extract_char(xmm2, row + 1)]; \
    gost_hi_ ^= Ax[5][_mm_extract_char(xmm2, row + 9)]; \
    gost_hi_ ^= Ax[6][_mm_extract_char(xmm3, row + 1)]; \
    gost_hi_ ^= Ax[7][_mm_extract_char(xmm3, row + 9)]; \
    \
    /* low lane first, then the high lane is inserted at position 1 */ \
    xmm4 = _mm_insert_epi64(_mm_cvtsi64_si128((long long) gost_lo_), \
                            (long long) gost_hi_, 1); \
}

/*
 * XLPS128M: XOR the 64 bytes at P into the state xmm0..xmm3 (X128M), then
 * replace the state with the table-lookup transform computed by EXTRACT.
 *
 * Each EXTRACT call yields two 64-bit output lanes (rows 0-1, 2-3, 4-5 and
 * 6-7).  All four results are gathered in temporaries first because every
 * EXTRACT reads the complete, not yet overwritten, state.
 */
#define XLPS128M(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i gost_rows01_, gost_rows23_, gost_rows45_, gost_rows67_; \
    \
    X128M(P, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm0, xmm1, xmm2, xmm3, gost_rows01_); \
    EXTRACT(2, xmm0, xmm1, xmm2, xmm3, gost_rows23_); \
    EXTRACT(4, xmm0, xmm1, xmm2, xmm3, gost_rows45_); \
    EXTRACT(6, xmm0, xmm1, xmm2, xmm3, gost_rows67_); \
    \
    xmm0 = gost_rows01_; \
    xmm1 = gost_rows23_; \
    xmm2 = gost_rows45_; \
    xmm3 = gost_rows67_; \
}

/*
 * XLPS128R: XOR the register quadruple xmm0..xmm3 into xmm4..xmm7 (X128R),
 * then replace xmm4..xmm7 with the table-lookup transform computed by
 * EXTRACT.  xmm0..xmm3 are only read.
 *
 * Each EXTRACT call yields two 64-bit output lanes (rows 0-1, 2-3, 4-5 and
 * 6-7).  All four results are gathered in temporaries first because every
 * EXTRACT reads the complete, not yet overwritten, xmm4..xmm7.
 */
#define XLPS128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    __m128i gost_rows01_, gost_rows23_, gost_rows45_, gost_rows67_; \
    \
    X128R(xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm4, xmm5, xmm6, xmm7, gost_rows01_); \
    EXTRACT(2, xmm4, xmm5, xmm6, xmm7, gost_rows23_); \
    EXTRACT(4, xmm4, xmm5, xmm6, xmm7, gost_rows45_); \
    EXTRACT(6, xmm4, xmm5, xmm6, xmm7, gost_rows67_); \
    \
    xmm4 = gost_rows01_; \
    xmm5 = gost_rows23_; \
    xmm6 = gost_rows45_; \
    xmm7 = gost_rows67_; \
}

/*
 * ROUND128: one round on two register quadruples.
 *
 * First, the quadruple (xmm0, xmm2, xmm4, xmm6) is XORed with the 64 bytes
 * at element i of C and transformed (XLPS128M).  Then that updated
 * quadruple is XORed into (xmm1, xmm3, xmm5, xmm7), which is transformed in
 * turn (XLPS128R).  C is defined outside this header (presumably the table
 * of round constants -- confirm against its definition).
 */
#define ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7) { \
    XLPS128M((C + (i)), xmm0, xmm2, xmm4, xmm6); \
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \
}

0 comments on commit ffa92dc

Please sign in to comment.