Skip to content

Commit

Permalink
gosthash2012: Import and merge MMX implementations
Browse files Browse the repository at this point in the history
Merged and fixed two MMX implementations. For example,
[1] uses SSE2 register types `__m128i',
[2] GCC's `mmintrin.h' defines `_mm_cvtsi64_m64' only for `__x86_64__',
    but we need MMX specifically for IA-32, since x86_64 already has SSE2 in its baseline.

Link: https://github.com/adegtyarev/streebog
Link: https://github.com/sjinks/php-stribog
Signed-off-by: Vitaly Chikunov <[email protected]>
  • Loading branch information
vt-alt committed Nov 27, 2021
1 parent ffa92dc commit b618bbc
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 1 deletion.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ set(GOST_HASH_2012_SOURCE_FILES
gosthash2012_const.h
gosthash2012_precalc.h
gosthash2012_ref.h
gosthash2012_mmx.h
gosthash2012_sse2.h
gosthash2012_sse41.h
)
Expand Down
13 changes: 13 additions & 0 deletions gosthash2012.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@
/* Can be undef'd to disable ref impl. */
#define __GOST3411_HAS_REF__

/*
 * Decide whether the pure MMX implementation is built.
 *
 * Pure MMX is meaningful only for IA-32, so nothing happens unless __i386__
 * is defined. When the user compiles with '-msse' or higher (__SSE__ is
 * defined) the compiler can introduce higher level microarchitecture
 * optimizations into the MMX code, so in that case the MMX implementation
 * is left disabled and a warning is emitted instead.
 */
#if defined(__i386__) && defined(__SSE__)
# warning "MMX implementation will be broken if SSE enabled. Disabling it."
#elif defined(__i386__)
# define __GOST3411_HAS_MMX__
#endif

#ifdef __SSE2__
# define __GOST3411_HAS_SSE2__
# if !defined(__x86_64__) && !defined(__e2k__)
Expand Down
31 changes: 31 additions & 0 deletions gosthash2012_dispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,33 @@
# define __has_builtin(x) 0
#endif

/*
 * Construct MMX implementation.
 *
 * gosthash2012_g.h is included with `g' renamed to `g_mmx' and with the
 * helper macros from gosthash2012_mmx.h in effect. __GOST3411_USE_MMX__ tells
 * gosthash2012_g.h that MMX registers are in use. Both headers are compiled
 * with the "mmx" target enabled (clang attribute push/pop, or GCC
 * push_options/pop_options).
 *
 * Every macro that gosthash2012_mmx.h defines is undefined again afterwards,
 * including X and ROUND, so that nothing from the MMX build leaks into the
 * implementations constructed later in this file.
 */
#ifdef __GOST3411_HAS_MMX__
# define g g_mmx
# define __GOST3411_USE_MMX__
# if defined(__clang__)
#  pragma clang attribute push (__attribute__((target("mmx"))), apply_to = function)
# elif defined(__GNUC__)
#  pragma GCC push_options
#  pragma GCC target("mmx")
# endif
# include "gosthash2012_mmx.h"
# include "gosthash2012_g.h"
# if defined(__clang__)
#  pragma clang attribute pop
# elif defined(__GNUC__)
#  pragma GCC pop_options
# endif
# undef X
# undef XLOAD
# undef STORE
# undef TRANSPOSE
# undef XTRANSPOSE
# undef XLPS32
# undef XLPS
# undef ROUND
# undef __GOST3411_USE_MMX__
# undef g
#endif

/*
* Construct SSE2 implementation. SSE2 is baseline in x86_64, but a feature
* on IA-32, thus pass target() for IA-32.
Expand Down Expand Up @@ -116,6 +143,10 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
if (__builtin_cpu_supports("sse2"))
return g_sse2(h, N, m);
# endif
# if defined __GOST3411_HAS_MMX__
if (__builtin_cpu_supports("mmx"))
return g_mmx(h, N, m);
# endif
# if defined __GOST3411_HAS_REF__
g_ref(h, N, m);
# endif
Expand Down
5 changes: 4 additions & 1 deletion gosthash2012_g.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
/* This is only required on MMX, but EXTRACT32 is using MMX */
_mm_empty();
# endif
#else
#else /* ref and MMX impl. */
union uint512_u Ki, data;
unsigned int i;

Expand All @@ -60,5 +60,8 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,

X((&data), h, (&data));
X((&data), m, h);
# ifdef __GOST3411_USE_MMX__
_mm_empty();
# endif
#endif
}
113 changes: 113 additions & 0 deletions gosthash2012_mmx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2013, Alexey Degtyarev <[email protected]>.
* Copyright (c) 2013 Vladimir Kolesnikov.
* Copyright (c) 2021 Vitaly Chikunov <[email protected]>.
* All rights reserved.
*
* SPDX-License-Identifier: (BSD-2-Clause OR GPL-2.0+) AND MIT
*/

#include <mmintrin.h>

/* ROUND below is written in terms of XLPS; in this header it resolves to the
 * MMX-assisted XLPS32. */
#define XLPS XLPS32

/*
 * X(x, y, z): z = x ^ y, word by word over QWORD[0..7].
 *
 * x, y and z are pasted directly in front of `->QWORD[...]' without
 * parentheses, so each must be an identifier or an already parenthesized
 * pointer expression, e.g. X((&data), h, (&data)).
 *
 * z may be the same object as x or y: every output word depends only on the
 * input words with the same index. Plain integer XOR; no MMX intrinsics are
 * used here.
 */
#define X(x, y, z) { \
z->QWORD[0] = x->QWORD[0] ^ y->QWORD[0]; \
z->QWORD[1] = x->QWORD[1] ^ y->QWORD[1]; \
z->QWORD[2] = x->QWORD[2] ^ y->QWORD[2]; \
z->QWORD[3] = x->QWORD[3] ^ y->QWORD[3]; \
z->QWORD[4] = x->QWORD[4] ^ y->QWORD[4]; \
z->QWORD[5] = x->QWORD[5] ^ y->QWORD[5]; \
z->QWORD[6] = x->QWORD[6] ^ y->QWORD[6]; \
z->QWORD[7] = x->QWORD[7] ^ y->QWORD[7]; \
}

/*
 * XLOAD(x, y, mm0..mm7): load two 64-byte inputs and XOR them into eight
 * __m64 values.
 *
 *   x, y      pointer expressions; &(x)[0] and &(y)[0] are read as arrays of
 *             eight __m64 (64 bytes each).
 *   mm0..mm7  __m64 lvalues; mmK receives x[K] ^ y[K].
 *
 * x and y are parenthesized so that expression arguments (for example
 * `p + 1') index correctly, and the hidden locals carry an `xload_' prefix so
 * they cannot capture a caller variable that happens to be called px or py.
 */
#define XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
    const __m64 *xload_px_ = (const __m64 *) &(x)[0]; \
    const __m64 *xload_py_ = (const __m64 *) &(y)[0]; \
    mm0 = _mm_xor_si64(xload_px_[0], xload_py_[0]); \
    mm1 = _mm_xor_si64(xload_px_[1], xload_py_[1]); \
    mm2 = _mm_xor_si64(xload_px_[2], xload_py_[2]); \
    mm3 = _mm_xor_si64(xload_px_[3], xload_py_[3]); \
    mm4 = _mm_xor_si64(xload_px_[4], xload_py_[4]); \
    mm5 = _mm_xor_si64(xload_px_[5], xload_py_[5]); \
    mm6 = _mm_xor_si64(xload_px_[6], xload_py_[6]); \
    mm7 = _mm_xor_si64(xload_px_[7], xload_py_[7]); \
}

/*
 * STORE(P, mm0..mm7): write eight __m64 values into P->QWORD[0..7].
 *
 *   P         pointer expression to an object with an
 *             `unsigned long long QWORD[8]' member.
 *   mm0..mm7  __m64 values; mmK is stored, bit for bit, into QWORD[K].
 *
 * P is parenthesized so that arguments such as `&buf' work without extra
 * parentheses at the call site. The hidden local avoids the reserved `__'
 * prefix the previous name used.
 */
#define STORE(P, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
    unsigned long long *store_qw_ = &(P)->QWORD[0]; \
    store_qw_[0] = (unsigned long long)(mm0); \
    store_qw_[1] = (unsigned long long)(mm1); \
    store_qw_[2] = (unsigned long long)(mm2); \
    store_qw_[3] = (unsigned long long)(mm3); \
    store_qw_[4] = (unsigned long long)(mm4); \
    store_qw_[5] = (unsigned long long)(mm5); \
    store_qw_[6] = (unsigned long long)(mm6); \
    store_qw_[7] = (unsigned long long)(mm7); \
}

/*
 * TRANSPOSE(mm0..mm7): in-place transpose of an 8x8 byte matrix.
 *
 * On entry mmR holds row R (byte C of mmR is element [R][C]); on exit mmC
 * holds column C (byte R of mmC is the former element [R][C]).
 *
 * Three interleave passes are used:
 *   pass 1  byte-interleave rows (0,2), (1,3), (4,6), (5,7);
 *   pass 2  byte-interleave the pass-1 results, giving two columns of four
 *           rows in each register;
 *   pass 3  dword-interleave the upper-rows and lower-rows halves so that
 *           each register ends up with one complete column.
 */
#define TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
    /* pass 1 */ \
    const __m64 p1_0 = _mm_unpacklo_pi8(mm0, mm2); \
    const __m64 p1_1 = _mm_unpackhi_pi8(mm0, mm2); \
    const __m64 p1_2 = _mm_unpacklo_pi8(mm1, mm3); \
    const __m64 p1_3 = _mm_unpackhi_pi8(mm1, mm3); \
    const __m64 p1_4 = _mm_unpacklo_pi8(mm4, mm6); \
    const __m64 p1_5 = _mm_unpackhi_pi8(mm4, mm6); \
    const __m64 p1_6 = _mm_unpacklo_pi8(mm5, mm7); \
    const __m64 p1_7 = _mm_unpackhi_pi8(mm5, mm7); \
    /* pass 2 */ \
    const __m64 p2_0 = _mm_unpacklo_pi8(p1_0, p1_2); \
    const __m64 p2_1 = _mm_unpackhi_pi8(p1_0, p1_2); \
    const __m64 p2_2 = _mm_unpacklo_pi8(p1_1, p1_3); \
    const __m64 p2_3 = _mm_unpackhi_pi8(p1_1, p1_3); \
    const __m64 p2_4 = _mm_unpacklo_pi8(p1_4, p1_6); \
    const __m64 p2_5 = _mm_unpackhi_pi8(p1_4, p1_6); \
    const __m64 p2_6 = _mm_unpacklo_pi8(p1_5, p1_7); \
    const __m64 p2_7 = _mm_unpackhi_pi8(p1_5, p1_7); \
    /* pass 3: results go straight back into the caller's registers */ \
    mm0 = _mm_unpacklo_pi32(p2_0, p2_4); \
    mm1 = _mm_unpackhi_pi32(p2_0, p2_4); \
    mm2 = _mm_unpacklo_pi32(p2_1, p2_5); \
    mm3 = _mm_unpackhi_pi32(p2_1, p2_5); \
    mm4 = _mm_unpacklo_pi32(p2_2, p2_6); \
    mm5 = _mm_unpackhi_pi32(p2_2, p2_6); \
    mm6 = _mm_unpacklo_pi32(p2_3, p2_7); \
    mm7 = _mm_unpackhi_pi32(p2_3, p2_7); \
}

/*
 * XTRANSPOSE(x, y, z): z = TRANSPOSE(x ^ y).
 *
 * Chains XLOAD, TRANSPOSE and STORE: the two inputs are XORed into eight
 * __m64 temporaries, the temporaries are transposed in place, and the result
 * is written to z->QWORD[0..7].
 */
#define XTRANSPOSE(x, y, z) { \
    __m64 lane0, lane1, lane2, lane3; \
    __m64 lane4, lane5, lane6, lane7; \
    XLOAD(x, y, lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7); \
    TRANSPOSE(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7); \
    STORE(z, lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7); \
}
/*
 * XLPS32(x, y, data): table-driven transform of x ^ y, written to data.
 *
 * 1. XTRANSPOSE(x, y, ...) fills a 16-byte aligned local buffer.
 * 2. The buffer is consumed as 8 groups of 8 consecutive bytes. For group R,
 *    byte K selects the entry Ax[K][byte]; the eight selected 64-bit entries
 *    are XORed together and stored in data->QWORD[R].
 *
 * data may be the same object as x or y: both inputs are fully consumed into
 * the local buffer before the first store to data.
 */
#define XLPS32(x, y, data) { \
    ALIGN(16) union uint512_u xlps_buf; \
    const unsigned char *xlps_in; \
    unsigned int xlps_row; \
    XTRANSPOSE(x, y, (&xlps_buf)); \
    xlps_in = (const unsigned char *) &xlps_buf; \
    for (xlps_row = 0; xlps_row < 8; xlps_row++, xlps_in += 8) \
    { \
        __m64 xlps_acc = (__m64)(Ax[0][xlps_in[0]]); \
        xlps_acc = _mm_xor_si64(xlps_acc, (__m64)(Ax[1][xlps_in[1]])); \
        xlps_acc = _mm_xor_si64(xlps_acc, (__m64)(Ax[2][xlps_in[2]])); \
        xlps_acc = _mm_xor_si64(xlps_acc, (__m64)(Ax[3][xlps_in[3]])); \
        xlps_acc = _mm_xor_si64(xlps_acc, (__m64)(Ax[4][xlps_in[4]])); \
        xlps_acc = _mm_xor_si64(xlps_acc, (__m64)(Ax[5][xlps_in[5]])); \
        xlps_acc = _mm_xor_si64(xlps_acc, (__m64)(Ax[6][xlps_in[6]])); \
        xlps_acc = _mm_xor_si64(xlps_acc, (__m64)(Ax[7][xlps_in[7]])); \
        data->QWORD[xlps_row] = (unsigned long long) xlps_acc; \
    } \
}

/*
 * ROUND(i, Ki, data): two chained XLPS steps.
 *
 *   1. Ki   = XLPS(Ki, &C[i])  - Ki is combined with entry i of the table C
 *                                (C is defined outside this header);
 *   2. data = XLPS(Ki, data)   - data is combined with the updated Ki.
 *
 * Ki and data are forwarded to XLPS unparenthesized, so callers pass them
 * as identifiers or already parenthesized pointer expressions.
 */
#define ROUND(i, Ki, data) { \
XLPS(Ki, (&C[i]), Ki); \
XLPS(Ki, data, data); \
}

0 comments on commit b618bbc

Please sign in to comment.