From b618bbc3a2a40c0f7467acaf80c27429e63dd20e Mon Sep 17 00:00:00 2001
From: Vitaly Chikunov <vt@altlinux.org>
Date: Sat, 27 Nov 2021 04:40:06 +0300
Subject: [PATCH] gosthash2012: Import and merge MMX implementations

Merged and fixed two MMX implementations. For example, [1] uses the
SSE2 register type `__m128i', and [2] GCC's `mmintrin.h' defines
`_mm_cvtsi64_m64' only for `__x86_64__', but we need MMX precisely
for IA-32, since x86_64 has SSE2 in its baseline.

Link: https://github.com/adegtyarev/streebog
Link: https://github.com/sjinks/php-stribog

Signed-off-by: Vitaly Chikunov <vt@altlinux.org>
---
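Note (below the `---' cut line, so `git am' drops it): a minimal
standalone sketch, assuming GCC or Clang on IA-32 with `-mmmx', of the
MMX idiom the new gosthash2012_mmx.h builds on. The function name is
hypothetical; the real code works through the XLOAD/STORE/XLPS32 macros
in the diff below. The closing _mm_empty() is the important part: MMX
registers alias the x87 FPU stack, which is also why gosthash2012_g.h
gains an _mm_empty() call on the MMX path.

    #include <mmintrin.h>

    /* XOR two 512-bit blocks as 8 x 64-bit MMX lanes (illustrative). */
    static void xor512_mmx(void *z, const void *x, const void *y)
    {
        const __m64 *px = (const __m64 *) x;
        const __m64 *py = (const __m64 *) y;
        __m64 *pz = (__m64 *) z;
        int i;

        for (i = 0; i < 8; i++)
            pz[i] = _mm_xor_si64(px[i], py[i]);
        /* Leave MMX state clean before any x87 FP code runs (EMMS). */
        _mm_empty();
    }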
 CMakeLists.txt          |   1 +
 gosthash2012.h          |  13 +++++
 gosthash2012_dispatch.h |  31 +++++++++++
 gosthash2012_g.h        |   5 +-
 gosthash2012_mmx.h      | 113 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 162 insertions(+), 1 deletion(-)
 create mode 100644 gosthash2012_mmx.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef4011579..9e58c7ab1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -129,6 +129,7 @@ set(GOST_HASH_2012_SOURCE_FILES
         gosthash2012_const.h
         gosthash2012_precalc.h
         gosthash2012_ref.h
+        gosthash2012_mmx.h
         gosthash2012_sse2.h
         gosthash2012_sse41.h
         )
diff --git a/gosthash2012.h b/gosthash2012.h
index 902394b60..1d3b46bc8 100644
--- a/gosthash2012.h
+++ b/gosthash2012.h
@@ -13,6 +13,19 @@
 /* Can be undef'd to disable ref impl. */
 #define __GOST3411_HAS_REF__
 
+#ifdef __i386__
+/* Pure MMX is meaningful only for IA-32. */
+# ifdef __SSE__
+/*
+ * If the user compiled with '-msse' or higher, the compiler can introduce
+ * higher-level microarchitecture optimizations into MMX code.
+ */
+#  warning "MMX implementation will be broken if SSE is enabled. Disabling it."
+# else
+#  define __GOST3411_HAS_MMX__
+# endif
+#endif
+
 #ifdef __SSE2__
 # define __GOST3411_HAS_SSE2__
 # if !defined(__x86_64__) && !defined(__e2k__)
diff --git a/gosthash2012_dispatch.h b/gosthash2012_dispatch.h
index 39671127e..d9fb511ff 100644
--- a/gosthash2012_dispatch.h
+++ b/gosthash2012_dispatch.h
@@ -20,6 +20,33 @@
 # define __has_builtin(x) 0
 #endif
+/* Construct MMX implementation. */
+#ifdef __GOST3411_HAS_MMX__
+# define g g_mmx
+# define __GOST3411_USE_MMX__
+# if defined(__clang__)
+#  pragma clang attribute push (__attribute__((target("mmx"))), apply_to = function)
+# elif defined(__GNUC__)
+#  pragma GCC push_options
+#  pragma GCC target("mmx")
+# endif
+# include "gosthash2012_mmx.h"
+# include "gosthash2012_g.h"
+# if defined(__clang__)
+#  pragma clang attribute pop
+# elif defined(__GNUC__)
+#  pragma GCC pop_options
+# endif
+# undef XLOAD
+# undef STORE
+# undef TRANSPOSE
+# undef XTRANSPOSE
+# undef XLPS32
+# undef XLPS
+# undef __GOST3411_USE_MMX__
+# undef g
+#endif
+
 /*
  * Construct SSE2 implementation. SSE2 is baseline in x86_64, but a feature
  * on IA-32, thus pass target() for IA-32.
  */
@@ -116,6 +143,10 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
     if (__builtin_cpu_supports("sse2"))
         return g_sse2(h, N, m);
 # endif
+# if defined __GOST3411_HAS_MMX__
+    if (__builtin_cpu_supports("mmx"))
+        return g_mmx(h, N, m);
+# endif
 # if defined __GOST3411_HAS_REF__
     g_ref(h, N, m);
 # endif
diff --git a/gosthash2012_g.h b/gosthash2012_g.h
index 62bd837b2..b0fcbd91c 100644
--- a/gosthash2012_g.h
+++ b/gosthash2012_g.h
@@ -41,7 +41,7 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
     /* This is only required on MMX, but EXTRACT32 is using MMX */
     _mm_empty();
 # endif
-#else
+#else /* ref and MMX impl. */
     union uint512_u Ki, data;
     unsigned int i;
 
@@ -60,5 +60,8 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
     X((&data), h, (&data));
     X((&data), m, h);
 
+# ifdef __GOST3411_USE_MMX__
+    _mm_empty();
+# endif
 #endif
 }
diff --git a/gosthash2012_mmx.h b/gosthash2012_mmx.h
new file mode 100644
index 000000000..83eae5f70
--- /dev/null
+++ b/gosthash2012_mmx.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2013, Alexey Degtyarev.
+ * Copyright (c) 2013 Vladimir Kolesnikov.
+ * Copyright (c) 2021 Vitaly Chikunov <vt@altlinux.org>.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: (BSD-2-Clause OR GPL-2.0+) AND MIT
+ */
+
+#include <mmintrin.h>
+
+#define XLPS XLPS32
+
+#define X(x, y, z) { \
+    z->QWORD[0] = x->QWORD[0] ^ y->QWORD[0]; \
+    z->QWORD[1] = x->QWORD[1] ^ y->QWORD[1]; \
+    z->QWORD[2] = x->QWORD[2] ^ y->QWORD[2]; \
+    z->QWORD[3] = x->QWORD[3] ^ y->QWORD[3]; \
+    z->QWORD[4] = x->QWORD[4] ^ y->QWORD[4]; \
+    z->QWORD[5] = x->QWORD[5] ^ y->QWORD[5]; \
+    z->QWORD[6] = x->QWORD[6] ^ y->QWORD[6]; \
+    z->QWORD[7] = x->QWORD[7] ^ y->QWORD[7]; \
+}
+
+#define XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
+    const __m64 *px = (const __m64 *) &x[0]; \
+    const __m64 *py = (const __m64 *) &y[0]; \
+    mm0 = _mm_xor_si64(px[0], py[0]); \
+    mm1 = _mm_xor_si64(px[1], py[1]); \
+    mm2 = _mm_xor_si64(px[2], py[2]); \
+    mm3 = _mm_xor_si64(px[3], py[3]); \
+    mm4 = _mm_xor_si64(px[4], py[4]); \
+    mm5 = _mm_xor_si64(px[5], py[5]); \
+    mm6 = _mm_xor_si64(px[6], py[6]); \
+    mm7 = _mm_xor_si64(px[7], py[7]); \
+}
+
+#define STORE(P, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
+    unsigned long long *__m64p = &P->QWORD[0]; \
+    __m64p[0] = (unsigned long long)(mm0); \
+    __m64p[1] = (unsigned long long)(mm1); \
+    __m64p[2] = (unsigned long long)(mm2); \
+    __m64p[3] = (unsigned long long)(mm3); \
+    __m64p[4] = (unsigned long long)(mm4); \
+    __m64p[5] = (unsigned long long)(mm5); \
+    __m64p[6] = (unsigned long long)(mm6); \
+    __m64p[7] = (unsigned long long)(mm7); \
+}
+
+#define TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
+    __m64 tm0, tm1, tm2, tm3, tm4, tm5, tm6, tm7; \
+    tm0 = _mm_unpacklo_pi8(mm0, mm2); \
+    tm1 = _mm_unpackhi_pi8(mm0, mm2); \
+    tm2 = _mm_unpacklo_pi8(mm1, mm3); \
+    tm3 = _mm_unpackhi_pi8(mm1, mm3); \
+    tm4 = _mm_unpacklo_pi8(mm4, mm6); \
+    tm5 = _mm_unpackhi_pi8(mm4, mm6); \
+    tm6 = _mm_unpacklo_pi8(mm5, mm7); \
+    tm7 = _mm_unpackhi_pi8(mm5, mm7); \
+    \
+    mm0 = _mm_unpacklo_pi8(tm0, tm2); \
+    mm1 = _mm_unpackhi_pi8(tm0, tm2); \
+    mm2 = _mm_unpacklo_pi8(tm1, tm3); \
+    mm3 = _mm_unpackhi_pi8(tm1, tm3); \
+    mm4 = _mm_unpacklo_pi8(tm4, tm6); \
+    mm5 = _mm_unpackhi_pi8(tm4, tm6); \
+    mm6 = _mm_unpacklo_pi8(tm5, tm7); \
+    mm7 = _mm_unpackhi_pi8(tm5, tm7); \
+    \
+    tm2 = _mm_unpacklo_pi32(mm1, mm5); \
+    tm3 = _mm_unpackhi_pi32(mm1, mm5); \
+    tm0 = _mm_unpacklo_pi32(mm0, mm4); \
+    tm1 = _mm_unpackhi_pi32(mm0, mm4); \
+    mm4 = _mm_unpacklo_pi32(mm2, mm6); \
+    mm5 = _mm_unpackhi_pi32(mm2, mm6); \
+    mm6 = _mm_unpacklo_pi32(mm3, mm7); \
+    mm7 = _mm_unpackhi_pi32(mm3, mm7); \
+    mm0 = tm0; \
+    mm1 = tm1; \
+    mm2 = tm2; \
+    mm3 = tm3; \
+}
+
+#define XTRANSPOSE(x, y, z) { \
+    __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; \
+    XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
+    TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
+    STORE(z, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
+}
+#define XLPS32(x, y, data) { \
+    unsigned int xi; \
+    unsigned char *p; \
+    ALIGN(16) union uint512_u buf; \
+    XTRANSPOSE(x, y, (&buf)); \
+    p = (unsigned char *) &buf; \
+    for (xi = 0; xi < 8; xi++) \
+    { \
+        __m64 mm0 = (__m64)(Ax[0][*(p++)]); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[1][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[2][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[3][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[4][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[5][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[6][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[7][*(p++)])); \
+        data->QWORD[xi] = (unsigned long long) mm0; \
+    } \
+}
+
+#define ROUND(i, Ki, data) { \
+    XLPS(Ki, (&C[i]), Ki); \
+    XLPS(Ki, data, data); \
+}
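
Footnote (after the final hunk, where `git apply' ignores trailing
text): a minimal sketch of the runtime-dispatch pattern that the
gosthash2012_dispatch.h hunks extend, assuming GCC or Clang for
__builtin_cpu_supports(); the g_*() stubs are illustrative stand-ins
for the real implementations.

    #include <stdio.h>

    static void g_ref(void)  { puts("ref");  }   /* portable fallback */
    static void g_mmx(void)  { puts("mmx");  }   /* IA-32 MMX variant */
    static void g_sse2(void) { puts("sse2"); }   /* SSE2 variant      */

    /* Try the widest ISA the CPU reports first, as the patch does. */
    static void g_dispatch(void)
    {
        if (__builtin_cpu_supports("sse2")) {
            g_sse2();
            return;
        }
        if (__builtin_cpu_supports("mmx")) {
            g_mmx();
            return;
        }
        g_ref();
    }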