gosthash2012: Import and merge MMX implementations

Merged and fixed two MMX implementations. For example, [1] uses the SSE2 register type `__m128i', and for [2], GCC's `mmintrin.h' defines `_mm_cvtsi64_m64' only for `__x86_64__', but we need MMX specifically for IA-32, since x86_64 already has SSE2 in its baseline. Link: https://github.com/adegtyarev/streebog Link: https://github.com/sjinks/php-stribog Signed-off-by: Vitaly Chikunov <[email protected]>
gost-engine · Nov 27, 2021 · b618bbc · b618bbc
1 parent ffa92dc
commit b618bbc
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 1 deletion.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -129,6 +129,7 @@ set(GOST_HASH_2012_SOURCE_FILES
         gosthash2012_const.h
         gosthash2012_precalc.h
         gosthash2012_ref.h
+        gosthash2012_mmx.h
         gosthash2012_sse2.h
         gosthash2012_sse41.h
         )

diff --git a/gosthash2012.h b/gosthash2012.h
@@ -13,6 +13,19 @@
 /* Can be undef'd to disable ref impl. */
 #define __GOST3411_HAS_REF__
 
+#ifdef __i386__
+/* A pure MMX implementation is only meaningful on IA-32. */
+# ifdef __SSE__
+/*
+ * If the user compiles with '-msse' or higher, the compiler may introduce
+ * higher-level microarchitecture optimizations into the MMX code.
+ */
+#  warning "MMX implementation will be broken if SSE enabled. Disabling it."
+# else
+#  define __GOST3411_HAS_MMX__
+# endif /* __SSE__ */
+#endif /* __i386__ */
+
 #ifdef __SSE2__
 # define __GOST3411_HAS_SSE2__
 # if !defined(__x86_64__) && !defined(__e2k__)

diff --git a/gosthash2012_dispatch.h b/gosthash2012_dispatch.h
@@ -20,6 +20,33 @@
 # define __has_builtin(x) 0
 #endif
 
+/* Construct the MMX implementation: gosthash2012_g.h is included with g renamed to g_mmx. */
+#ifdef __GOST3411_HAS_MMX__
+# define g g_mmx
+# define __GOST3411_USE_MMX__ /* checked in gosthash2012_g.h to emit _mm_empty() */
+# if defined(__clang__)
+#  pragma clang attribute push (__attribute__((target("mmx"))), apply_to = function)
+# elif defined(__GNUC__)
+#  pragma GCC push_options
+#  pragma GCC target("mmx")
+# endif
+# include "gosthash2012_mmx.h"
+# include "gosthash2012_g.h"
+# if defined(__clang__)
+#  pragma clang attribute pop
+# elif defined(__GNUC__)
+#  pragma GCC pop_options
+# endif
+# undef XLOAD /* drop the MMX helper macros; presumably so later implementations can redefine them */
+# undef STORE
+# undef TRANSPOSE
+# undef XTRANSPOSE
+# undef XLPS32
+# undef XLPS
+# undef __GOST3411_USE_MMX__ /* NOTE(review): X and ROUND are not undefined here - confirm this is handled elsewhere */
+# undef g
+#endif /* __GOST3411_HAS_MMX__ */
+
 /*
  * Construct SSE2 implementation. SSE2 is baseline in x86_64, but a feature
  * on IA-32, thus pass target() for IA-32.
@@ -116,6 +143,10 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
     if (__builtin_cpu_supports("sse2"))
 	return g_sse2(h, N, m);
 # endif
+# if defined __GOST3411_HAS_MMX__
+    if (__builtin_cpu_supports("mmx"))
+	return g_mmx(h, N, m);
+# endif
 # if defined __GOST3411_HAS_REF__
     g_ref(h, N, m);
 # endif

diff --git a/gosthash2012_g.h b/gosthash2012_g.h
@@ -41,7 +41,7 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
     /* This is only required on MMX, but EXTRACT32 is using MMX */
     _mm_empty();
 # endif
-#else
+#else /* ref and MMX impl. */
     union uint512_u Ki, data;
     unsigned int i;
 
@@ -60,5 +60,8 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
 
     X((&data), h, (&data));
     X((&data), m, h);
+# ifdef __GOST3411_USE_MMX__
+    _mm_empty();
+# endif
 #endif
 }
diff --git a/gosthash2012_mmx.h b/gosthash2012_mmx.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2013, Alexey Degtyarev <[email protected]>.
+ * Copyright (c) 2013 Vladimir Kolesnikov.
+ * Copyright (c) 2021 Vitaly Chikunov <[email protected]>.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: (BSD-2-Clause OR GPL-2.0+) AND MIT
+ */
+
+#include <mmintrin.h>
+
+#define XLPS XLPS32 /* ROUND() invokes XLPS; route it to the XLPS32 macro of this header */
+
+#define X(x, y, z) { /* z = x ^ y over QWORD[0..7]; arguments are used as pointers (x->QWORD) */ \
+    z->QWORD[0] = x->QWORD[0] ^ y->QWORD[0]; \
+    z->QWORD[1] = x->QWORD[1] ^ y->QWORD[1]; \
+    z->QWORD[2] = x->QWORD[2] ^ y->QWORD[2]; \
+    z->QWORD[3] = x->QWORD[3] ^ y->QWORD[3]; \
+    z->QWORD[4] = x->QWORD[4] ^ y->QWORD[4]; \
+    z->QWORD[5] = x->QWORD[5] ^ y->QWORD[5]; \
+    z->QWORD[6] = x->QWORD[6] ^ y->QWORD[6]; \
+    z->QWORD[7] = x->QWORD[7] ^ y->QWORD[7]; \
+}
+
+#define XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { /* mmN = x[N] ^ y[N] for N = 0..7 */ \
+    const __m64 *px = (const __m64 *) &x[0]; /* view x as consecutive __m64 words */ \
+    const __m64 *py = (const __m64 *) &y[0]; /* view y as consecutive __m64 words */ \
+    mm0 = _mm_xor_si64(px[0], py[0]); \
+    mm1 = _mm_xor_si64(px[1], py[1]); \
+    mm2 = _mm_xor_si64(px[2], py[2]); \
+    mm3 = _mm_xor_si64(px[3], py[3]); \
+    mm4 = _mm_xor_si64(px[4], py[4]); \
+    mm5 = _mm_xor_si64(px[5], py[5]); \
+    mm6 = _mm_xor_si64(px[6], py[6]); \
+    mm7 = _mm_xor_si64(px[7], py[7]); \
+}
+
+#define STORE(P, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { /* write mm0..mm7 into P->QWORD[0..7] */ \
+    unsigned long long *__m64p = &P->QWORD[0]; \
+    __m64p[0] = (unsigned long long)(mm0); /* each value is converted by a plain cast */ \
+    __m64p[1] = (unsigned long long)(mm1); \
+    __m64p[2] = (unsigned long long)(mm2); \
+    __m64p[3] = (unsigned long long)(mm3); \
+    __m64p[4] = (unsigned long long)(mm4); \
+    __m64p[5] = (unsigned long long)(mm5); \
+    __m64p[6] = (unsigned long long)(mm6); \
+    __m64p[7] = (unsigned long long)(mm7); \
+}
+
+#define TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { /* in-place 8x8 byte transpose of rows mm0..mm7 */ \
+    __m64 tm0, tm1, tm2, tm3, tm4, tm5, tm6, tm7; \
+    tm0 = _mm_unpacklo_pi8(mm0, mm2); /* first byte interleave: rows N and N+2 */ \
+    tm1 = _mm_unpackhi_pi8(mm0, mm2); \
+    tm2 = _mm_unpacklo_pi8(mm1, mm3); \
+    tm3 = _mm_unpackhi_pi8(mm1, mm3); \
+    tm4 = _mm_unpacklo_pi8(mm4, mm6); \
+    tm5 = _mm_unpackhi_pi8(mm4, mm6); \
+    tm6 = _mm_unpacklo_pi8(mm5, mm7); \
+    tm7 = _mm_unpackhi_pi8(mm5, mm7); \
+    /* second byte interleave: groups bytes from four source rows */ \
+    mm0 = _mm_unpacklo_pi8(tm0, tm2); \
+    mm1 = _mm_unpackhi_pi8(tm0, tm2); \
+    mm2 = _mm_unpacklo_pi8(tm1, tm3); \
+    mm3 = _mm_unpackhi_pi8(tm1, tm3); \
+    mm4 = _mm_unpacklo_pi8(tm4, tm6); \
+    mm5 = _mm_unpackhi_pi8(tm4, tm6); \
+    mm6 = _mm_unpacklo_pi8(tm5, tm7); \
+    mm7 = _mm_unpackhi_pi8(tm5, tm7); \
+    /* 32-bit interleave; results for mm0..mm3 are staged in tm0..tm3 while mm0..mm3 are still read */ \
+    tm2 = _mm_unpacklo_pi32(mm1, mm5); \
+    tm3 = _mm_unpackhi_pi32(mm1, mm5); \
+    tm0 = _mm_unpacklo_pi32(mm0, mm4); \
+    tm1 = _mm_unpackhi_pi32(mm0, mm4); \
+    mm4 = _mm_unpacklo_pi32(mm2, mm6); \
+    mm5 = _mm_unpackhi_pi32(mm2, mm6); \
+    mm6 = _mm_unpacklo_pi32(mm3, mm7); \
+    mm7 = _mm_unpackhi_pi32(mm3, mm7); \
+    mm0 = tm0; \
+    mm1 = tm1; \
+    mm2 = tm2; \
+    mm3 = tm3; \
+}
+
+#define XTRANSPOSE(x, y, z) { /* z = byte-transposed (x ^ y) */ \
+    __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; \
+    XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); /* mmN = x[N] ^ y[N] */ \
+    TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
+    STORE(z, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); /* z is used as a pointer (z->QWORD) */ \
+}
+#define XLPS32(x, y, data) { /* data->QWORD[i] = XOR over j of Ax[j][b], b = byte 8*i+j of transposed (x ^ y) */ \
+    unsigned int xi; \
+    unsigned char *p; \
+    ALIGN(16) union uint512_u buf; /* scratch copy, filled before data is written, so data may be x or y */ \
+    XTRANSPOSE(x, y, (&buf)); \
+    p = (unsigned char *) &buf; /* walk the transposed block one byte at a time */ \
+    for (xi = 0; xi < 8; xi++) \
+    { \
+	__m64 mm0 =             (__m64)(Ax[0][*(p++)]); \
+	mm0 = _mm_xor_si64(mm0, (__m64)(Ax[1][*(p++)])); \
+	mm0 = _mm_xor_si64(mm0, (__m64)(Ax[2][*(p++)])); \
+	mm0 = _mm_xor_si64(mm0, (__m64)(Ax[3][*(p++)])); \
+	mm0 = _mm_xor_si64(mm0, (__m64)(Ax[4][*(p++)])); \
+	mm0 = _mm_xor_si64(mm0, (__m64)(Ax[5][*(p++)])); \
+	mm0 = _mm_xor_si64(mm0, (__m64)(Ax[6][*(p++)])); \
+	mm0 = _mm_xor_si64(mm0, (__m64)(Ax[7][*(p++)])); \
+        data->QWORD[xi] = (unsigned long long) mm0; /* eight bytes consumed per output word */ \
+    } \
+}
+
+#define ROUND(i, Ki, data) { /* one round: update Ki from C[i], then update data with the new Ki */ \
+    XLPS(Ki, (&C[i]), Ki); /* Ki = XLPS of (Ki ^ C[i]) */ \
+    XLPS(Ki, data, data); /* data = XLPS of (Ki ^ data) */ \
+}