From 29716df957401e9c357348e9b73c95a38cdcd34f Mon Sep 17 00:00:00 2001 From: Anthony Roberts Date: Fri, 2 Aug 2024 16:55:57 +0100 Subject: [PATCH] Add support for clang-cl on Windows (#633) This commit adds support for clang-cl (clang, pretending to be MSVC) to SSE2NEON on Windows ARM64 platforms. This change is part of some Blender work, as using clang-cl provides a ~20-40% speedup compared to MSVC. Compiled with the following command line (via a VS2022 Native ARM64 Tools CMD window): msbuild sse2neon.vcxproj /p:Configuration=Release /p:CLToolExe=clang-cl.exe /p:CLToolPath="C:\Program Files\LLVM\bin\" Known failures in test suite: Test mm_cvttpd_epi32 Test rdtsc Co-authored-by: Anthony Roberts --- sse2neon.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sse2neon.h b/sse2neon.h index 45da43ff..50f56465 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -205,7 +205,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) } /* Compiler barrier */ -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) #define SSE2NEON_BARRIER() _ReadWriteBarrier() #else #define SSE2NEON_BARRIER() \ @@ -888,7 +888,7 @@ FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) __n64 a1 = {a}, b1 = {b}; return vreinterpretq_u64_p128(vmull_p64(a1, b1)); #else @@ -1799,7 +1799,7 @@ FORCE_INLINE void _mm_free(void *addr) FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) { uint64_t value; -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) value = _ReadStatusReg(ARM64_FPCR); #else __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ @@ -1809,7 +1809,7 @@ FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) { -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) _WriteStatusReg(ARM64_FPCR, value); #else __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ @@ -2278,7 +2278,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) FORCE_INLINE void _mm_prefetch(char const *p, int i) { (void) i; -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) switch (i) { case _MM_HINT_NTA: __prefetch2(p, 1); @@ -4921,7 +4921,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause FORCE_INLINE void _mm_pause(void) { -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) __isb(_ARM64_BARRIER_SY); #else __asm__ __volatile__("isb\n"); @@ -5830,7 +5830,7 @@ FORCE_INLINE __m128d _mm_undefined_pd(void) #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128d a; -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) a = _mm_setzero_pd(); #endif return a; @@ -8272,7 +8272,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) FORCE_INLINE int _sse2neon_clz(unsigned int x) { -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) unsigned long cnt = 0; if (_BitScanReverse(&cnt, x)) return 31 - cnt; @@ -8284,7 +8284,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x) FORCE_INLINE int _sse2neon_ctz(unsigned int x) { -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) unsigned long cnt = 0; if (_BitScanForward(&cnt, x)) return cnt; @@ -9200,7 +9200,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) // AESE does ShiftRows and SubBytes on A uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__clang__) uint8x16_t dest = { // Undo ShiftRows step from AESE and extract X1 and X3 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) @@ -9388,7 +9388,7 @@ FORCE_INLINE uint64_t _rdtsc(void) * bits wide and it is attributed with the flag 'cap_user_time_short' * is true. */ -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); #else __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));