From 24b36a35aba2296f075a0fba72a8d9d2b6c97441 Mon Sep 17 00:00:00 2001 From: George Steed Date: Sun, 1 Sep 2024 09:33:44 +0000 Subject: [PATCH 1/2] Wire up CMake to enable compiling files with AArch64 SVE/SVE2 This commit enables compilation for the new `sve/` and `sve2/` subdirectories under `CommonLib/arm/`, assuming that the appropriate `-march=...` flags are available. Introduce a new `set_if_compiler_supports_arm_extensions` function to set up variables based on whether the compiler supports SVE and SVE2, then use these along with the existing requirements (SVE2 requires SVE to be enabled, SVE requires Neon to be enabled) to determine what features ultimately end up being enabled. The implementation of the extension checking helper function needs to check (a) whether the flag is supported, and (b) whether code will successfully compile when using that flag. The latter is needed since there are some old versions of LLVM that are missing the `arm_neon_sve_bridge.h` header, and LLVM currently fails to compile SVE code when targeting Windows. --- CMakeLists.txt | 29 ++++++----- cmake/modules/vvencCompilerSupport.cmake | 66 ++++++++++++++++++++++++ source/Lib/vvenc/CMakeLists.txt | 33 +++++++++++- 3 files changed, 113 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 62eba9b0d..6b72eab64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,9 +32,12 @@ if( VVENC_TARGET_ARCH MATCHES "ARM\|AARCH64" ) set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-lax-vector-conversions" ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-lax-vector-conversions" ) endif() + + # Check if SVE/SVE2 flags are supported by the compiler. + set_if_compiler_supports_arm_extensions( FLAG_sve FLAG_sve2 ) endif() -# we enable x86 intrinsics for all target architectures, because they are implemented through simd-everywhere on non-x86 +# We enable x86 intrinsics for all target architectures, because they are implemented through SIMD-everywhere on non-x86. set( VVENC_ENABLE_X86_SIMD TRUE CACHE BOOL "Enable x86 intrinsics" ) set( VVENC_ENABLE_ARM_SIMD ${VVENC_ARM_SIMD_DEFAULT} CACHE BOOL "Enable Arm Neon intrinsics" ) set( VVENC_ENABLE_ARM_SIMD_SVE FALSE CACHE BOOL "Enable Arm SVE intrinsics" ) @@ -81,27 +84,25 @@ if( VVENC_TARGET_ARCH STREQUAL "AARCH64" ) if( VVENC_ENABLE_ARM_SIMD ) message( STATUS "AArch64 Neon intrinsics enabled" ) add_compile_definitions( TARGET_SIMD_ARM=1 ) - else() - message( STATUS "AArch64 Neon intrinsics disabled, disabling AArch64 SVE/SVE2 intrinsics" ) - # If Neon is disabled make sure that SVE/SVE2 are also disabled. - set( VVENC_ENABLE_ARM_SIMD_SVE FALSE ) - set( VVENC_ENABLE_ARM_SIMD_SVE2 FALSE ) endif() - if( VVENC_ENABLE_ARM_SIMD_SVE ) + # If Neon is disabled or SVE is not supported, make sure that SVE is disabled. + if ( NOT VVENC_ENABLE_ARM_SIMD OR NOT FLAG_sve ) + message( STATUS "Disabling AArch64 SVE/SVE2 intrinsics" ) + set( VVENC_ENABLE_ARM_SIMD_SVE FALSE ) + elseif( VVENC_ENABLE_ARM_SIMD_SVE ) message( STATUS "AArch64 SVE intrinsics enabled" ) add_compile_definitions( TARGET_SIMD_ARM_SVE=1 ) - else() - message( STATUS "AArch64 SVE intrinsics disabled, disabling AArch64 SVE2 intrinsics" ) - # If SVE is disabled make sure that SVE2 are also disabled. - set( VVENC_ENABLE_ARM_SIMD_SVE2 FALSE ) endif() - if( VVENC_ENABLE_ARM_SIMD_SVE2 ) + # If SVE is disabled or SVE2 is not supported, make sure that SVE2 is disabled. 
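+  # FLAG_sve2 is set by set_if_compiler_supports_arm_extensions() above only
+  # when the SVE2 compile checks pass, so an empty/unset value means the
+  # extension cannot be used.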
+  if ( NOT VVENC_ENABLE_ARM_SIMD_SVE OR NOT FLAG_sve2 )
+    message( STATUS "Disabling AArch64 SVE2 intrinsics" )
+    # If SVE is disabled make sure that SVE2 is also disabled.
+    set( VVENC_ENABLE_ARM_SIMD_SVE2 FALSE )
+  elseif( VVENC_ENABLE_ARM_SIMD_SVE2 )
     message( STATUS "AArch64 SVE2 intrinsics enabled" )
     add_compile_definitions( TARGET_SIMD_ARM_SVE2=1 )
-  else()
-    message( STATUS "AArch64 SVE2 intrinsics disabled" )
   endif()
 endif()

diff --git a/cmake/modules/vvencCompilerSupport.cmake b/cmake/modules/vvencCompilerSupport.cmake
index 5463385a3..141a7d997 100644
--- a/cmake/modules/vvencCompilerSupport.cmake
+++ b/cmake/modules/vvencCompilerSupport.cmake
@@ -1,5 +1,6 @@
 include( CheckCCompilerFlag )
 include( CheckCSourceCompiles )
+include( CheckCXXSourceCompiles )
 
 function( set_if_compiler_supports_flag output_var flag )
   string( REGEX REPLACE "[-.]" "_" SUPPORTED_flag_var "SUPPORTED${flag}" )
@@ -84,6 +85,71 @@ function( _emscripten_enable_wasm_simd128 )
   endif()
 endfunction()
 
+function( _set_if_compiler_supports_sve_flag output_var sve_flag )
+  set_if_compiler_supports_flag( tmp_var "${sve_flag}" )
+  if( NOT tmp_var )
+    return()
+  endif()
+
+  set( OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} )
+  set( OLD_CMAKE_TRY_COMPILE_TARGET_TYPE ${CMAKE_TRY_COMPILE_TARGET_TYPE} )
+  set( CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${sve_flag}" )
+  set( CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY )
+
+  # Check whether the compiler can compile SVE functions that require
+  # backup/restore of SVE registers according to the AAPCS.
+  # https://github.com/llvm/llvm-project/issues/80009.
+  set( SVE_COMPILATION_TEST "
+#include <arm_sve.h>
+void other();
+svfloat32_t func(svfloat32_t a) {
+other();
+return a;
+}
+int main() { return 0; }" )
+
+  check_c_source_compiles( "${SVE_COMPILATION_TEST}" SVE_COMPILATION_C_TEST_COMPILED )
+  check_cxx_source_compiles( "${SVE_COMPILATION_TEST}" SVE_COMPILATION_CXX_TEST_COMPILED )
+
+  # Check if arm_neon_sve_bridge.h is available.
+  set( SVE_HEADER_TEST "
+#ifndef __ARM_NEON_SVE_BRIDGE
+#error 1
+#endif
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+int main() { return 0; }")
+  check_c_source_compiles( "${SVE_HEADER_TEST}" SVE_HEADER_C_TEST_COMPILED )
+  check_cxx_source_compiles( "${SVE_HEADER_TEST}" SVE_HEADER_CXX_TEST_COMPILED )
+
+  set( CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS} )
+  set( CMAKE_TRY_COMPILE_TARGET_TYPE ${OLD_CMAKE_TRY_COMPILE_TARGET_TYPE} )
+
+  if( SVE_COMPILATION_C_TEST_COMPILED AND SVE_COMPILATION_CXX_TEST_COMPILED AND
+      SVE_HEADER_C_TEST_COMPILED AND SVE_HEADER_CXX_TEST_COMPILED )
+    set( ${output_var} "${tmp_var}" PARENT_SCOPE )
+  endif()
+endfunction()
+
+# Check if the compiler supports the AArch64 SVE and SVE2 extensions, and set
+# variables for flags used to enable them to avoid duplication.
+function( set_if_compiler_supports_arm_extensions output_flag_sve output_flag_sve2 )
+  if( NOT(( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64" ) OR
+          ( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" )))
+    return()
+  endif()
+  if( UNIX OR MINGW )
+    # SVE is an optional feature from Armv8.2-A.
+    set( _flag_sve "-march=armv8.2-a+sve" )
+    _set_if_compiler_supports_sve_flag( _sve_supported "${_flag_sve}" )
+    set( ${output_flag_sve} "${_sve_supported}" PARENT_SCOPE )
+
+    # SVE2 is an optional feature from Armv9.0-A.
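+    # The SVE2 flag goes through the same _set_if_compiler_supports_sve_flag
+    # checks as SVE above, so it also requires the AAPCS and
+    # arm_neon_sve_bridge.h test programs to compile.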
+ set( _flag_sve2 "-march=armv9-a+sve2" ) + _set_if_compiler_supports_sve_flag( _sve2_supported "${_flag_sve2}" ) + set( ${output_flag_sve2} "${_sve2_supported}" PARENT_SCOPE ) + endif() +endfunction() function( check_problematic_compiler output_var compiler_id first_bad_version first_fixed_version ) if( CMAKE_CXX_COMPILER_ID STREQUAL "${compiler_id}" diff --git a/source/Lib/vvenc/CMakeLists.txt b/source/Lib/vvenc/CMakeLists.txt index 2d1986f19..9fc3cf3c2 100644 --- a/source/Lib/vvenc/CMakeLists.txt +++ b/source/Lib/vvenc/CMakeLists.txt @@ -49,6 +49,8 @@ if( VVENC_ENABLE_ARM_SIMD ) file( GLOB ARM_INC_FILES CONFIGURE_DEPENDS "../CommonLib/arm/*.h" ) file( GLOB ARM_NEON_SRC_FILES CONFIGURE_DEPENDS "../CommonLib/arm/neon/*.cpp" ) + file( GLOB ARM_SVE_SRC_FILES CONFIGURE_DEPENDS "../CommonLib/arm/sve/*.cpp" ) + file( GLOB ARM_SVE2_SRC_FILES CONFIGURE_DEPENDS "../CommonLib/arm/sve2/*.cpp" ) endif() # get public/extern include files @@ -133,7 +135,36 @@ endif() if( VVENC_ENABLE_ARM_SIMD ) # Set needed compile definitions. set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_NEON ) - add_library( ${LIB_NAME}_arm_simd OBJECT ${ARM_NEON_SRC_FILES} ) + + if(( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64" ) OR ( ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" )) + # Neon is mandatory in AArch64, so no additional compile flags needed here. + else() + set_if_compiler_supports_flag( FLAG_mfpu_neon "-mfpu=neon" ) + set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_mfpu_neon}" ) + endif() + list(APPEND ARM_SRC_FILES ${ARM_NEON_SRC_FILES}) + + if( VVENC_ENABLE_ARM_SIMD_SVE ) + set_property( SOURCE ${ARM_SVE_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_NEON USE_SVE ) + set_property( SOURCE ${ARM_SVE_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS ${FLAG_sve} ) + list(APPEND ARM_SRC_FILES ${ARM_SVE_SRC_FILES}) + endif() + + if( VVENC_ENABLE_ARM_SIMD_SVE2 ) + set_property( SOURCE ${ARM_SVE2_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_NEON USE_SVE USE_SVE2 ) + set_property( SOURCE ${ARM_SVE2_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS ${FLAG_sve2} ) + list(APPEND ARM_SRC_FILES ${ARM_SVE2_SRC_FILES}) + endif() + + add_library( ${LIB_NAME}_arm_simd OBJECT ${ARM_SRC_FILES} ) + + # Disable LTO for the files compiled with special architecture flags. + set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES + INTERPROCEDURAL_OPTIMIZATION OFF + INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF + INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF + INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF ) + set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES FOLDER lib ) endif() From 4c8bfef9bf377ecc8a8013eb9702967cba502cf1 Mon Sep 17 00:00:00 2001 From: George Steed Date: Thu, 5 Sep 2024 20:18:28 +0000 Subject: [PATCH 2/2] Add AArch64 SVE implementation for TCoeffOps fastFwdCore_2D The SVE 16-bit dot-product instructions allow us to accumulate twice as much data per instruction compared to Neon multiply-add instructions, giving a good speedup for the fastFwdCore_2D kernels. Compared to Neon with a fixed vector length of 128 bits, SVE allows different micro-architectures to expose a number of different vector lengths: 128, 256, 512, 1024, or 2048 bits. To take advantage of this we can rewrite the innermost loop of fastFwdCore_2D to be expressed in terms of the number of vectors to process rather than the number of elements, and then pick the number of iterations at setup-time by inspecting the vector length. 
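As a rough sketch of the idea (illustrative only; the helper name dot_row and
its exact shape are not the real kernel), the inner loop counts whole vectors
and the trip count comes from the runtime vector length:

    // numVecs is fixed at setup time, e.g. numVecs = trSize / svcnth().
    static int64_t dot_row( const int16_t* a, const int16_t* b, int numVecs )
    {
      svint64_t acc = svdup_n_s64( 0 );
      for( int v = 0; v < numVecs; ++v )
      {
        svint16_t va = svld1_s16( svptrue_b16(), a + v * svcnth() );
        svint16_t vb = svld1_s16( svptrue_b16(), b + v * svcnth() );
        acc = svdot_s64( acc, va, vb ); // widening 16-bit dot product into 64-bit lanes
      }
      return svaddv_s64( svptrue_b64(), acc );
    }

At VL=128 bits (svcnth() == 8) a 32-element row needs four iterations of this
loop, while at VL=256 bits it needs only two.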
This allows us to largely avoid needing an entire set of kernels for each possible vector length. One caveat to the notion of having completely vector-length agnostic kernels is that when the vector-length is known to be exactly 128-bits (the same as Neon) we can make use of some Neon instructions to speed up processing the data after the accumulation. This is possible since Neon and SVE registers share the low 128-bits of each vector register. For this commit we have not attempted to add kernels that process less than a full vector's worth of data per inner loop iteration, which would enable using these kernels on machines with very wide vectors (512, 1024, or 2048 bits). This is technically straightforward since SVE supports partial vectors via predication, however there are no known long-vector micro-architectures available at present to justify maintaining such code. Running a video encoding job on SVE-capable machines using the --preset=fast setting shows the following improvements in reported FPS: Neoverse V1 (VL=256 bits): ~1.3% Neoverse V2 (VL=128 bits): ~2.6% --- source/Lib/CommonLib/arm/InitARM.cpp | 12 +- source/Lib/CommonLib/arm/sve/Trafo_sve.cpp | 251 +++++++++++++++++++++ 2 files changed, 259 insertions(+), 4 deletions(-) create mode 100644 source/Lib/CommonLib/arm/sve/Trafo_sve.cpp diff --git a/source/Lib/CommonLib/arm/InitARM.cpp b/source/Lib/CommonLib/arm/InitARM.cpp index 9d664c02d..5c7da279e 100644 --- a/source/Lib/CommonLib/arm/InitARM.cpp +++ b/source/Lib/CommonLib/arm/InitARM.cpp @@ -118,8 +118,14 @@ void TCoeffOps::initTCoeffOpsARM() { _initTCoeffOpsARM(); } +#if TARGET_SIMD_ARM_SVE + if( vext >= SVE ) + { + _initTCoeffOpsARM(); + } +#endif // TARGET_SIMD_ARM_SVE } -#endif +#endif // ENABLE_SIMD_TRAFO #if ENABLE_SIMD_OPT_BDOF void InterPredInterpolation::initInterPredictionARM() @@ -135,8 +141,6 @@ void InterPredInterpolation::initInterPredictionARM() } #endif - - -#endif // TARGET_SIMD_ARM +#endif // TARGET_SIMD_ARM } // namespace diff --git a/source/Lib/CommonLib/arm/sve/Trafo_sve.cpp b/source/Lib/CommonLib/arm/sve/Trafo_sve.cpp new file mode 100644 index 000000000..474827b57 --- /dev/null +++ b/source/Lib/CommonLib/arm/sve/Trafo_sve.cpp @@ -0,0 +1,251 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. 
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------------------------------------------------------------------------------- */
+/**
+ * \file Trafo_sve.cpp
+ * \brief SVE implementation of TCoeffOps for AArch64.
+ */
+
+// ====================================================================================================================
+// Includes
+// ====================================================================================================================
+#include "CommonDefARM.h"
+#include "CommonLib/CommonDef.h"
+
+#include "TrQuant.h"
+#include "TrQuant_EMT.h"
+
+//! \ingroup CommonLib
+//! \{
+
+#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_TRAFO
+
+#include <arm_neon_sve_bridge.h>
+#include <arm_sve.h>
+
+namespace vvenc
+{
+
+static svint16_t load_narrow_to_s16( const int32_t* src )
+{
+  svint32_t lo = svld1_vnum_s32( svptrue_b32(), src, 0 );
+  svint32_t hi = svld1_vnum_s32( svptrue_b32(), src, 1 );
+  return svuzp1_s16( svreinterpret_s16_s32( lo ), svreinterpret_s16_s32( hi ) );
+}
+
+static int64_t shift_and_round( int64_t x, int shift )
+{
+  return ( x + ( 1 << ( shift - 1 ) ) ) >> shift;
+}
+
+template<int vectorLength>
+static inline void fastFwdCore_reduce_x4_sve( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
+                                              int shift );
+
+template<>
+inline void fastFwdCore_reduce_x4_sve<128>( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
+                                            int shift )
+{
+  // For a 128-bit vector length we do not need to reduce the sum down; use
+  // svget_neonq to operate on the Neon vectors directly so we can use pairwise
+  // additions to incrementally sum each vector.
+  int64x2_t v01 = vpaddq_s64( svget_neonq_s64( v0 ), svget_neonq_s64( v1 ) );
+  int64x2_t v23 = vpaddq_s64( svget_neonq_s64( v2 ), svget_neonq_s64( v3 ) );
+  int32x4_t v0123 = vuzp1q_s32( vreinterpretq_s32_s64( v01 ), vreinterpretq_s32_s64( v23 ) );
+  v0123 = vrshlq_s32( v0123, vdupq_n_s32( -shift ) );
+  vst1q_s32( dst, v0123 );
+}
+
+template<>
+inline void fastFwdCore_reduce_x4_sve<256>( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
+                                            int shift )
+{
+  // Halve the data width such that we only utilise the low half (128 bits) of each vector.
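+  // svuzp1 of a vector with itself keeps the even-numbered 32-bit lanes, that
+  // is the low 32 bits of each 64-bit sum, packed into the low 128 bits of the
+  // result.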
+  svint32_t v0_s32 = svuzp1_s32( svreinterpret_s32_s64( v0 ), svreinterpret_s32_s64( v0 ) );
+  svint32_t v1_s32 = svuzp1_s32( svreinterpret_s32_s64( v1 ), svreinterpret_s32_s64( v1 ) );
+  svint32_t v2_s32 = svuzp1_s32( svreinterpret_s32_s64( v2 ), svreinterpret_s32_s64( v2 ) );
+  svint32_t v3_s32 = svuzp1_s32( svreinterpret_s32_s64( v3 ), svreinterpret_s32_s64( v3 ) );
+
+  // Now that we have data in the low 128 bits of each vector, use svget_neonq
+  // to operate on the Neon vectors directly and use pairwise additions to
+  // incrementally sum each vector.
+  int32x4_t v01 = vpaddq_s32( svget_neonq_s32( v0_s32 ), svget_neonq_s32( v1_s32 ) );
+  int32x4_t v23 = vpaddq_s32( svget_neonq_s32( v2_s32 ), svget_neonq_s32( v3_s32 ) );
+  int32x4_t v0123 = vpaddq_s32( v01, v23 );
+  v0123 = vrshlq_s32( v0123, vdupq_n_s32( -shift ) );
+  vst1q_s32( dst, v0123 );
+}
+
+template<int vectorLength, int trVecs>
+static void fastFwdCore_nVec_sve( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line,
+                                  unsigned reducedLine, unsigned cutoff, int shift )
+{
+  CHECK( cutoff % 4 != 0, "Cutoff should be a multiple of four" );
+  CHECK( cutoff == 0, "Cutoff should be non-zero" );
+  CHECK( shift == 0, "Shift must be at least one" );
+
+  unsigned trSize = trVecs * svcnth();
+  unsigned i = 0;
+  for( ; i < ( reducedLine & ~3U ); i += 4 )
+  {
+    for( int j = 0; j < cutoff; j += 4 )
+    {
+      const TMatrixCoeff* tcj = tc + j * trSize;
+      const TCoeff* srci = src + i * trSize;
+
+      svint64_t sum00 = svdup_n_s64( 0 );
+      svint64_t sum01 = svdup_n_s64( 0 );
+      svint64_t sum02 = svdup_n_s64( 0 );
+      svint64_t sum03 = svdup_n_s64( 0 );
+      svint64_t sum10 = svdup_n_s64( 0 );
+      svint64_t sum11 = svdup_n_s64( 0 );
+      svint64_t sum12 = svdup_n_s64( 0 );
+      svint64_t sum13 = svdup_n_s64( 0 );
+      svint64_t sum20 = svdup_n_s64( 0 );
+      svint64_t sum21 = svdup_n_s64( 0 );
+      svint64_t sum22 = svdup_n_s64( 0 );
+      svint64_t sum23 = svdup_n_s64( 0 );
+      svint64_t sum30 = svdup_n_s64( 0 );
+      svint64_t sum31 = svdup_n_s64( 0 );
+      svint64_t sum32 = svdup_n_s64( 0 );
+      svint64_t sum33 = svdup_n_s64( 0 );
+      for( int k = 0; k < trVecs; ++k )
+      {
+        svint16_t s0 = load_narrow_to_s16( srci + 0 * trSize );
+        svint16_t s1 = load_narrow_to_s16( srci + 1 * trSize );
+        svint16_t s2 = load_narrow_to_s16( srci + 2 * trSize );
+        svint16_t s3 = load_narrow_to_s16( srci + 3 * trSize );
+        svint16_t c0 = svld1_s16( svptrue_b16(), tcj + 0 * trSize );
+        svint16_t c1 = svld1_s16( svptrue_b16(), tcj + 1 * trSize );
+        svint16_t c2 = svld1_s16( svptrue_b16(), tcj + 2 * trSize );
+        svint16_t c3 = svld1_s16( svptrue_b16(), tcj + 3 * trSize );
+        sum00 = svdot_s64( sum00, s0, c0 );
+        sum01 = svdot_s64( sum01, s0, c1 );
+        sum02 = svdot_s64( sum02, s0, c2 );
+        sum03 = svdot_s64( sum03, s0, c3 );
+        sum10 = svdot_s64( sum10, s1, c0 );
+        sum11 = svdot_s64( sum11, s1, c1 );
+        sum12 = svdot_s64( sum12, s1, c2 );
+        sum13 = svdot_s64( sum13, s1, c3 );
+        sum20 = svdot_s64( sum20, s2, c0 );
+        sum21 = svdot_s64( sum21, s2, c1 );
+        sum22 = svdot_s64( sum22, s2, c2 );
+        sum23 = svdot_s64( sum23, s2, c3 );
+        sum30 = svdot_s64( sum30, s3, c0 );
+        sum31 = svdot_s64( sum31, s3, c1 );
+        sum32 = svdot_s64( sum32, s3, c2 );
+        sum33 = svdot_s64( sum33, s3, c3 );
+
+        srci += svcnth();
+        tcj += svcnth();
+      }
+      TCoeff* dstij = dst + j * line + i;
+      fastFwdCore_reduce_x4_sve<vectorLength>( dstij + 0 * line, sum00, sum10, sum20, sum30, shift );
+      fastFwdCore_reduce_x4_sve<vectorLength>( dstij + 1 * line, sum01, sum11, sum21, sum31, shift );
+      fastFwdCore_reduce_x4_sve<vectorLength>( dstij + 2 * line, sum02, sum12, sum22, sum32, shift );
+      fastFwdCore_reduce_x4_sve<vectorLength>( dstij + 3 * line, sum03, sum13, sum23, sum33, shift );
+    }
+  }
+  for( ; i < reducedLine; ++i )
+  {
+    for( int j = 0; j < cutoff; j += 4 )
+    {
+      const TMatrixCoeff* tcj = tc + j * trSize;
+      const TCoeff* srci = src + i * trSize;
+
+      svint64_t sum00 = svdup_n_s64( 0 );
+      svint64_t sum01 = svdup_n_s64( 0 );
+      svint64_t sum02 = svdup_n_s64( 0 );
+      svint64_t sum03 = svdup_n_s64( 0 );
+      for( int k = 0; k < trVecs; ++k )
+      {
+        svint16_t s0 = load_narrow_to_s16( srci + 0 * trSize );
+        svint16_t c0 = svld1_s16( svptrue_b16(), tcj + 0 * trSize );
+        svint16_t c1 = svld1_s16( svptrue_b16(), tcj + 1 * trSize );
+        svint16_t c2 = svld1_s16( svptrue_b16(), tcj + 2 * trSize );
+        svint16_t c3 = svld1_s16( svptrue_b16(), tcj + 3 * trSize );
+        sum00 = svdot_s64( sum00, s0, c0 );
+        sum01 = svdot_s64( sum01, s0, c1 );
+        sum02 = svdot_s64( sum02, s0, c2 );
+        sum03 = svdot_s64( sum03, s0, c3 );
+
+        srci += svcnth();
+        tcj += svcnth();
+      }
+      TCoeff* dstij = dst + j * line + i;
+      dstij[ 0 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum00 ), shift );
+      dstij[ 1 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum01 ), shift );
+      dstij[ 2 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum02 ), shift );
+      dstij[ 3 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum03 ), shift );
+    }
+  }
+}
+
+template<>
+void TCoeffOps::_initTCoeffOpsARM<SVE>()
+{
+  // Wire up kernels based on how many vector iterations we need in the inner
+  // loop. Use Neon if we don't have at least one vector of work to do. Arm
+  // Neoverse micro-architectures only currently exist with vector lengths of
+  // 128 and 256 bits, so don't bother specialising for other vector lengths.
+  switch( svcnth() )
+  {
+  case 8: // SVE VL = 128 bits
+    fastFwdCore_2D[ 1 ] = fastFwdCore_nVec_sve<128, 1>;
+    fastFwdCore_2D[ 2 ] = fastFwdCore_nVec_sve<128, 2>;
+    fastFwdCore_2D[ 3 ] = fastFwdCore_nVec_sve<128, 4>;
+    fastFwdCore_2D[ 4 ] = fastFwdCore_nVec_sve<128, 8>;
+    break;
+  case 16: // SVE VL = 256 bits
+    fastFwdCore_2D[ 2 ] = fastFwdCore_nVec_sve<256, 1>;
+    fastFwdCore_2D[ 3 ] = fastFwdCore_nVec_sve<256, 2>;
+    fastFwdCore_2D[ 4 ] = fastFwdCore_nVec_sve<256, 4>;
+    break;
+  default:
+    // Don't use SVE for other vector lengths, fall back to Neon.
+    break;
+  }
+}
+
+} // namespace vvenc
+
+#endif
+//! \}