diff --git a/AUTHORS.md b/AUTHORS.md index ea8c981e1..e9b601d1a 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -12,3 +12,4 @@ * Christian Stoffers, , Fraunhofer HHI * Gabriel Hege, , Fraunhofer HHI * Jens Güther, , Fraunhofer HHI +* Florian Eisenreich, , Fraunhofer HHI diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fb95898b..5565dcb4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,8 +21,18 @@ endif() set( CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules" ) message( STATUS "CMAKE_MODULE_PATH: updating module path to: ${CMAKE_MODULE_PATH}" ) +# check for arm architecture support +set( VVENC_ARM_SIMD_DEFAULT FALSE ) +if( ( "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64\|arm" + OR "${CMAKE_CXX_COMPILER}" MATCHES "aarch64\|arm" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64\|armv" ) + AND NOT "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86\|x64" ) + set( VVENC_ARM_SIMD_DEFAULT TRUE ) +endif() + # we enable x86 intrinsics for all target architectures, because they are implemented through simd-everywhere on non-x86 set( VVENC_ENABLE_X86_SIMD TRUE CACHE BOOL "enable x86 intrinsics" ) +set( VVENC_ENABLE_ARM_SIMD ${VVENC_ARM_SIMD_DEFAULT} CACHE BOOL "enable ARM intrinsics" ) include( vvencCompilerSupport ) @@ -39,8 +49,13 @@ if( VVENC_ENABLE_X86_SIMD ) check_missing_intrinsics() endif() - set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_SIMD_X86" ) - set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTARGET_SIMD_X86" ) + message( STATUS "x86 SIMD intrinsics enabled (using SIMDE for non-x86 targets)" ) + add_compile_definitions( TARGET_SIMD_X86 ) +endif() + +if( VVENC_ENABLE_ARM_SIMD ) + message( STATUS "ARM SIMD intrinsics enabled" ) + add_compile_definitions( TARGET_SIMD_ARM ) endif() if( NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR ) diff --git a/include/vvenc/vvencCfg.h b/include/vvenc/vvencCfg.h index e379b2c78..83e1c556d 100644 --- a/include/vvenc/vvencCfg.h +++ b/include/vvenc/vvencCfg.h @@ -769,7 +769,11 @@ typedef struct vvenc_config int8_t m_sliceTypeAdapt; // enable slice type adaptation (STA) bool m_treatAsSubPic; - int m_RCMaxBitrate; // maximum bitrate in bps (default: 0 (RC disabled or least constrained VBR)) +#define VVENC_SET_MAXRATE_FACTOR(f) (-((int)(f*16+0.5))) + int m_RCMaxBitrate; // maximum bitrate in bps (default: 0 (RC disabled or least constrained VBR), + // if negative, the absolute value is interpreted as a 4-bit fixed point multiplier of the target bitrate). + // -24, i.e. -1.1000 binary, means the maxrate would be set to be the 1.5x of the target bitrate. + // for convenience use VVENC_SET_MAXRATE_FACTOR, e.g. VVENC_SET_MAXRATE_FACTOR(1.5), to set the multiplier int m_reservedInt; double m_reservedDouble[9]; diff --git a/source/App/vvencFFapp/EncApp.cpp b/source/App/vvencFFapp/EncApp.cpp index a6a449d45..88b558bcb 100644 --- a/source/App/vvencFFapp/EncApp.cpp +++ b/source/App/vvencFFapp/EncApp.cpp @@ -307,7 +307,7 @@ int EncApp::encode() int64_t frameCount = apputils::VVEncAppCfg::getFrameCount( appCfg.m_inputFileName, vvencCfg.m_SourceWidth, vvencCfg.m_SourceHeight, vvencCfg.m_inputBitDepth[0], appCfg.m_packedYUVInput ); frameCount = std::max( 0, frameCount-appCfg.m_FrameSkip ); int64_t framesToEncode = (vvencCfg.m_framesToBeEncoded == 0 || vvencCfg.m_framesToBeEncoded >= frameCount) ? 
frameCount : vvencCfg.m_framesToBeEncoded; - cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " ); + cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, vvencCfg.m_verbosity, "vvenc [info]: " ); bool statsInfoReady = false; // loop over input YUV data @@ -370,6 +370,7 @@ int EncApp::encode() if( statsInfoReady ) { msgApp( VVENC_INFO, cStats.getInfoString().c_str() ); + fflush( stdout ); } } @@ -383,6 +384,7 @@ int EncApp::encode() if( appCfg.m_printStats ) { msgApp( VVENC_INFO, cStats.getFinalStats().c_str() ); + fflush( stdout ); } } diff --git a/source/App/vvencapp/vvencapp.cpp b/source/App/vvencapp/vvencapp.cpp index 19612ab1a..39a644b84 100644 --- a/source/App/vvencapp/vvencapp.cpp +++ b/source/App/vvencapp/vvencapp.cpp @@ -357,7 +357,7 @@ int main( int argc, char* argv[] ) int64_t framesToEncode = (vvenccfg.m_framesToBeEncoded == 0 || vvenccfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvenccfg.m_framesToBeEncoded; apputils::Stats cStats; - cStats.init( vvenccfg.m_FrameRate, vvenccfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " ); + cStats.init( vvenccfg.m_FrameRate, vvenccfg.m_FrameScale, (int)framesToEncode, vvenccfg.m_verbosity, "vvenc [info]: " ); bool statsInfoReady = false; while( !bEof || !bEncodeDone ) @@ -404,6 +404,7 @@ int main( int argc, char* argv[] ) if( statsInfoReady ) { msgApp( nullptr, VVENC_INFO, cStats.getInfoString().c_str() ); + fflush( stdout ); } } @@ -426,6 +427,7 @@ int main( int argc, char* argv[] ) if( vvencappCfg.m_printStats ) { msgApp( nullptr, VVENC_INFO, cStats.getFinalStats().c_str() ); + fflush( stdout ); } } diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.h b/source/Lib/CommonLib/AdaptiveLoopFilter.h index 1aa271491..ad3e8d2b5 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.h +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.h @@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + struct AlfClassifier { AlfClassifier() {} diff --git a/source/Lib/CommonLib/AffineGradientSearch.h b/source/Lib/CommonLib/AffineGradientSearch.h index f06995157..ca1a07d6c 100644 --- a/source/Lib/CommonLib/AffineGradientSearch.h +++ b/source/Lib/CommonLib/AffineGradientSearch.h @@ -53,6 +53,8 @@ namespace vvenc { //! \ingroup CommonLib //! 
\{ +using namespace x86_simd; + class AffineGradientSearch { public: diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h index f86cc846a..adf927e40 100644 --- a/source/Lib/CommonLib/Buffer.h +++ b/source/Lib/CommonLib/Buffer.h @@ -66,6 +66,9 @@ struct vvencYUVBuffer; namespace vvenc { +using namespace x86_simd; +using namespace arm_simd; + // --------------------------------------------------------------------------- // AreaBuf struct // --------------------------------------------------------------------------- @@ -81,6 +84,22 @@ struct PelBufferOps template void _initPelBufOpsX86(); #endif + +#if ENABLE_SIMD_OPT_BUFFER && defined( TARGET_SIMD_ARM ) + void initPelBufOpsARM(); + template + void _initPelBufOpsARM(); +#endif + +#define INCX( ptr, stride ) { ptr++; } +#define INCY( ptr, stride ) { ptr += ( stride ); } +#define OFFSETX( ptr, stride, x ) { ptr += ( x ); } +#define OFFSETY( ptr, stride, y ) { ptr += ( y ) * ( stride ); } +#define OFFSET( ptr, stride, x, y ) { ptr += ( x ) + ( y ) * ( stride ); } +#define GET_OFFSETX( ptr, stride, x ) ( ( ptr ) + ( x ) ) +#define GET_OFFSETY( ptr, stride, y ) ( ( ptr ) + ( y ) * ( stride ) ) +#define GET_OFFSET( ptr, stride, x, y ) ( ( ptr ) + ( x ) + ( y ) * ( stride ) ) // need in loopFilter.cpp + some ARM files + void ( *roundGeo ) ( const Pel* src, Pel* dest, const int numSamples, unsigned rshift, int offset, const ClpRng &clpRng); void ( *addAvg ) ( const Pel* src0, const Pel* src1, Pel* dst, int numsamples, unsigned shift, int offset, const ClpRng& clpRng ); void ( *reco ) ( const Pel* src0, const Pel* src1, Pel* dst, int numSamples, const ClpRng& clpRng ); diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 178c9b139..e4b7e0329 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -636,22 +636,6 @@ static inline T* aligned_malloc(size_t len, size_t alignement) { # define ALWAYS_INLINE #endif -#ifdef TARGET_SIMD_X86 -typedef enum -{ - UNDEFINED = -1, - SCALAR = 0, - SSE41, - SSE42, - AVX, - AVX2, - AVX512 -} X86_VEXT; -#endif - -template inline ValueType leftShiftU (const ValueType value, const unsigned shift) { return value << shift; } -template inline ValueType rightShiftU (const ValueType value, const unsigned shift) { return value >> shift; } - #if defined( _WIN32 ) && defined( TARGET_SIMD_X86 ) static inline unsigned int bit_scan_reverse( int a ) { @@ -672,6 +656,59 @@ static inline unsigned int bit_scan_reverse( int a ) } #endif +#if ENABLE_SIMD_LOG2 +static inline int getLog2( int val ) +{ + return bit_scan_reverse( val ); +} +#else +extern int8_t g_aucLog2[MAX_CU_SIZE + 1]; +static inline int getLog2( int val ) +{ + CHECKD( g_aucLog2[2] != 1, "g_aucLog2[] has not been initialized yet." 
); + if( val > 0 && val < (int) sizeof( g_aucLog2 ) ) + { + return g_aucLog2[val]; + } + return std::log2( val ); +} +#endif + +#if ENABLE_SIMD_OPT + +namespace x86_simd +{ +#ifdef TARGET_SIMD_X86 + typedef enum + { + UNDEFINED = -1, + SCALAR = 0, + SSE41, + SSE42, + AVX, + AVX2, + AVX512 + } X86_VEXT; +#endif +} + +namespace arm_simd +{ +#ifdef TARGET_SIMD_ARM + typedef enum + { + UNDEFINED = -1, + SCALAR = 0, + NEON, + } ARM_VEXT; +#endif // TARGET_SIMD_ARM +} // namespace arm_simd + +#endif //ENABLE_SIMD_OPT + +template inline ValueType leftShiftU (const ValueType value, const unsigned shift) { return value << shift; } +template inline ValueType rightShiftU (const ValueType value, const unsigned shift) { return value >> shift; } + #if ENABLE_SIMD_LOG2 && defined( TARGET_SIMD_X86 ) static inline int floorLog2( int val ) { diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp index 8f883a8ec..67ecb3855 100644 --- a/source/Lib/CommonLib/DepQuant.cpp +++ b/source/Lib/CommonLib/DepQuant.cpp @@ -1518,7 +1518,7 @@ namespace DQIntern #if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 ) // if more than one 4x4 coding subblock is available, use SIMD to find first subblock with coefficient larger than threshold - if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > SCALAR ) + if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > x86_simd::SCALAR ) { const int sbbSize = tuPars.m_sbbSize; // move the pointer to the beginning of the current subblock diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h index 7d02c1949..7660fc018 100644 --- a/source/Lib/CommonLib/InterPrediction.h +++ b/source/Lib/CommonLib/InterPrediction.h @@ -56,6 +56,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + // forward declaration class Mv; diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp index 20b522582..112090b75 100644 --- a/source/Lib/CommonLib/InterpolationFilter.cpp +++ b/source/Lib/CommonLib/InterpolationFilter.cpp @@ -1071,12 +1071,16 @@ void InterpolationFilter::xWeightedGeoBlk(const ClpRngs &clpRngs, const CodingUn void InterpolationFilter::initInterpolationFilter( bool enable ) { #if ENABLE_SIMD_OPT_MCIF -#ifdef TARGET_SIMD_X86 if ( enable ) { +#ifdef TARGET_SIMD_X86 initInterpolationFilterX86(); - } #endif + +#ifdef TARGET_SIMD_ARM + initInterpolationFilterARM(); +#endif + } #endif } diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h index 6e3c83ee4..7fd05e03d 100644 --- a/source/Lib/CommonLib/InterpolationFilter.h +++ b/source/Lib/CommonLib/InterpolationFilter.h @@ -55,6 +55,9 @@ POSSIBILITY OF SUCH DAMAGE. 
namespace vvenc { +using namespace x86_simd; +using namespace arm_simd; + #define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision #define IF_FILTER_PREC 6 ///< Log2 of sum of filter taps #define IF_INTERNAL_OFFS (1<<(IF_INTERNAL_PREC-1)) ///< Offset used internally @@ -117,6 +120,12 @@ class InterpolationFilter template void _initInterpolationFilterX86(); #endif + +#ifdef TARGET_SIMD_ARM + void initInterpolationFilterARM(); + template + void _initInterpolationFilterARM(); +#endif void filterN2_2D(const ComponentID compID, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY, const ClpRng& clpRng); void filter4x4 (const ComponentID compID, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, bool useAltHpelIf = false, int nFilterIdx = 0); diff --git a/source/Lib/CommonLib/IntraPrediction.h b/source/Lib/CommonLib/IntraPrediction.h index 16b1aa740..78fded61b 100644 --- a/source/Lib/CommonLib/IntraPrediction.h +++ b/source/Lib/CommonLib/IntraPrediction.h @@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + // ==================================================================================================================== // Class definition // ==================================================================================================================== diff --git a/source/Lib/CommonLib/LoopFilter.cpp b/source/Lib/CommonLib/LoopFilter.cpp index d77f4e771..1738e988d 100644 --- a/source/Lib/CommonLib/LoopFilter.cpp +++ b/source/Lib/CommonLib/LoopFilter.cpp @@ -90,15 +90,6 @@ const uint8_t LoopFilter::sm_betaTable[MAX_QP + 1] = // utility functions // ==================================================================================================================== -#define INCX( ptr, stride ) { ptr++; } -#define INCY( ptr, stride ) { ptr += ( stride ); } -#define OFFSETX( ptr, stride, x ) { ptr += ( x ); } -#define OFFSETY( ptr, stride, y ) { ptr += ( y ) * ( stride ); } -#define OFFSET( ptr, stride, x, y ) { ptr += ( x ) + ( y ) * ( stride ); } -#define GET_OFFSETX( ptr, stride, x ) ( ( ptr ) + ( x ) ) -#define GET_OFFSETY( ptr, stride, y ) ( ( ptr ) + ( y ) * ( stride ) ) -#define GET_OFFSET( ptr, stride, x, y ) ( ( ptr ) + ( x ) + ( y ) * ( stride ) ) - #define BsSet( val, compIdx ) ( ( val ) << ( ( compIdx ) << 1 ) ) #define BsGet( val, compIdx ) ( ( ( val ) >> ( ( compIdx ) << 1 ) ) & 3 ) diff --git a/source/Lib/CommonLib/LoopFilter.h b/source/Lib/CommonLib/LoopFilter.h index 9e2276407..3833c25fa 100644 --- a/source/Lib/CommonLib/LoopFilter.h +++ b/source/Lib/CommonLib/LoopFilter.h @@ -54,6 +54,7 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; #define DEBLOCK_SMALLEST_BLOCK 8 diff --git a/source/Lib/CommonLib/MCTF.h b/source/Lib/CommonLib/MCTF.h index 9956d83cb..2a5e3195f 100644 --- a/source/Lib/CommonLib/MCTF.h +++ b/source/Lib/CommonLib/MCTF.h @@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + class NoMallocThreadPool; //! \ingroup EncoderLib diff --git a/source/Lib/CommonLib/Quant.h b/source/Lib/CommonLib/Quant.h index 4e1a041f5..1d136a3a8 100644 --- a/source/Lib/CommonLib/Quant.h +++ b/source/Lib/CommonLib/Quant.h @@ -56,6 +56,8 @@ POSSIBILITY OF SUCH DAMAGE. 
namespace vvenc { +using namespace x86_simd; + // ==================================================================================================================== // Constants // ==================================================================================================================== diff --git a/source/Lib/CommonLib/QuantRDOQ2.cpp b/source/Lib/CommonLib/QuantRDOQ2.cpp index d9a32a99f..17b69cd16 100644 --- a/source/Lib/CommonLib/QuantRDOQ2.cpp +++ b/source/Lib/CommonLib/QuantRDOQ2.cpp @@ -584,7 +584,7 @@ int QuantRDOQ2::xRateDistOptQuantFast( TransformUnit &tu, const ComponentID &com const bool scanFirstBlk = !bUseScalingList && log2CGSize == 4 && cctx.log2CGWidth() == 2; #if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 ) - const bool isSimd = read_x86_extension_flags() > SCALAR; + const bool isSimd = read_x86_extension_flags() > x86_simd::SCALAR; #endif int subSetId = iScanPos >> log2CGSize; diff --git a/source/Lib/CommonLib/RdCost.cpp b/source/Lib/CommonLib/RdCost.cpp index 7495fa18f..5928bf2d2 100644 --- a/source/Lib/CommonLib/RdCost.cpp +++ b/source/Lib/CommonLib/RdCost.cpp @@ -141,6 +141,9 @@ void RdCost::create() #ifdef TARGET_SIMD_X86 initRdCostX86(); #endif +#ifdef TARGET_SIMD_ARM + initRdCostARM(); +#endif #endif m_costMode = VVENC_COST_STANDARD_LOSSY; diff --git a/source/Lib/CommonLib/RdCost.h b/source/Lib/CommonLib/RdCost.h index b87fadfa1..eeed5e2b7 100644 --- a/source/Lib/CommonLib/RdCost.h +++ b/source/Lib/CommonLib/RdCost.h @@ -57,6 +57,9 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; +using namespace arm_simd; + class DistParam; // ==================================================================================================================== @@ -144,6 +147,12 @@ class RdCost template void _initRdCostX86(); #endif + +#ifdef TARGET_SIMD_ARM + void initRdCostARM(); + template + void _initRdCostARM(); +#endif // TARGET_SIMD_ARM void setReshapeParams ( const uint32_t* pPLUT, double chrWght) { m_reshapeLumaLevelToWeightPLUT = pPLUT; m_chromaWeight = chrWght; } void setDistortionWeight ( const ComponentID compID, const double distortionWeight ) { m_distortionWeight[compID] = distortionWeight; } @@ -201,6 +210,11 @@ class RdCost } void getMotionCostIBC(int add) { m_dCostIBC = m_dLambdaMotionSAD + add; } Distortion getBvCostMultiplePredsIBC(int x, int y, bool useIMV); + + static Distortion xGetSAD8 ( const DistParam& pcDtParam ); + static Distortion xGetSAD16 ( const DistParam& pcDtParam ); // needs to be public for xGetSAD_MxN_SIMD ( NOTE: they are all public in vvdec ) + static void xGetSAD16X5 ( const DistParam& pcDtParam, Distortion* cost, bool isCalCentrePos ); // needs to be public for xGetSADX5_16xN_SIMD ( NOTE: they are all public in vvdec ) + private: Distortion xGetSSE_WTD ( const DistParam& pcDtParam ) const; @@ -215,15 +229,12 @@ class RdCost static Distortion xGetSAD ( const DistParam& pcDtParam ); static Distortion xGetSAD4 ( const DistParam& pcDtParam ); - static Distortion xGetSAD8 ( const DistParam& pcDtParam ); - static Distortion xGetSAD16 ( const DistParam& pcDtParam ); static Distortion xGetSAD32 ( const DistParam& pcDtParam ); static Distortion xGetSAD64 ( const DistParam& pcDtParam ); static Distortion xGetSAD128 ( const DistParam& pcDtParam ); static Distortion xGetSADwMask ( const DistParam &pcDtParam ); static void xGetSAD8X5 ( const DistParam& pcDtParam, Distortion* cost, bool isCalCentrePos ); - static void xGetSAD16X5 ( const DistParam& pcDtParam, Distortion* cost, bool isCalCentrePos ); static 
Distortion xCalcHADs2x2 ( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur ); static Distortion xGetHAD2SADs ( const DistParam& pcDtParam ); @@ -255,6 +266,11 @@ class RdCost static Distortion xGetSADwMask_SIMD( const DistParam &pcDtParam ); #endif +#ifdef TARGET_SIMD_ARM + template + static void xGetSADX5_16xN_SIMD ( const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos ); +#endif + unsigned int getBitsMultiplePredsIBC(int x, int y, bool useIMV); public: diff --git a/source/Lib/CommonLib/SampleAdaptiveOffset.h b/source/Lib/CommonLib/SampleAdaptiveOffset.h index 5b81b7f4c..2100c7478 100644 --- a/source/Lib/CommonLib/SampleAdaptiveOffset.h +++ b/source/Lib/CommonLib/SampleAdaptiveOffset.h @@ -53,6 +53,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + template static inline int sgn( T val ) { return ( T( 0 ) < val ) - ( val < T( 0 ) ); diff --git a/source/Lib/CommonLib/TrQuant.h b/source/Lib/CommonLib/TrQuant.h index 875fa004f..3b7261f0c 100644 --- a/source/Lib/CommonLib/TrQuant.h +++ b/source/Lib/CommonLib/TrQuant.h @@ -58,6 +58,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + typedef void FwdTrans(const TCoeff*, TCoeff*, int, int, int, int); typedef void InvTrans(const TCoeff*, TCoeff*, int, int, int, int, const TCoeff, const TCoeff); diff --git a/source/Lib/CommonLib/TrQuant_EMT.h b/source/Lib/CommonLib/TrQuant_EMT.h index 23c640e5c..ba1c6db04 100644 --- a/source/Lib/CommonLib/TrQuant_EMT.h +++ b/source/Lib/CommonLib/TrQuant_EMT.h @@ -52,6 +52,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + #if ENABLE_SIMD_TRAFO struct TCoeffOps { @@ -61,8 +63,8 @@ struct TCoeffOps void initTCoeffOpsX86(); template void _initTCoeffOpsX86(); - #endif + void( *cpyResi8 ) ( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height ); void( *cpyResi4 ) ( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height ); void( *cpyCoeff8 ) ( const Pel* src, ptrdiff_t stride, TCoeff* dst, unsigned width, unsigned height ); diff --git a/source/Lib/CommonLib/arm/BufferARM.h b/source/Lib/CommonLib/arm/BufferARM.h new file mode 100644 index 000000000..aa6a2f482 --- /dev/null +++ b/source/Lib/CommonLib/arm/BufferARM.h @@ -0,0 +1,385 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +/** \file BufferARM.h + \brief SIMD averaging. +*/ + +//! \ingroup CommonLib +//! \{ + +#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP 1 + + +#include "CommonDefARM.h" +#include "CommonLib/CommonDef.h" +#include "CommonLib/Unit.h" +#include "CommonLib/Buffer.h" +#include "CommonLib/InterpolationFilter.h" + +#if ENABLE_SIMD_OPT_BUFFER +#ifdef TARGET_SIMD_ARM + +namespace vvenc +{ + +template +void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, const Pel* lut ) +{ + if( ( width & 31 ) == 0 ) + { + int16x8x4_t xtmp1; + int16x8x4_t xtmp2; + int16x8x4_t xtmp3; + int16x8x4_t xtmp4; + + for( int y = 0; y < height; y += 4 ) + { + for( int x = 0; x < width; x += 32 ) + { + xtmp1.val[ 0 ][ 0 ] = lut[ src[ x + 0 ] ]; + xtmp1.val[ 1 ][ 0 ] = lut[ src[ x + 1 ] ]; + xtmp1.val[ 2 ][ 0 ] = lut[ src[ x + 2 ] ]; + xtmp1.val[ 3 ][ 0 ] = lut[ src[ x + 3 ] ]; + xtmp1.val[ 0 ][ 1 ] = lut[ src[ x + 4 ] ]; + xtmp1.val[ 1 ][ 1 ] = lut[ src[ x + 5 ] ]; + xtmp1.val[ 2 ][ 1 ] = lut[ src[ x + 6 ] ]; + xtmp1.val[ 3 ][ 1 ] = lut[ src[ x + 7 ] ]; + xtmp1.val[ 0 ][ 2 ] = lut[ src[ x + 8 ] ]; + xtmp1.val[ 1 ][ 2 ] = lut[ src[ x + 9 ] ]; + xtmp1.val[ 2 ][ 2 ] = lut[ src[ x + 10 ] ]; + xtmp1.val[ 3 ][ 2 ] = lut[ src[ x + 11 ] ]; + xtmp1.val[ 0 ][ 3 ] = lut[ src[ x + 12 ] ]; + xtmp1.val[ 1 ][ 3 ] = lut[ src[ x + 13 ] ]; + xtmp1.val[ 2 ][ 3 ] = lut[ src[ x + 14 ] ]; + xtmp1.val[ 3 ][ 3 ] = lut[ src[ x + 15 ] ]; + xtmp1.val[ 0 ][ 4 ] = lut[ src[ x + 16 ] ]; + xtmp1.val[ 1 ][ 4 ] = lut[ src[ x + 17 ] ]; + xtmp1.val[ 2 ][ 4 ] = lut[ src[ x + 18 ] ]; + xtmp1.val[ 3 ][ 4 ] = lut[ src[ x + 19 ] ]; + xtmp1.val[ 0 ][ 5 ] = lut[ src[ x + 20 ] ]; + xtmp1.val[ 1 ][ 5 ] = lut[ src[ x + 21 ] ]; + xtmp1.val[ 2 ][ 5 ] = lut[ src[ x + 22 ] ]; + xtmp1.val[ 3 ][ 5 ] = lut[ src[ x + 23 ] ]; + xtmp1.val[ 0 ][ 6 ] = lut[ src[ x + 24 ] ]; + xtmp1.val[ 1 ][ 6 ] = lut[ src[ x + 25 ] ]; + xtmp1.val[ 2 ][ 6 ] = lut[ src[ x + 26 ] ]; + xtmp1.val[ 3 ][ 6 ] = lut[ src[ x + 27 ] ]; + xtmp1.val[ 0 ][ 7 ] = lut[ src[ x + 28 ] ]; + xtmp1.val[ 1 ][ 7 ] = lut[ src[ x + 29 ] ]; + xtmp1.val[ 2 ][ 7 ] = lut[ src[ x + 30 ] ]; + xtmp1.val[ 3 ][ 7 ] = lut[ src[ x + 31 ] ]; + + xtmp2.val[ 0 ][ 0 ] = lut[ src[ x + 1 * srcStride ] ]; + xtmp2.val[ 1 ][ 0 ] = lut[ src[ x + 1 * srcStride + 1 ] ]; + xtmp2.val[ 2 ][ 0 ] = lut[ src[ x + 1 * srcStride + 2 ] ]; + xtmp2.val[ 3 
][ 0 ] = lut[ src[ x + 1 * srcStride + 3 ] ]; + xtmp2.val[ 0 ][ 1 ] = lut[ src[ x + 1 * srcStride + 4 ] ]; + xtmp2.val[ 1 ][ 1 ] = lut[ src[ x + 1 * srcStride + 5 ] ]; + xtmp2.val[ 2 ][ 1 ] = lut[ src[ x + 1 * srcStride + 6 ] ]; + xtmp2.val[ 3 ][ 1 ] = lut[ src[ x + 1 * srcStride + 7 ] ]; + xtmp2.val[ 0 ][ 2 ] = lut[ src[ x + 1 * srcStride + 8 ] ]; + xtmp2.val[ 1 ][ 2 ] = lut[ src[ x + 1 * srcStride + 9 ] ]; + xtmp2.val[ 2 ][ 2 ] = lut[ src[ x + 1 * srcStride + 10 ] ]; + xtmp2.val[ 3 ][ 2 ] = lut[ src[ x + 1 * srcStride + 11 ] ]; + xtmp2.val[ 0 ][ 3 ] = lut[ src[ x + 1 * srcStride + 12 ] ]; + xtmp2.val[ 1 ][ 3 ] = lut[ src[ x + 1 * srcStride + 13 ] ]; + xtmp2.val[ 2 ][ 3 ] = lut[ src[ x + 1 * srcStride + 14 ] ]; + xtmp2.val[ 3 ][ 3 ] = lut[ src[ x + 1 * srcStride + 15 ] ]; + xtmp2.val[ 0 ][ 4 ] = lut[ src[ x + 1 * srcStride + 16 ] ]; + xtmp2.val[ 1 ][ 4 ] = lut[ src[ x + 1 * srcStride + 17 ] ]; + xtmp2.val[ 2 ][ 4 ] = lut[ src[ x + 1 * srcStride + 18 ] ]; + xtmp2.val[ 3 ][ 4 ] = lut[ src[ x + 1 * srcStride + 19 ] ]; + xtmp2.val[ 0 ][ 5 ] = lut[ src[ x + 1 * srcStride + 20 ] ]; + xtmp2.val[ 1 ][ 5 ] = lut[ src[ x + 1 * srcStride + 21 ] ]; + xtmp2.val[ 2 ][ 5 ] = lut[ src[ x + 1 * srcStride + 22 ] ]; + xtmp2.val[ 3 ][ 5 ] = lut[ src[ x + 1 * srcStride + 23 ] ]; + xtmp2.val[ 0 ][ 6 ] = lut[ src[ x + 1 * srcStride + 24 ] ]; + xtmp2.val[ 1 ][ 6 ] = lut[ src[ x + 1 * srcStride + 25 ] ]; + xtmp2.val[ 2 ][ 6 ] = lut[ src[ x + 1 * srcStride + 26 ] ]; + xtmp2.val[ 3 ][ 6 ] = lut[ src[ x + 1 * srcStride + 27 ] ]; + xtmp2.val[ 0 ][ 7 ] = lut[ src[ x + 1 * srcStride + 28 ] ]; + xtmp2.val[ 1 ][ 7 ] = lut[ src[ x + 1 * srcStride + 29 ] ]; + xtmp2.val[ 2 ][ 7 ] = lut[ src[ x + 1 * srcStride + 30 ] ]; + xtmp2.val[ 3 ][ 7 ] = lut[ src[ x + 1 * srcStride + 31 ] ]; + + xtmp3.val[ 0 ][ 0 ] = lut[ src[ x + 2 * srcStride + 0 ] ]; + xtmp3.val[ 1 ][ 0 ] = lut[ src[ x + 2 * srcStride + 1 ] ]; + xtmp3.val[ 2 ][ 0 ] = lut[ src[ x + 2 * srcStride + 2 ] ]; + xtmp3.val[ 3 ][ 0 ] = lut[ src[ x + 2 * srcStride + 3 ] ]; + xtmp3.val[ 0 ][ 1 ] = lut[ src[ x + 2 * srcStride + 4 ] ]; + xtmp3.val[ 1 ][ 1 ] = lut[ src[ x + 2 * srcStride + 5 ] ]; + xtmp3.val[ 2 ][ 1 ] = lut[ src[ x + 2 * srcStride + 6 ] ]; + xtmp3.val[ 3 ][ 1 ] = lut[ src[ x + 2 * srcStride + 7 ] ]; + xtmp3.val[ 0 ][ 2 ] = lut[ src[ x + 2 * srcStride + 8 ] ]; + xtmp3.val[ 1 ][ 2 ] = lut[ src[ x + 2 * srcStride + 9 ] ]; + xtmp3.val[ 2 ][ 2 ] = lut[ src[ x + 2 * srcStride + 10 ] ]; + xtmp3.val[ 3 ][ 2 ] = lut[ src[ x + 2 * srcStride + 11 ] ]; + xtmp3.val[ 0 ][ 3 ] = lut[ src[ x + 2 * srcStride + 12 ] ]; + xtmp3.val[ 1 ][ 3 ] = lut[ src[ x + 2 * srcStride + 13 ] ]; + xtmp3.val[ 2 ][ 3 ] = lut[ src[ x + 2 * srcStride + 14 ] ]; + xtmp3.val[ 3 ][ 3 ] = lut[ src[ x + 2 * srcStride + 15 ] ]; + xtmp3.val[ 0 ][ 4 ] = lut[ src[ x + 2 * srcStride + 16 ] ]; + xtmp3.val[ 1 ][ 4 ] = lut[ src[ x + 2 * srcStride + 17 ] ]; + xtmp3.val[ 2 ][ 4 ] = lut[ src[ x + 2 * srcStride + 18 ] ]; + xtmp3.val[ 3 ][ 4 ] = lut[ src[ x + 2 * srcStride + 19 ] ]; + xtmp3.val[ 0 ][ 5 ] = lut[ src[ x + 2 * srcStride + 20 ] ]; + xtmp3.val[ 1 ][ 5 ] = lut[ src[ x + 2 * srcStride + 21 ] ]; + xtmp3.val[ 2 ][ 5 ] = lut[ src[ x + 2 * srcStride + 22 ] ]; + xtmp3.val[ 3 ][ 5 ] = lut[ src[ x + 2 * srcStride + 23 ] ]; + xtmp3.val[ 0 ][ 6 ] = lut[ src[ x + 2 * srcStride + 24 ] ]; + xtmp3.val[ 1 ][ 6 ] = lut[ src[ x + 2 * srcStride + 25 ] ]; + xtmp3.val[ 2 ][ 6 ] = lut[ src[ x + 2 * srcStride + 26 ] ]; + xtmp3.val[ 3 ][ 6 ] = lut[ src[ x + 2 * srcStride + 27 ] ]; + xtmp3.val[ 0 ][ 7 ] = lut[ src[ x + 2 * 
srcStride + 28 ] ]; + xtmp3.val[ 1 ][ 7 ] = lut[ src[ x + 2 * srcStride + 29 ] ]; + xtmp3.val[ 2 ][ 7 ] = lut[ src[ x + 2 * srcStride + 30 ] ]; + xtmp3.val[ 3 ][ 7 ] = lut[ src[ x + 2 * srcStride + 31 ] ]; + + // interleaved assign -> there is only interleaved storing/loading + xtmp4.val[ 0 ][ 0 ] = lut[ src[ x + 3 * srcStride + 0 ] ]; + xtmp4.val[ 1 ][ 0 ] = lut[ src[ x + 3 * srcStride + 1 ] ]; + xtmp4.val[ 2 ][ 0 ] = lut[ src[ x + 3 * srcStride + 2 ] ]; + xtmp4.val[ 3 ][ 0 ] = lut[ src[ x + 3 * srcStride + 3 ] ]; + xtmp4.val[ 0 ][ 1 ] = lut[ src[ x + 3 * srcStride + 4 ] ]; + xtmp4.val[ 1 ][ 1 ] = lut[ src[ x + 3 * srcStride + 5 ] ]; + xtmp4.val[ 2 ][ 1 ] = lut[ src[ x + 3 * srcStride + 6 ] ]; + xtmp4.val[ 3 ][ 1 ] = lut[ src[ x + 3 * srcStride + 7 ] ]; + xtmp4.val[ 0 ][ 2 ] = lut[ src[ x + 3 * srcStride + 8 ] ]; + xtmp4.val[ 1 ][ 2 ] = lut[ src[ x + 3 * srcStride + 9 ] ]; + xtmp4.val[ 2 ][ 2 ] = lut[ src[ x + 3 * srcStride + 10 ] ]; + xtmp4.val[ 3 ][ 2 ] = lut[ src[ x + 3 * srcStride + 11 ] ]; + xtmp4.val[ 0 ][ 3 ] = lut[ src[ x + 3 * srcStride + 12 ] ]; + xtmp4.val[ 1 ][ 3 ] = lut[ src[ x + 3 * srcStride + 13 ] ]; + xtmp4.val[ 2 ][ 3 ] = lut[ src[ x + 3 * srcStride + 14 ] ]; + xtmp4.val[ 3 ][ 3 ] = lut[ src[ x + 3 * srcStride + 15 ] ]; + xtmp4.val[ 0 ][ 4 ] = lut[ src[ x + 3 * srcStride + 16 ] ]; + xtmp4.val[ 1 ][ 4 ] = lut[ src[ x + 3 * srcStride + 17 ] ]; + xtmp4.val[ 2 ][ 4 ] = lut[ src[ x + 3 * srcStride + 18 ] ]; + xtmp4.val[ 3 ][ 4 ] = lut[ src[ x + 3 * srcStride + 19 ] ]; + xtmp4.val[ 0 ][ 5 ] = lut[ src[ x + 3 * srcStride + 20 ] ]; + xtmp4.val[ 1 ][ 5 ] = lut[ src[ x + 3 * srcStride + 21 ] ]; + xtmp4.val[ 2 ][ 5 ] = lut[ src[ x + 3 * srcStride + 22 ] ]; + xtmp4.val[ 3 ][ 5 ] = lut[ src[ x + 3 * srcStride + 23 ] ]; + xtmp4.val[ 0 ][ 6 ] = lut[ src[ x + 3 * srcStride + 24 ] ]; + xtmp4.val[ 1 ][ 6 ] = lut[ src[ x + 3 * srcStride + 25 ] ]; + xtmp4.val[ 2 ][ 6 ] = lut[ src[ x + 3 * srcStride + 26 ] ]; + xtmp4.val[ 3 ][ 6 ] = lut[ src[ x + 3 * srcStride + 27 ] ]; + xtmp4.val[ 0 ][ 7 ] = lut[ src[ x + 3 * srcStride + 28 ] ]; + xtmp4.val[ 1 ][ 7 ] = lut[ src[ x + 3 * srcStride + 29 ] ]; + xtmp4.val[ 2 ][ 7 ] = lut[ src[ x + 3 * srcStride + 30 ] ]; + xtmp4.val[ 3 ][ 7 ] = lut[ src[ x + 3 * srcStride + 31 ] ]; + + // deinterleaved storing + vst4q_s16( &dst[ x ], xtmp1 ); + vst4q_s16( &dst[ x + 1 * dstStride ], xtmp2 ); + vst4q_s16( &dst[ x + 2 * dstStride ], xtmp3 ); + vst4q_s16( &dst[ x + 3 * dstStride ], xtmp4 ); + } + src += ( srcStride << 2 ); + dst += ( dstStride << 2 ); + } + } + else if( ( width & 15 ) == 0 ) + { + int16x8x2_t xtmp1; + int16x8x2_t xtmp2; + int16x8x2_t xtmp3; + int16x8x2_t xtmp4; + + for( int y = 0; y < height; y += 4 ) + { + for( int x = 0; x < width; x += 16 ) + { + // vld2q_s16( &src[ x ] ); + + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 0 ] ], xtmp1.val[ 0 ], 0 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 ] ], xtmp1.val[ 1 ], 0 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 ] ], xtmp1.val[ 0 ], 1 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 ] ], xtmp1.val[ 1 ], 1 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 4 ] ], xtmp1.val[ 0 ], 2 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 5 ] ], xtmp1.val[ 1 ], 2 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 6 ] ], xtmp1.val[ 0 ], 3 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 7 ] ], xtmp1.val[ 1 ], 3 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 8 ] ], xtmp1.val[ 0 ], 4 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 9 ] ], xtmp1.val[ 1 ], 4 ); + xtmp1.val[ 0 
] = vsetq_lane_s16( lut[ src[ x + 10 ] ], xtmp1.val[ 0 ], 5 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 11 ] ], xtmp1.val[ 1 ], 5 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 12 ] ], xtmp1.val[ 0 ], 6 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 13 ] ], xtmp1.val[ 1 ], 6 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 14 ] ], xtmp1.val[ 0 ], 7 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 15 ] ], xtmp1.val[ 1 ], 7 ); + + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 0 ] ], xtmp2.val[ 0 ], 0 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 1 ] ], xtmp2.val[ 1 ], 0 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 2 ] ], xtmp2.val[ 0 ], 1 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 3 ] ], xtmp2.val[ 1 ], 1 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 4 ] ], xtmp2.val[ 0 ], 2 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 5 ] ], xtmp2.val[ 1 ], 2 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 6 ] ], xtmp2.val[ 0 ], 3 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 7 ] ], xtmp2.val[ 1 ], 3 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 8 ] ], xtmp2.val[ 0 ], 4 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 9 ] ], xtmp2.val[ 1 ], 4 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 10 ] ], xtmp2.val[ 0 ], 5 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 11 ] ], xtmp2.val[ 1 ], 5 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 12 ] ], xtmp2.val[ 0 ], 6 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 13 ] ], xtmp2.val[ 1 ], 6 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 14 ] ], xtmp2.val[ 0 ], 7 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 15 ] ], xtmp2.val[ 1 ], 7 ); + + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 0 ] ], xtmp3.val[ 0 ], 0 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 1 ] ], xtmp3.val[ 1 ], 0 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 2 ] ], xtmp3.val[ 0 ], 1 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 3 ] ], xtmp3.val[ 1 ], 1 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 4 ] ], xtmp3.val[ 0 ], 2 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 5 ] ], xtmp3.val[ 1 ], 2 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 6 ] ], xtmp3.val[ 0 ], 3 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 7 ] ], xtmp3.val[ 1 ], 3 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 8 ] ], xtmp3.val[ 0 ], 4 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 9 ] ], xtmp3.val[ 1 ], 4 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 10 ] ], xtmp3.val[ 0 ], 5 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 11 ] ], xtmp3.val[ 1 ], 5 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 12 ] ], xtmp3.val[ 0 ], 6 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 13 ] ], xtmp3.val[ 1 ], 6 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 14 ] ], xtmp3.val[ 0 ], 7 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 15 ] ], xtmp3.val[ 1 ], 7 ); + + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 0 ] ], xtmp4.val[ 0 ], 0 ); + xtmp4.val[ 1 
] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 1 ] ], xtmp4.val[ 1 ], 0 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 2 ] ], xtmp4.val[ 0 ], 1 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 3 ] ], xtmp4.val[ 1 ], 1 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 4 ] ], xtmp4.val[ 0 ], 2 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 5 ] ], xtmp4.val[ 1 ], 2 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 6 ] ], xtmp4.val[ 0 ], 3 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 7 ] ], xtmp4.val[ 1 ], 3 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 8 ] ], xtmp4.val[ 0 ], 4 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 9 ] ], xtmp4.val[ 1 ], 4 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 10 ] ], xtmp4.val[ 0 ], 5 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 11 ] ], xtmp4.val[ 1 ], 5 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 12 ] ], xtmp4.val[ 0 ], 6 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 13 ] ], xtmp4.val[ 1 ], 6 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 14 ] ], xtmp4.val[ 0 ], 7 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 15 ] ], xtmp4.val[ 1 ], 7 ); + + vst2q_s16( &dst[ x ], xtmp1 ); + vst2q_s16( &dst[ x + 1 * dstStride ], xtmp2 ); + vst2q_s16( &dst[ x + 2 * dstStride ], xtmp3 ); + vst2q_s16( &dst[ x + 3 * dstStride ], xtmp4 ); + } + src += ( srcStride << 2 ); + dst += ( dstStride << 2 ); + } + } + else if( ( width & 7 ) == 0 ) + { + int16x8_t xtmp1; + int16x8_t xtmp2; + int16x8_t xtmp3; + int16x8_t xtmp4; + + for( int y = 0; y < height; y += 4 ) + { + for( int x = 0; x < width; x += 8 ) + { + xtmp1 = vsetq_lane_s16( lut[ src[ x + 0 ] ], xtmp1, 0 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 1 ] ], xtmp1, 1 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 2 ] ], xtmp1, 2 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 3 ] ], xtmp1, 3 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 4 ] ], xtmp1, 4 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 5 ] ], xtmp1, 5 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 6 ] ], xtmp1, 6 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 7 ] ], xtmp1, 7 ); + + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 0 ] ], xtmp2, 0 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 1 ] ], xtmp2, 1 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 2 ] ], xtmp2, 2 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 3 ] ], xtmp2, 3 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 4 ] ], xtmp2, 4 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 5 ] ], xtmp2, 5 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 6 ] ], xtmp2, 6 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 7 ] ], xtmp2, 7 ); + + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 0 ] ], xtmp3, 0 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 1 ] ], xtmp3, 1 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 2 ] ], xtmp3, 2 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 3 ] ], xtmp3, 3 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 4 ] ], xtmp3, 4 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 5 ] ], xtmp3, 5 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 6 ] ], xtmp3, 6 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 7 ] ], xtmp3, 7 ); + + xtmp4 = vsetq_lane_s16( lut[ src[ x + 
3 * srcStride + 0 ] ], xtmp4, 0 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 1 ] ], xtmp4, 1 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 2 ] ], xtmp4, 2 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 3 ] ], xtmp4, 3 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 4 ] ], xtmp4, 4 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 5 ] ], xtmp4, 5 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 6 ] ], xtmp4, 6 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 7 ] ], xtmp4, 7 ); + + vst1q_s16( &dst[ x ], xtmp1 ); + vst1q_s16( &dst[ x + 1 * dstStride ], xtmp2 ); + vst1q_s16( &dst[ x + 2 * dstStride ], xtmp3 ); + vst1q_s16( &dst[ x + 3 * dstStride ], xtmp4 ); + } + + src += ( srcStride << 2 ); + dst += ( dstStride << 2 ); + } + } + + return; +} + +template +void PelBufferOps::_initPelBufOpsARM() +{ + applyLut = applyLut_SIMD; +} + +template void PelBufferOps::_initPelBufOpsARM(); + +} // namespace vvdec + +#endif // TARGET_SIMD_ARM +#endif // ENABLE_SIMD_OPT_BUFFER +//! \} diff --git a/source/Lib/CommonLib/arm/CommonDefARM.cpp b/source/Lib/CommonLib/arm/CommonDefARM.cpp new file mode 100644 index 000000000..cf29a9439 --- /dev/null +++ b/source/Lib/CommonLib/arm/CommonDefARM.cpp @@ -0,0 +1,64 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ + +------------------------------------------------------------------------------------------- */ + +/** \file CommonDefARM.cpp +*/ + +#include "CommonDefARM.h" + +namespace vvenc +{ +using namespace arm_simd; + +ARM_VEXT read_arm_extension_flags( ARM_VEXT request ) +{ + static ARM_VEXT ext_flags = NEON; // We assume NEON is always supported for relevant ARM processors + + if( request != UNDEFINED ) + { + ext_flags = request; + } + + return ext_flags; +}; + +} // namespace vvenc diff --git a/source/Lib/CommonLib/arm/CommonDefARM.h b/source/Lib/CommonLib/arm/CommonDefARM.h new file mode 100644 index 000000000..006d12617 --- /dev/null +++ b/source/Lib/CommonLib/arm/CommonDefARM.h @@ -0,0 +1,66 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE.
+ + +------------------------------------------------------------------------------------------- */ + +/** \file CommonDefARM.h + */ + +#pragma once + +#include "CommonDef.h" + +#ifdef TARGET_SIMD_ARM + +#ifdef USE_NEON +#define SIMDARM NEON +#include <arm_neon.h> +#endif + +namespace vvenc +{ +using namespace arm_simd; + +ARM_VEXT read_arm_extension_flags( ARM_VEXT request = arm_simd::UNDEFINED ); +// std::string read_arm_extension_name(); + +} // namespace vvenc + +#endif // TARGET_SIMD_ARM diff --git a/source/Lib/CommonLib/arm/InitARM.cpp b/source/Lib/CommonLib/arm/InitARM.cpp new file mode 100644 index 000000000..d97d716b3 --- /dev/null +++ b/source/Lib/CommonLib/arm/InitARM.cpp @@ -0,0 +1,115 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +/* + * \ingroup CommonLib + * \file InitARM.cpp + * \brief Initialize encoder SIMD functions.
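+ *        The init functions defined here query read_arm_extension_flags() and register the NEON kernel implementations when NEON is reported as available.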
+ */ + +#include "CommonDefARM.h" +#include "CommonLib/CommonDef.h" +#include "CommonLib/InterpolationFilter.h" +#include "CommonLib/TrQuant.h" +#include "CommonLib/RdCost.h" +#include "CommonLib/Buffer.h" +#include "CommonLib/TrQuant_EMT.h" +#include "CommonLib/IntraPrediction.h" +#include "CommonLib/LoopFilter.h" +#include "CommonLib/Picture.h" + +#include "CommonLib/AdaptiveLoopFilter.h" +#include "CommonLib/SampleAdaptiveOffset.h" + +namespace vvenc +{ + +#ifdef TARGET_SIMD_ARM + +#if ENABLE_SIMD_OPT_MCIF +void InterpolationFilter::initInterpolationFilterARM() +{ + auto vext = read_arm_extension_flags(); + switch( vext ) + { + case NEON: + _initInterpolationFilterARM<NEON>(); + break; + default: + break; + } +} +#endif + +#if ENABLE_SIMD_OPT_BUFFER +void PelBufferOps::initPelBufOpsARM() +{ + auto vext = read_arm_extension_flags(); + switch( vext ) + { + case NEON: + _initPelBufOpsARM<NEON>(); + break; + default: + break; + } +} +#endif + +#if ENABLE_SIMD_OPT_DIST +void RdCost::initRdCostARM() +{ + auto vext = read_arm_extension_flags(); + switch( vext ) + { + case NEON: + _initRdCostARM<NEON>(); + break; + default: + break; + } +} +#endif + +#endif // TARGET_SIMD_ARM + +} // namespace vvenc diff --git a/source/Lib/CommonLib/arm/InterpolationFilterARM.h b/source/Lib/CommonLib/arm/InterpolationFilterARM.h new file mode 100644 index 000000000..fe631ebfd --- /dev/null +++ b/source/Lib/CommonLib/arm/InterpolationFilterARM.h @@ -0,0 +1,295 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +/** + * \file + * \brief Implementation of InterpolationFilter class + */ +// ==================================================================================================================== +// Includes +// ==================================================================================================================== + +#include "CommonDefARM.h" +#include "CommonLib/CommonDef.h" +#include "../InterpolationFilter.h" + + +namespace vvenc +{ + +#ifdef TARGET_SIMD_ARM +#if __ARM_ARCH >= 8 + +template +static void simdInterpolateN2_2D( const ClpRng& clpRng, const Pel* src, const int srcStride, Pel* dst, const int dstStride, int width, int height, TFilterCoeff const *ch, TFilterCoeff const *cv ) +{ + const int shift1st = IF_FILTER_PREC_BILINEAR - ( IF_INTERNAL_PREC_BILINEAR - clpRng.bd ); + const int offset1st = 1 << ( shift1st - 1 ); + + const int shift2nd = 4; + const int offset2nd = 1 << ( shift2nd - 1 ); + + int16x8_t mmOffset1 = vdupq_n_s16( offset1st ); + int16x8_t mmOffset2 = vdupq_n_s16( offset2nd ); + int16x8_t mmCoeffH = vdupq_n_s16( ch[ 1 ] ); + int16x8_t mmCoeffV = vdupq_n_s16( cv[ 1 ] ); + + int16x8_t mmLastH[ 16 ]; + + int16x8_t mmLast4H; + + // workaround for over-sensitive compilers + mmLastH[ 0 ] = vdupq_n_s16( 0 ); + + int16x8_t shift1inv = vdupq_n_s16( -shift1st ); + int16x8_t shift2inv = vdupq_n_s16( -shift2nd ); + + for( int row = -1; row < height; row++ ) + { + int16x8_t mmPix = vld1q_s16( src ); + int16x8_t mmPix1 = vld1q_s16( src + 1 ); + + int16x8_t mmFiltered = vmlaq_n_s16( mmOffset1, mmPix, 16 ); + + mmFiltered = vmlaq_s16( mmFiltered, vsubq_s16( mmPix1, mmPix ), mmCoeffH ); + mmFiltered = vshlq_s16( mmFiltered, shift1inv ); + + if( row >= 0 ) + { + int16x8_t mmFiltered2 = vmlaq_n_s16( mmOffset2, mmLast4H, 16 ); + mmFiltered2 = vmlaq_s16( mmFiltered2, vsubq_s16( mmFiltered, mmLast4H ), mmCoeffV ); + mmFiltered2 = vshlq_s16( mmFiltered2, shift2inv ); + + vst1q_lane_s64( (int64_t*) dst, (int64x2_t) mmFiltered2, 0 ); + } + + mmLast4H = mmFiltered; + + for( int x = 4; x < width; x += 8 ) + { + int16x8_t mmPix = vld1q_s16( src + x ); + int16x8_t mmPix1 = vld1q_s16( src + x + 1 ); + + int16x8_t mmFiltered = vmlaq_n_s16( mmOffset1, mmPix, 16 ); + mmFiltered = vmlaq_s16( mmFiltered, vsubq_s16( mmPix1, mmPix ), mmCoeffH ); + mmFiltered = vshlq_s16( mmFiltered, shift1inv ); + + int idx = x >> 3; + int16x8_t mLast = mmLastH[ idx ]; + mmLastH[ idx ] = mmFiltered; + + if( row >= 0 ) + { + int16x8_t mmFiltered2 = vmlaq_n_s16( mmOffset2, mLast, 16 ); + mmFiltered2 = vmlaq_s16( mmFiltered2, vsubq_s16( mmFiltered, mLast ), mmCoeffV ); + mmFiltered2 = vshlq_s16( mmFiltered2, shift2inv ); + + vst1q_s16( ( dst + x ), mmFiltered2 ); + } + } + if( row >= 0 ) + dst += dstStride; + + src += srcStride; + } +} + +template +void simdFilter16xX_N8( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const 
*coeffH, TFilterCoeff const *coeffV ) +{ + OFFSET( src, srcStride, -3, -3 ); + + int offset1st, offset2nd; + int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) ); + const int shift1st = IF_FILTER_PREC - headRoom; + int shift2nd = IF_FILTER_PREC; + int extHeight = height + 7; + // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be + // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20 + + // shift1st -= headRoom; + offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st ); + + if( isLast ) + { + shift2nd += headRoom; + offset2nd = 1 << ( shift2nd - 1 ); + offset2nd += IF_INTERNAL_OFFS << IF_FILTER_PREC; + } + else + { + offset2nd = 0; + } + const int32x4_t voffset1 = vdupq_n_s32( offset1st ); + + const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() ); + const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() ); + + int64x1x2_t vcoeff0 = vld2_s64( (int64_t*) coeffH ); + int16x8_t vsum; + int32x4_t vsuma, vsumb; + + int32x4_t vsrcv[ 2 ][ 9 ]; + + int32x4_t invshift1st = vdupq_n_s32( -shift1st ); + int32x4_t invshift2nd = vdupq_n_s32( -shift2nd ); + + for( int row = 0; row < extHeight; row++ ) + { + int32x4_t vsrc0, vsrc1; + int16x4_t vsrca00, vsrca01, vsrca10, vsrca11; + int16x4_t vsrcb00, vsrcb01, vsrcb10, vsrcb11; + + vsrca00 = vld1_s16( &src[ 0 ] ); + vsrca01 = vld1_s16( &src[ 1 ] ); + vsrca10 = vld1_s16( &src[ 2 ] ); + vsrca11 = vld1_s16( &src[ 3 ] ); + + for( int j = 0; j < 2; j++ ) + { + vsrcb00 = vld1_s16( &src[ ( j << 3 ) + 4 ] ); + vsrcb01 = vld1_s16( &src[ ( j << 3 ) + 5 ] ); + vsrcb10 = vld1_s16( &src[ ( j << 3 ) + 6 ] ); + vsrcb11 = vld1_s16( &src[ ( j << 3 ) + 7 ] ); + + vsuma[ 0 ] = vaddvq_s32( vmull_s16( vsrca00, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsuma[ 1 ] = vaddvq_s32( vmull_s16( vsrca01, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsuma[ 2 ] = vaddvq_s32( vmull_s16( vsrca10, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsuma[ 3 ] = vaddvq_s32( vmull_s16( vsrca11, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + + vsumb[ 0 ] = vaddvq_s32( vmull_s16( vsrcb00, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsumb[ 1 ] = vaddvq_s32( vmull_s16( vsrcb01, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsumb[ 2 ] = vaddvq_s32( vmull_s16( vsrcb10, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsumb[ 3 ] = vaddvq_s32( vmull_s16( vsrcb11, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + + vsrc1[ 0 ] = vaddvq_s32( vmull_s16( vsrcb00, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc1[ 1 ] = vaddvq_s32( vmull_s16( vsrcb01, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc1[ 2 ] = vaddvq_s32( vmull_s16( vsrcb10, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc1[ 3 ] = vaddvq_s32( vmull_s16( vsrcb11, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + + vsrca00 = vld1_s16( &src[ ( j << 3 ) + 8 ] ); + vsrca01 = vld1_s16( &src[ ( j << 3 ) + 9 ] ); + vsrca10 = vld1_s16( &src[ ( j << 3 ) + 10 ] ); + vsrca11 = vld1_s16( &src[ ( j << 3 ) + 11 ] ); + + vsrc0[ 0 ] = vaddvq_s32( vmull_s16( vsrca00, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc0[ 1 ] = vaddvq_s32( vmull_s16( vsrca01, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc0[ 2 ] = vaddvq_s32( vmull_s16( vsrca10, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc0[ 3 ] = vaddvq_s32( vmull_s16( vsrca11, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + + vsuma = vaddq_s32( vsuma, vsrc1 ); + vsumb = vaddq_s32( vsumb, vsrc0 ); + + vsuma = vaddq_s32( vsuma, voffset1 ); + vsumb = vaddq_s32( vsumb, voffset1 ); + + vsuma 
= vshlq_s32( vsuma, invshift1st ); + vsumb = vshlq_s32( vsumb, invshift1st ); + + vsum = vqmovn_high_s32( vqmovn_s32( vsuma ), vsumb ); + + if( row < 7 ) + { + vsrcv[ j ][ row + 1 ] = (int32x4_t) vsum; + } + else + { + vsrcv[ j ][ 8 ] = (int32x4_t) vsum; + vsuma = vsumb = vdupq_n_s32( offset2nd ); + + for( int i = 0; i < 8; i += 2 ) + { + vsrc0 = vsrcv[ j ][ i + 1 ]; + vsrc1 = vsrcv[ j ][ i + 2 ]; + int16x4_t vsrc0l = vget_low_s16( (int16x8_t) vsrc0 ); // 0a 0b 0c 0d + int16x4_t vsrc0h = vget_high_s16( (int16x8_t) vsrc0 ); // 0e 0f 0g 0h + int16x4_t vsrc1l = vget_low_s16( (int16x8_t) vsrc1 ); // 1a 1b 1c 1d + int16x4_t vsrc1h = vget_high_s16( (int16x8_t) vsrc1 ); // 1e 1f 1g 1h + vsuma = vmlal_n_s16( vsuma, vsrc0l, coeffV[ i ] ); // 0a * c0 + offset2nd, 0b * c0 + offset2nd, ... + vsuma = vmlal_n_s16( vsuma, vsrc1l, coeffV[ i + 1 ] ); // 1a * c1 + 0a * c0 + offset2nd, 1b * c1 + 0b * c0 + offset2nd, ... + vsumb = vmlal_n_s16( vsumb, vsrc0h, coeffV[ i ] ); + vsumb = vmlal_n_s16( vsumb, vsrc1h, coeffV[ i + 1 ] ); + vsrcv[ j ][ i ] = vsrc0; + vsrcv[ j ][ i + 1 ] = vsrc1; + } + vsuma = vshlq_s32( vsuma, invshift2nd ); + vsumb = vshlq_s32( vsumb, invshift2nd ); + + vsum = vqmovn_high_s32( vqmovn_s32( vsuma ), vsumb ); + + if( isLast ) // clip + { + vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) ); + } + + vst1q_s16( &dst[ j << 3 ], vsum ); + + INCY( dst, j * dstStride ); + } + } + + INCY( src, srcStride ); + } +} + +template +void InterpolationFilter::_initInterpolationFilterARM() +{ + m_filter16x16[ 0 ][ 0 ] = simdFilter16xX_N8; + m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8; + + m_filterN2_2D = simdInterpolateN2_2D; +} + +#else // !__ARM_ARCH >= 8 + +template +void InterpolationFilter::_initInterpolationFilterARM() +{} + +#endif // !__ARM_ARCH >= 8 + +template void InterpolationFilter::_initInterpolationFilterARM(); + +#endif // #ifdef TARGET_SIMD_ARM + +} // namespace vvenc diff --git a/source/Lib/CommonLib/arm/RdCostARM.h b/source/Lib/CommonLib/arm/RdCostARM.h new file mode 100644 index 000000000..0d5be3728 --- /dev/null +++ b/source/Lib/CommonLib/arm/RdCostARM.h @@ -0,0 +1,265 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +/** \file RdCostARM.h + \brief RD cost computation class, SIMD version +*/ + +#include +#include + +#include "CommonDefARM.h" +#include "CommonLib/CommonDef.h" +#include "../RdCost.h" + +namespace vvenc +{ + +#ifdef TARGET_SIMD_ARM +#if __ARM_ARCH >= 8 + +template +Distortion xGetSAD_MxN_SIMD( const DistParam& rcDtParam ) +{ + if( rcDtParam.bitDepth > 10 ) + return isWdt16 ? RdCost::xGetSAD16( rcDtParam ) : RdCost::xGetSAD8( rcDtParam ); + + // assert( rcDtParam.iCols == iWidth); + const short* pSrc1 = (const short*) rcDtParam.org.buf; + const short* pSrc2 = (const short*) rcDtParam.cur.buf; + const int iRows = rcDtParam.org.height; + const int iSubShift = rcDtParam.subShift; + const ptrdiff_t iStrideSrc1 = rcDtParam.org.stride << iSubShift; + const ptrdiff_t iStrideSrc2 = rcDtParam.cur.stride << iSubShift; + + uint32_t uiSum = 0; + + int16x8_t vsum16 = vdupq_n_s16( 0 ); + + for( int i = 0; i < ( iRows >> 3 ); i++ ) + { + // 0 + int16x8_t vsrc1 = vld1q_s16( pSrc1 ); + int16x8_t vsrc2 = vld1q_s16( pSrc2 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + + if( isWdt16 ) + { + vsrc1 = vld1q_s16( pSrc1 + 8 ); + vsrc2 = vld1q_s16( pSrc2 + 8 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + } + + pSrc1 += iStrideSrc1; + pSrc2 += iStrideSrc2; + + // 1 + vsrc1 = vld1q_s16( pSrc1 ); + vsrc2 = vld1q_s16( pSrc2 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + + if( isWdt16 ) + { + vsrc1 = vld1q_s16( pSrc1 + 8 ); + vsrc2 = vld1q_s16( pSrc2 + 8 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + } + + pSrc1 += iStrideSrc1; + pSrc2 += iStrideSrc2; + + // 2 + vsrc1 = vld1q_s16( pSrc1 ); + vsrc2 = vld1q_s16( pSrc2 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + + if( isWdt16 ) + { + vsrc1 = vld1q_s16( pSrc1 + 8 ); + vsrc2 = vld1q_s16( pSrc2 + 8 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + } + + pSrc1 += iStrideSrc1; + pSrc2 += iStrideSrc2; + + // 3 + vsrc1 = vld1q_s16( pSrc1 ); + vsrc2 = vld1q_s16( pSrc2 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + + if( isWdt16 ) + { + vsrc1 = vld1q_s16( pSrc1 + 8 ); + vsrc2 = vld1q_s16( pSrc2 + 8 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + } + + pSrc1 += iStrideSrc1; + pSrc2 += iStrideSrc2; + } + + uiSum = vaddlvq_s16( vsum16 ); + uiSum <<= iSubShift; + return uiSum >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ); +} + +template +void xGetSADX5_16xN_SIMDImp( const DistParam& rcDtParam, Distortion* cost ) +{ + int i, j; + const Pel* piOrg = rcDtParam.org.buf; + const Pel* piCur = rcDtParam.cur.buf - 4; + int height = rcDtParam.org.height; + int iSubShift = rcDtParam.subShift; + int iSubStep = ( 1 << iSubShift ); + ptrdiff_t iStrideCur = rcDtParam.cur.stride * iSubStep; + 
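+  // NEON counterpart of the RdCost::xGetSAD16X5 fallback used in xGetSADX5_16xN_SIMD below: +  // for a 16-sample-wide block, the loop accumulates absolute differences (vabaq_s16) for five +  // horizontally shifted candidate positions in a single pass. cost[0], cost[1], cost[3] and cost[4] +  // are always written; the centre cost[2] is only computed when isCalCentrePos is set. The 16-bit +  // accumulators are widened with vaddlvq_s16, shifted up by iSubShift and shifted down by +  // 1 + DISTORTION_PRECISION_ADJUSTMENT( bitDepth ) before being stored.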
ptrdiff_t iStrideOrg = rcDtParam.org.stride * iSubStep; + + int16x8_t sum0 = vdupq_n_s16( 0 ); + int16x8_t sum1 = vdupq_n_s16( 0 ); + int16x8_t sum2 = vdupq_n_s16( 0 ); + int16x8_t sum3 = vdupq_n_s16( 0 ); + int16x8_t sum4 = vdupq_n_s16( 0 ); + + for( i = 0; i < height; i += iSubStep ) + { + for( j = 0; j < 16; j += 8 ) + { + int16x8_t s0 = vld1q_s16( piOrg + j + 0 ); + int16x8_t s1 = vld1q_s16( piCur + j + 0 ); + int16x8_t s2 = vcombine_s16( vld1_s16( piOrg + j + 8 ), vdup_n_s16( 0 ) ); + int16x8_t s3 = vcombine_s16( vld1_s16( piCur + j + 8 ), vdup_n_s16( 0 ) ); + + int16x8_t org0, org1, org2, org3, org4; + org0 = s0; + org1 = vextq_s16( s0, s2, 1 ); + if( isCalCentrePos ) + org2 = vextq_s16( s0, s2, 2 ); + org3 = vextq_s16( s0, s2, 3 ); + org4 = vextq_s16( s0, s2, 4 ); + + int16x8_t cur0, cur1, cur2, cur3, cur4; + cur4 = s1; + cur0 = vextq_s16( s1, s3, 4 ); + cur1 = vextq_s16( s1, s3, 3 ); + if( isCalCentrePos ) + cur2 = vextq_s16( s1, s3, 2 ); + cur3 = vextq_s16( s1, s3, 1 ); + + sum0 = vabaq_s16( sum0, org0, cur0 ); // completely insane + sum1 = vabaq_s16( sum1, org1, cur1 ); + if( isCalCentrePos ) + sum2 = vabaq_s16( sum2, org2, cur2 ); + sum3 = vabaq_s16( sum3, org3, cur3 ); + sum4 = vabaq_s16( sum4, org4, cur4 ); + } + + INCY( piOrg, iStrideOrg ); + INCY( piCur, iStrideCur ); + } + + int32x4_t sum = { vaddlvq_s16( sum0 ), vaddlvq_s16( sum1 ), vaddlvq_s16( sum3 ), vaddlvq_s16( sum4 ) }; + + int32x4_t sumTwo; + if( isCalCentrePos ) + sumTwo = vdupq_n_s32( vaddlvq_s16( sum2 ) ); + + // vshlq_n_s32 doesn't work because iSubShift is not a compile-time constant. + sum = vshlq_s32( sum, vdupq_n_s32( iSubShift ) ); + if( isCalCentrePos ) + sumTwo = vshlq_s32( sumTwo, vdupq_n_s32( iSubShift ) ); + + sum = vshrq_n_s32( sum, ( 1 + ( DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) ) ) ); + if( isCalCentrePos ) + sumTwo = vshrq_n_s32( sumTwo, ( 1 + ( DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) ) ) ); + + vst1q_lane_u64( (uint64_t*) &cost[ 0 ], (uint64x2_t) sum, 0 ); + if( isCalCentrePos ) + cost[ 2 ] = vgetq_lane_s32( sumTwo, 0 ); + vst1q_lane_u64( (uint64_t*) &cost[ 3 ], (uint64x2_t) sum, 1 ); +} + +template +void RdCost::xGetSADX5_16xN_SIMD(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) +{ + if( rcDtParam.bitDepth > 10 ) + { + RdCost::xGetSAD16X5( rcDtParam, cost, isCalCentrePos ); + return; + } + + if( isCalCentrePos ) + xGetSADX5_16xN_SIMDImp( rcDtParam, cost ); + else + xGetSADX5_16xN_SIMDImp( rcDtParam, cost ); +} + +template +void RdCost::_initRdCostARM() +{ + m_afpDistortFunc[0][DF_SAD8 ] = xGetSAD_MxN_SIMD; + m_afpDistortFunc[0][DF_SAD16 ] = xGetSAD_MxN_SIMD; + m_afpDistortFuncX5[1] = xGetSADX5_16xN_SIMD; +} + +#else // !__ARM_ARCH >= 8 + +template +void RdCost::_initRdCostARM() +{} + +#endif // !__ARM_ARCH >= 8 + +template void RdCost::_initRdCostARM(); + +#endif // TARGET_SIMD_ARM + +} // namespace vvenc diff --git a/source/Lib/CommonLib/arm/neon/Buffer_neon.cpp b/source/Lib/CommonLib/arm/neon/Buffer_neon.cpp new file mode 100644 index 000000000..f89806f63 --- /dev/null +++ b/source/Lib/CommonLib/arm/neon/Buffer_neon.cpp @@ -0,0 +1,43 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. 
+ +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +#include "../BufferARM.h" diff --git a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp new file mode 100644 index 000000000..eb6520c2e --- /dev/null +++ b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp @@ -0,0 +1,43 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +#include "../InterpolationFilterARM.h" diff --git a/source/Lib/CommonLib/arm/neon/RdCost_neon.cpp b/source/Lib/CommonLib/arm/neon/RdCost_neon.cpp new file mode 100644 index 000000000..0d8616eb1 --- /dev/null +++ b/source/Lib/CommonLib/arm/neon/RdCost_neon.cpp @@ -0,0 +1,43 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ + +------------------------------------------------------------------------------------------- */ + +#include "../RdCostARM.h" diff --git a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h index 234302159..e0574e79f 100644 --- a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h +++ b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h @@ -49,6 +49,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + template void simdDeriveClassificationBlk(AlfClassifier *classifier, const CPelBuf& srcLuma, const Area& blkDst, const Area& blk, const int shift, const int vbCTUHeight, int vbPos) { diff --git a/source/Lib/CommonLib/x86/CommonDefX86.h b/source/Lib/CommonLib/x86/CommonDefX86.h index df537dc7c..ad2529f80 100644 --- a/source/Lib/CommonLib/x86/CommonDefX86.h +++ b/source/Lib/CommonLib/x86/CommonDefX86.h @@ -84,10 +84,13 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { + +using namespace x86_simd; + const std::string& vext_to_string( X86_VEXT vext ); X86_VEXT string_to_vext( const std::string& ext_name ); -X86_VEXT read_x86_extension_flags( X86_VEXT request = UNDEFINED ); +X86_VEXT read_x86_extension_flags( X86_VEXT request = x86_simd::UNDEFINED ); const std::string& read_x86_extension_name(); diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index 50de7a132..b11f75589 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -623,7 +623,7 @@ static alf_float_t calcErrorForCoeffsLin_13_SSE( const AlfCovariance::TKE& E, co const AlfCovariance& AlfCovariance::operator+= ( const AlfCovariance& src ) { #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_ALF - if( numCoeff == 13 && read_x86_extension_flags() > SCALAR ) + if( numCoeff == 13 && read_x86_extension_flags() > x86_simd::SCALAR ) { for( int b0 = 0; b0 < numBins; b0++ ) { @@ -737,7 +737,7 @@ alf_float_t AlfCovariance::calcErrorForCoeffs( const int *clip, const int if( numCoeff == 13 ) { #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_ALF - if( read_x86_extension_flags() > SCALAR ) + if( read_x86_extension_flags() > x86_simd::SCALAR ) return calcErrorForCoeffsLin_13_SSE( E, y, coeff, invFactor ); else #endif @@ -3172,7 +3172,7 @@ void EncAdaptiveLoopFilter::getPreBlkStats(AlfCovariance* alfCovariance, const A const int halfFilterLength = shape.filterLength >> 1; #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_ALF - const bool useSimd = read_x86_extension_flags() > SCALAR; + const bool useSimd = read_x86_extension_flags() > x86_simd::SCALAR; #else const bool useSimd = false; #endif @@ -6135,7 +6135,7 @@ void EncAdaptiveLoopFilter::getBlkStatsCcAlf(AlfCovariance &alfCovariance, const int effStride = recStride << getComponentScaleY(compID, m_chromaFormat); #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_ALF - const bool useSimd = read_x86_extension_flags() > SCALAR; + const bool useSimd = read_x86_extension_flags() > x86_simd::SCALAR; #endif Pel ELocal[MAX_NUM_CC_ALF_CHROMA_COEFF][16]; diff --git a/source/Lib/apputils/IStreamIO.h b/source/Lib/apputils/IStreamIO.h index c73b33b46..6a3b2c934 100644 --- a/source/Lib/apputils/IStreamIO.h +++ b/source/Lib/apputils/IStreamIO.h @@ -673,6 +673,22 @@ class IStreamToAbbr const std::vector>* toMap; }; +template class FloatRoundingOffset +{ +}; + +template<> class FloatRoundingOffset +{ +public: + static const int offset = 0; +}; + +template<> class FloatRoundingOffset +{ +public: + static const int offset = 1; 
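+  // offset = 1 selects the +/-0.5 rounding term applied in operator>> below, so values given with a +  // scaling abbreviation (e.g. "1.5M") are rounded to the nearest whole value rather than truncated +  // when the destination type is integral; the specialization above keeps offset = 0, so floating-point +  // destinations are converted without rounding.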
+}; + +template inline std::istream& operator >> ( std::istream& in, IStreamToAbbr& toValue ) +{ @@ -698,7 +714,10 @@ inline std::istream& operator >> ( std::istream& in, IStreamToAbbr& toValue double value = strtod(str.c_str(), NULL); // convert input string to double value *= map.value; // scale depending on given abbreviation/scaling factor - *toValue.dstVal = (T)value; + double roundDir = value < 0 ? -1 : ( value > 0 ? 1 : 0 ); + double roundOffset = ( FloatRoundingOffset<std::is_integral<T>::value>::offset / 2.0 ); + value += roundDir * roundOffset; + *toValue.dstVal = ( T ) value; return in; } } diff --git a/source/Lib/apputils/Stats.h b/source/Lib/apputils/Stats.h index d1d62b669..8c2176f1e 100644 --- a/source/Lib/apputils/Stats.h +++ b/source/Lib/apputils/Stats.h @@ -54,6 +54,12 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#if defined (_WIN32) || defined (WIN32) || defined (_WIN64) || defined (WIN64) + #include <io.h> +#elif __linux || __APPLE__ + #include <unistd.h> +#endif + + #include "vvenc/vvenc.h" @@ -165,10 +171,11 @@ class Stats { } - int init( int framerate, int framescale, int maxFrames, std::string prependString = "" ) + int init( int framerate, int framescale, int maxFrames, vvencMsgLevel verbosity, std::string prependString = "" ) { m_framerate = (framerate/(double)framescale); m_maxFrames = maxFrames; + m_verbosity = verbosity; m_preString = prependString; m_bytes = 0; m_bytesCur = 0; @@ -181,6 +188,12 @@ class Stats m_AUStats[VVENC_P_SLICE].reset(m_framerate); m_AUStats[VVENC_B_SLICE].reset(m_framerate); +#if defined (_WIN32) || defined (WIN32) || defined (_WIN64) || defined (WIN64) + m_istty = _isatty( _fileno(stdout)); +#elif __linux || __APPLE__ + m_istty = isatty( fileno(stdout)); +#endif + return 0; } @@ -207,20 +220,29 @@ class Stats return 0; } - std::string getInfoString() + std::string getInfoString( bool finalInfo = false ) { std::stringstream css; - m_tEnd = std::chrono::steady_clock::now(); + m_tEnd = std::chrono::steady_clock::now(); m_tGlobEnd = std::chrono::steady_clock::now(); if( m_bytesCur ) { - double bitrate = (m_bytesCur*8 * m_framerate / (double)m_framesCur ); - double dTime = (double)std::chrono::duration_cast(m_tEnd-m_tStart).count() / 1000.0; - double dGlobTime = (double)std::chrono::duration_cast(m_tGlobEnd-m_tGlobStart).count() / 1000.0; - double dFps = dTime ? (double)m_framesCur / dTime : 0; - double dFpsAvg = dGlobTime ? (double)m_frames / dGlobTime : 0; + double dTime = (double)std::chrono::duration_cast(m_tEnd-m_tStart).count() / 1000.0; + double dGlobTime = (double)std::chrono::duration_cast(m_tGlobEnd-m_tGlobStart).count() / 1000.0; + double bitrateAvg = m_bytes*8 * m_framerate/(double)m_frames / 1000.0; + double dFpsAvg = dGlobTime ? (double)m_frames / dGlobTime : 0; + + double bitrate = finalInfo ? bitrateAvg : (m_bytesCur*8 * m_framerate / (double)m_framesCur/ 1000.0 ); + double dFps = finalInfo ? dFpsAvg : (dTime ? 
(double)m_framesCur / dTime : 0); + + if( bitrate > (double)m_maxratekbps ) + { + m_maxratekbps = static_cast(bitrate); + } + int setwBR = std::max( 8, (int)std::log10(m_maxratekbps) + 4 ); + if ( m_verbosity <= VVENC_INFO && m_istty ) css << "\r"; css << m_preString << "stats: "; if( m_maxFrames > 0 ) { @@ -234,11 +256,12 @@ class Stats css << " frame= " << std::setfill(' ') << std::setw(4) << m_frames; } - css << " fps= " << std::setfill(' ') << std::setw(4) << dFps << " avg_fps= " << std::setfill(' ') << std::setw(4) << dFpsAvg; - css << std::fixed << std::setprecision(2) << " bitrate= " << std::setfill(' ') << std::setw(7) << bitrate/1000.0 << " kbps"; - - bitrate = m_bytes*8 * m_framerate/(double)m_frames; - css << " avg_bitrate= " << std::setfill(' ') << std::setw(7) << bitrate/1000.0 << " kbps"; + css << std::fixed << std::setprecision(1); + css << " fps= " << std::setfill(' ') << std::setw(5) << dFps; + css << " avg_fps= " << std::setfill(' ') << std::setw(5) << dFpsAvg; + css << std::fixed << std::setprecision(2); + css << " bitrate= " << std::setfill(' ') << std::setw(setwBR) << bitrate << " kbps"; + css << " avg_bitrate= " << std::setfill(' ') << std::setw(setwBR) << bitrateAvg << " kbps"; int sec = std::ceil(dGlobTime); int days = sec/86400; @@ -269,7 +292,9 @@ class Stats css << std::setfill('0') << std::setw(2) << min << "m:"; css << std::setfill('0') << std::setw(2) << sec << "s"; } - css << std::setprecision(-1) << std::endl; + css << std::setprecision(-1) << " "; + + if ( m_verbosity > VVENC_INFO || 0 == m_istty ) css << std::endl; } m_bytesCur = 0; @@ -290,7 +315,9 @@ class Stats double dGlobTime = (double)std::chrono::duration_cast(m_tGlobEnd-m_tGlobStart).count() / 1000.0; double dFpsAvg = dGlobTime ? (double)m_frames / dGlobTime : 0; - css << std::endl << m_preString << "stats summary:"; + css << getInfoString( true ); + css << std::endl; + css << m_preString << "stats summary:"; css << " frame= " << m_frames; if( m_maxFrames > 0 ) { @@ -326,10 +353,13 @@ class Stats double m_framerate = 1.0; int m_maxFrames = 0; + vvencMsgLevel m_verbosity = VVENC_INFO; std::string m_preString; + int m_istty = 0; uint64_t m_bytes = 0; uint64_t m_bytesCur = 0; + int m_maxratekbps = 0; int m_frames = 0; int m_framesCur = 0; diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h index 9aff214b6..98abe766f 100644 --- a/source/Lib/apputils/VVEncAppCfg.h +++ b/source/Lib/apputils/VVEncAppCfg.h @@ -380,6 +380,16 @@ const std::vector> BitrateAbrevToIntMap = { "bps", 1 } // bit/sec }; +const std::vector> BitrateOrScaleAbrevToIntMap = +{ + { "Mbps", 1000000 }, // mega bit/sec + { "M", 1000000 }, + { "kbps", 1000 }, // kilo bit/sec + { "k", 1000 }, + { "bps", 1 }, // bit/sec + { "x", -16 } // negative value: multiplier of target bitrate, with a fixed-point accuracy of 4 bit +}; + //// ==================================================================================================================== //// string <-> enum //// ==================================================================================================================== @@ -494,8 +504,8 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) IStreamToRefVec toNumTiles ( { &c->m_numTileCols, &c->m_numTileRows }, true, 'x' ); IStreamToFunc toInputFormatBitdepth ( setInputBitDepthAndColorSpace, this, c, &BitColorSpaceToIntMap, YUV420_8 ); - IStreamToAbbr toBitrate ( &c->m_RCTargetBitrate, &BitrateAbrevToIntMap ); - IStreamToAbbr toMaxRate ( &c->m_RCMaxBitrate, 
&BitrateAbrevToIntMap ); + IStreamToAbbr toBitrate ( &c->m_RCTargetBitrate, &BitrateAbrevToIntMap ); + IStreamToAbbr toMaxRate ( &c->m_RCMaxBitrate, &BitrateOrScaleAbrevToIntMap ); IStreamToEnum toDecRefreshType ( &c->m_DecodingRefreshType, &DecodingRefreshTypeToEnumMap ); IStreamToEnum toAud ( &c->m_AccessUnitDelimiter, &FlagToIntMap ); @@ -618,7 +628,8 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("bitrate,b", toBitrate, "bitrate for rate control (0: constant-QP encoding without rate control; otherwise\n" "bits/second; use e.g. 1.5M, 1.5Mbps, 1500k, 1500kbps, 1500000bps, 1500000)") ("maxrate,m", toMaxRate, "approximate maximum instantaneous bitrate for constrained VBR in rate control (0:\n" - "no rate cap; use e.g. 3.5M, 3.5Mbps, 3500k, 3500kbps, 3500000bps, 3500000)") + "no rate cap; use e.g. 3.5M, 3.5Mbps, 3500k, 3500kbps, 3500000bps, 3500000), use suffix 'x' " + "to specify as a multiple of target bitrate") ("passes,p", c->m_RCNumPasses, "number of encoding passes with rate control (1: single-pass, -1, 2: two-pass RC)") ("pass", c->m_RCPass, "rate control pass for two-pass rate control (-1: both, 1: first, 2: second pass)") ("rcstatsfile", m_RCStatsFileName, "rate control statistics file name") @@ -1134,7 +1145,8 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("bitrate", toBitrate, "bitrate for rate control (0: constant-QP encoding without rate control, otherwise " "bits/second; use e.g. 1.5M, 1.5Mbps, 1500k, 1500kbps, 1500000bps, 1500000)") ("maxrate", toMaxRate, "approximate maximum instantaneous bitrate for constrained VBR in rate control (0: " - "no rate cap; use e.g. 3.5M, 3.5Mbps, 3500k, 3500kbps, 3500000bps, 3500000)") + "no rate cap; use e.g. 3.5M, 3.5Mbps, 3500k, 3500kbps, 3500000bps, 3500000), use suffix 'x' " + "to specify as a multiple of target bitrate") ("qpa", toQPA, "Enable perceptually motivated QP adaptation, XPSNR based (0:off, 1:on)", true) ; } diff --git a/source/Lib/vvenc/CMakeLists.txt b/source/Lib/vvenc/CMakeLists.txt index 96bcfc471..b224d9f76 100644 --- a/source/Lib/vvenc/CMakeLists.txt +++ b/source/Lib/vvenc/CMakeLists.txt @@ -42,15 +42,21 @@ if( VVENC_ENABLE_X86_SIMD ) #file( GLOB SSE42_SRC_FILES "../CommonLib/x86/sse42/*.cpp" ) endif() +if( VVENC_ENABLE_ARM_SIMD ) + file( GLOB ARM_SRC_FILES "../CommonLib/arm/*.cpp" ) + file( GLOB ARM_INC_FILES "../CommonLib/arm/*.h" ) + + file( GLOB ARM_NEON_SRC_FILES "../CommonLib/arm/neon/*.cpp" ) +endif() + # get public/extern include files file( GLOB PUBLIC_INC_FILES "../../../include/${LIB_NAME}/*.h" ) -# get all source files -set( SRC_FILES ${BASE_SRC_FILES} ${X86_SRC_FILES} ) - -# get all include files -file( GLOB PRIVATE_INC_FILES ${BASE_INC_FILES} ${X86_INC_FILES} ) +# get all private include files +set( PRIVATE_INC_FILES ${BASE_INC_FILES} ${X86_INC_FILES} ${ARM_INC_FILES} ) +# get all source files +set( SRC_FILES ${BASE_SRC_FILES} ${X86_SRC_FILES} ${ARM_SRC_FILES} ) set( INC_FILES ${PRIVATE_INC_FILES} ${PUBLIC_INC_FILES} ) # NATVIS files for Visual Studio @@ -69,7 +75,7 @@ add_compile_definitions( ${LIB_NAME_UC}_SOURCE ) # set PRIVATE include directories for all targets in this directory include_directories( $ $ ) -include_directories( . .. ../DecoderLib ../EncoderLib ../CommonLib ../CommonLib/x86 ../apputils ) +include_directories( . .. 
../DecoderLib ../EncoderLib ../CommonLib ../CommonLib/x86 ../CommonLib/arm ../apputils ) include_directories( SYSTEM ../../../thirdparty ) # set common warning flags @@ -122,12 +128,27 @@ if( VVENC_ENABLE_X86_SIMD ) set_target_properties( ${LIB_NAME}_x86_simd PROPERTIES FOLDER lib ) endif() +if( VVENC_ENABLE_ARM_SIMD ) + # set needed compile definitions + set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_NEON ) + + add_library( ${LIB_NAME}_arm_simd OBJECT ${ARM_NEON_SRC_FILES} ) + # NEON is enabled by default for all files, so don't need to disable LTO + # set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES + # INTERPROCEDURAL_OPTIMIZATION OFF + # INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF + # INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF + # INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF ) + + set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES FOLDER lib ) +endif() + # set resource file for MSVC compilers if( MSVC ) set( RESOURCE_FILE ${LIB_NAME}.rc ) endif() -add_library( ${LIB_NAME} ${SRC_FILES} $<$:$> ${INC_FILES} ${NATVIS_FILES} ${RESOURCE_FILE} ) +add_library( ${LIB_NAME} ${SRC_FILES} $<$:$> $<$:$> ${INC_FILES} ${NATVIS_FILES} ${RESOURCE_FILE} ) target_compile_definitions( ${LIB_NAME} PUBLIC $<$,SHARED_LIBRARY>:${LIB_NAME_UC}_DYN_LINK> ) diff --git a/source/Lib/vvenc/vvencCfg.cpp b/source/Lib/vvenc/vvencCfg.cpp index 9765a5045..24a32987e 100644 --- a/source/Lib/vvenc/vvencCfg.cpp +++ b/source/Lib/vvenc/vvencCfg.cpp @@ -762,10 +762,14 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) vvenc_confirmParameter( c, c->m_pictureTimingSEIEnabled, "Enabling pictureTiming SEI requires rate control" ); vvenc_confirmParameter( c, c->m_RCMaxBitrate > 0, "Specifying a maximum bitrate requires rate control" ); } - else if ( c->m_RCMaxBitrate <= 0 ) + else if ( c->m_RCMaxBitrate == 0 ) { c->m_RCMaxBitrate = INT32_MAX; } + else if( c->m_RCMaxBitrate < 0 ) + { + c->m_RCMaxBitrate = ( -c->m_RCMaxBitrate * c->m_RCTargetBitrate + 8 ) >> 4; + } vvenc_confirmParameter( c, c->m_HdrMode < VVENC_HDR_OFF || c->m_HdrMode > VVENC_SDR_BT470BG, "Sdr/Hdr Mode must be in the range 0 - 8" ); diff --git a/source/Lib/vvenc/vvencimpl.cpp b/source/Lib/vvenc/vvencimpl.cpp index 0f957e853..a12a75f80 100644 --- a/source/Lib/vvenc/vvencimpl.cpp +++ b/source/Lib/vvenc/vvencimpl.cpp @@ -803,23 +803,28 @@ const char* VVEncImpl::setSIMDExtension( const char* simdId ) THROW( "requested SIMD level (" << simdReqStr << ") not supported by current CPU (max " << read_x86_extension_name() << ")." ); } -# if ENABLE_SIMD_OPT_BUFFER +#if ENABLE_SIMD_OPT_BUFFER + #if defined( TARGET_SIMD_X86 ) g_pelBufOP.initPelBufOpsX86(); -# endif -# if ENABLE_SIMD_TRAFO - g_tCoeffOps.initTCoeffOpsX86(); -# endif + #endif + #if defined( TARGET_SIMD_ARM ) + g_pelBufOP.initPelBufOpsARM(); + #endif +#endif +#if ENABLE_SIMD_TRAFO + g_tCoeffOps.initTCoeffOpsX86(); +#endif return read_x86_extension_name().c_str(); } -# if HANDLE_EXCEPTION +#if HANDLE_EXCEPTION catch( Exception& e ) { MsgLog msg; msg.log( VVENC_ERROR, "\n%s\n", e.what() ); return nullptr; } -# endif // HANDLE_EXCEPTION +#endif // HANDLE_EXCEPTION #else // !TARGET_SIMD_X86 if( !simdReqStr.empty() && simdReqStr != "SCALAR" ) {