diff --git a/cfg/experimental/lowdelay_medium.cfg b/cfg/experimental/lowdelay_medium.cfg
index 5534da9e4..2a30481ed 100644
--- a/cfg/experimental/lowdelay_medium.cfg
+++ b/cfg/experimental/lowdelay_medium.cfg
@@ -70,7 +70,7 @@ DualITree                     : 1      # separate partitioning of luma and chrom
 MinQTLumaISlice               : 8
 MinQTChromaISliceInChromaSamples : 4   # minimum QT size in chroma samples for chroma separate tree
 MinQTNonISlice                : 8
-MaxMTTDepth                   : 221111
+MaxMTTDepth                   : 1
 MaxMTTDepthI                  : 2
 MaxNumMergeCand               : 6      # Maximum number of merge candidates
 
@@ -130,7 +130,7 @@ ContentBasedFastQtbt          : 1      # Signal based QTBT speed-up: 0: disabled
 PBIntraFast                   : 1      # Intra mode pre-check dependent on best Inter mode, skip intra if it is not probable (0:off ... 2:fastest)
 FastQtBtEnc                   : 1      # Fast encoding setting for QTBT
 FastHAD                       : 0      # Use fast sub-sampled hadamard for square blocks >=32x32
-FastMrg                       : 3      # Fast methods for inter merge: 0: disabled, 1: vtm, 2: fast, 3: faster
+FastMrg                       : 2      # Fast methods for inter merge: 0: disabled, 1: vtm, 2: fast, 3: faster
 FastLocalDualTreeMode         : 1      # Fast intra pass coding for local dual-tree in intra coding region: 0: disable, 1: use threshold, 2: one intra mode only
 FastSubPel                    : 1      # Fast sub-pel ME: 0: disabled, 1: enabled
 FastIntraTools                : 1      # Speedup intra tools: LFNST, ISP, MTS
diff --git a/cfg/randomaccess_medium.cfg b/cfg/randomaccess_medium.cfg
index 9377b27f4..abd3ff023 100644
--- a/cfg/randomaccess_medium.cfg
+++ b/cfg/randomaccess_medium.cfg
@@ -22,7 +22,7 @@ BipredSearchRange             : 4           # Search range for bi-prediction ref
 HadamardME                    : 1           # Use of hadamard measure for fractional ME
 FEN                           : 3           # Fast encoder decision
 FDM                           : 1           # Fast Decision for Merge RD cost
-NumRefPics                    : 222111      # Number of reference pictures in RPL (0: default for RPL, <10: apply for all temporal layers, >=10: each decimal digit specifies the number for a temporal layer, last digit applying to the highest TL)
+NumRefPics                    : 222221      # Number of reference pictures in RPL (0: default for RPL, <10: apply for all temporal layers, >=10: each decimal digit specifies the number for a temporal layer, last digit applying to the highest TL)
 NumRefPicsSCC                 : 0           # Number of reference pictures in RPL for SCC pictures (semantic analogue to NumRefPics, -1: equal to NumRefPics)
 
 #======== Quantization =============
@@ -58,7 +58,7 @@ DualITree                     : 1      # separate partitioning of luma and chrom
 MinQTLumaISlice               : 8
 MinQTChromaISliceInChromaSamples : 4   # minimum QT size in chroma samples for chroma separate tree
 MinQTNonISlice                : 8
-MaxMTTDepth                   : 221111
+MaxMTTDepth                   : 1
 MaxMTTDepthI                  : 2
 MaxNumMergeCand               : 6      # Maximum number of merge candidates
 
@@ -118,7 +118,7 @@ ContentBasedFastQtbt          : 1      # Signal based QTBT speed-up: 0: disabled
 PBIntraFast                   : 1      # Intra mode pre-check dependent on best Inter mode, skip intra if it is not probable (0:off ... 2:fastest)
 FastQtBtEnc                   : 1      # Fast encoding setting for QTBT
 FastHAD                       : 0      # Use fast sub-sampled hadamard for square blocks >=32x32
-FastMrg                       : 3      # Fast methods for inter merge: 0: disabled, 1: vtm, 2: fast, 3: faster
+FastMrg                       : 2      # Fast methods for inter merge: 0: disabled, 1: vtm, 2: fast, 3: faster
 FastLocalDualTreeMode         : 1      # Fast intra pass coding for local dual-tree in intra coding region: 0: disable, 1: use threshold, 2: one intra mode only
 FastSubPel                    : 1      # Fast sub-pel ME: 0: disabled, 1: enabled
 FastIntraTools                : 1      # Speedup intra tools: LFNST, ISP, MTS
diff --git a/include/vvenc/vvencCfg.h b/include/vvenc/vvencCfg.h
index f8d63fd8f..c1f46ced9 100644
--- a/include/vvenc/vvencCfg.h
+++ b/include/vvenc/vvencCfg.h
@@ -774,7 +774,7 @@ typedef struct vvenc_config
                                                                                          // if negative, the absolute value is interpreted as a 4-bit fixed point multiplier of the target bitrate).
                                                                                          // -24, i.e. -1.1000 binary, means the maxrate would be set to be the 1.5x of the target bitrate.
                                                                                          // for convenience use VVENC_SET_MAXRATE_FACTOR, e.g. VVENC_SET_MAXRATE_FACTOR(1.5), to set the multiplier
-  int                 m_reservedInt;
+  int                 m_forceScc;                                                           // NOTE(review): MCTF::filter() derives the per-picture SCC flag itself only while this is <= 0; meaning of positive values is not visible here - confirm
   double              m_reservedDouble[9];
 
   // internal state variables
diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h
index 48532587e..b804a2e49 100644
--- a/source/Lib/CommonLib/CommonDef.h
+++ b/source/Lib/CommonLib/CommonDef.h
@@ -582,9 +582,11 @@ inline std::string prnt( const char* fmt, ...)
 
 #if ( _WIN32 && ( _MSC_VER > 1300 ) ) || defined (__MINGW64_VERSION_MAJOR)
 #define xMalloc( type, len )        _aligned_malloc( sizeof(type)*(len), MEMORY_ALIGN_DEF_SIZE )
+#define xMalloc2( type, len, alg )  _aligned_malloc( sizeof(type)*(len), alg ) // as xMalloc, but aligned to the caller-supplied alg instead of MEMORY_ALIGN_DEF_SIZE
 #define xFree( ptr )                _aligned_free  ( ptr )
 #elif defined (__MINGW32__)
 #define xMalloc( type, len )        __mingw_aligned_malloc( sizeof(type)*(len), MEMORY_ALIGN_DEF_SIZE )
+#define xMalloc2( type, len, alg )  __mingw_aligned_malloc( sizeof(type)*(len), alg ) // as xMalloc, but aligned to the caller-supplied alg instead of MEMORY_ALIGN_DEF_SIZE
 #define xFree( ptr )                __mingw_aligned_free( ptr )
 #else
 namespace detail {
@@ -599,11 +601,13 @@ static inline T* aligned_malloc(size_t len, size_t alignement) {
 }
 }
 #define xMalloc( type, len )        detail::aligned_malloc<type>( len, MEMORY_ALIGN_DEF_SIZE )
+#define xMalloc2( type, len, alg )  detail::aligned_malloc<type>( len, alg ) // as xMalloc, but aligned to the caller-supplied alg instead of MEMORY_ALIGN_DEF_SIZE
 #define xFree( ptr )                free( ptr )
 #endif
 
 #else
 #define xMalloc( type, len )        malloc   ( sizeof(type)*(len) )
+#define xMalloc2( type, len, alg )  malloc   ( sizeof(type)*(len) ) // ALIGNED_MALLOC disabled: alg is ignored, only the default malloc alignment is provided
 #define xFree( ptr )                free     ( ptr )
 #endif //#if ALIGNED_MALLOC
 
diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp
index 961e03133..a77964072 100644
--- a/source/Lib/CommonLib/DepQuant.cpp
+++ b/source/Lib/CommonLib/DepQuant.cpp
@@ -44,10 +44,6 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "TrQuant.h"
 #include "CodingStructure.h"
 #include "UnitTools.h"
-#ifdef TARGET_SIMD_X86
-#  include "x86/CommonDefX86.h"
-#  include <simde/x86/sse4.1.h>
-#endif
 
 #include <bitset>
 
@@ -59,105 +55,6 @@ namespace vvenc {
 
 namespace DQIntern
 {
-  /*================================================================================*/
-  /*=====                                                                      =====*/
-  /*=====   R A T E   E S T I M A T O R                                        =====*/
-  /*=====                                                                      =====*/
-  /*================================================================================*/
-
-  struct NbInfoSbb
-  {
-    //uint8_t   num;
-    uint8_t   numInv;
-    //uint8_t   inPos[5];
-    uint8_t   invInPos[5];
-  };
-  struct NbInfoOut
-  {
-    uint16_t  maxDist;
-    uint16_t  num;
-    uint16_t  outPos[5];
-  };
-  struct CoeffFracBits
-  {
-    int32_t   bits[6];
-  };
-
-
-  enum ScanPosType : int8_t { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 };
-
-  struct ScanInfo
-  {
-    ScanInfo() {}
-    short         numSbb;
-    short         scanIdx;
-    short         rasterPos;
-    short         sbbPos; // byte
-    short         nextSbbRight;
-    short         nextSbbBelow;
-    int8_t        sbbSize;
-    int8_t        insidePos;
-    int8_t        nextInsidePos;
-    ScanPosType   spt;
-    int8_t        posX;
-    int8_t        posY;
-    int8_t        sigCtxOffsetNext;
-    int8_t        gtxCtxOffsetNext;
-    NbInfoSbb     currNbInfoSbb;
-  };
-
-  class Rom;
-  struct TUParameters
-  {
-    TUParameters ( const Rom& rom, const unsigned width, const unsigned height, const ChannelType chType );
-    ~TUParameters()
-    {
-      delete [] m_scanInfo;
-    }
-
-    ChannelType       m_chType;
-    unsigned          m_width;
-    unsigned          m_height;
-    unsigned          m_numCoeff;
-    unsigned          m_numSbb;
-    unsigned          m_log2SbbWidth;
-    unsigned          m_log2SbbHeight;
-    unsigned          m_log2SbbSize;
-    unsigned          m_sbbSize;
-    unsigned          m_sbbMask;
-    unsigned          m_widthInSbb;
-    unsigned          m_heightInSbb;
-    const ScanElement *m_scanSbbId2SbbPos;
-    const ScanElement *m_scanId2BlkPos;
-    const NbInfoSbb*  m_scanId2NbInfoSbb;
-    const NbInfoOut*  m_scanId2NbInfoOut;
-    ScanInfo*         m_scanInfo;
-  private:
-    void xSetScanInfo( ScanInfo& scanInfo, int scanIdx );
-  };
-
-  class Rom
-  {
-  public:
-    Rom() : m_scansInitialized(false) {}
-    ~Rom() { xUninitScanArrays(); }
-    void                init        ()                       { xInitScanArrays(); }
-    const NbInfoSbb*    getNbInfoSbb( int hd, int vd ) const { return m_scanId2NbInfoSbbArray[hd][vd]; }
-    const NbInfoOut*    getNbInfoOut( int hd, int vd ) const { return m_scanId2NbInfoOutArray[hd][vd]; }
-    const TUParameters* getTUPars   ( const CompArea& area, const ComponentID compID ) const
-    {
-      return m_tuParameters[Log2(area.width)][Log2(area.height)][toChannelType(compID)];
-    }
-  private:
-    void  xInitScanArrays   ();
-    void  xUninitScanArrays ();
-  private:
-    bool          m_scansInitialized;
-    NbInfoSbb*    m_scanId2NbInfoSbbArray[ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ];
-    NbInfoOut*    m_scanId2NbInfoOutArray[ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ];
-    TUParameters* m_tuParameters         [ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ][ MAX_NUM_CH ];
-  };
-
   void Rom::xInitScanArrays()
   {
     if( m_scansInitialized )
@@ -427,48 +324,6 @@ namespace DQIntern
     }
   }
 
-
-
-  class RateEstimator
-  {
-  public:
-    RateEstimator () {}
-    ~RateEstimator() {}
-    void initCtx  ( const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID, const FracBitsAccess& fracBitsAccess );
-
-    inline const BinFracBits *sigSbbFracBits() const { return m_sigSbbFracBits; }
-    inline const BinFracBits *sigFlagBits(unsigned stateId) const
-    {
-      return m_sigFracBits[std::max(((int) stateId) - 1, 0)];
-    }
-    inline const CoeffFracBits *gtxFracBits(unsigned stateId) const { return m_gtxFracBits; }
-    inline int32_t              lastOffset(unsigned scanIdx) const
-    {
-      return m_lastBitsX[m_scanId2Pos[scanIdx].x] + m_lastBitsY[m_scanId2Pos[scanIdx].y];
-    }
-
-  private:
-    void  xSetLastCoeffOffset ( const FracBitsAccess& fracBitsAccess, const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID );
-    void  xSetSigSbbFracBits  ( const FracBitsAccess& fracBitsAccess, ChannelType chType );
-    void  xSetSigFlagBits     ( const FracBitsAccess& fracBitsAccess, ChannelType chType );
-    void  xSetGtxFlagBits     ( const FracBitsAccess& fracBitsAccess, ChannelType chType );
-
-  private:
-    static const unsigned sm_numCtxSetsSig    = 3;
-    static const unsigned sm_numCtxSetsGtx    = 2;
-    static const unsigned sm_maxNumSigSbbCtx  = 2;
-    static const unsigned sm_maxNumSigCtx     = 12;
-    static const unsigned sm_maxNumGtxCtx     = 21;
-
-  private:
-    const ScanElement * m_scanId2Pos;
-    int32_t             m_lastBitsX      [ MAX_TB_SIZEY ];
-    int32_t             m_lastBitsY      [ MAX_TB_SIZEY ];
-    BinFracBits         m_sigSbbFracBits [ sm_maxNumSigSbbCtx ];
-    BinFracBits         m_sigFracBits    [ sm_numCtxSetsSig   ][ sm_maxNumSigCtx ];
-    CoeffFracBits       m_gtxFracBits                          [ sm_maxNumGtxCtx ];
-  };
-
   void RateEstimator::initCtx( const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID, const FracBitsAccess& fracBitsAccess )
   {
     m_scanId2Pos = tuPars.m_scanId2BlkPos;
@@ -598,69 +453,7 @@ namespace DQIntern
     }
   }
 
-
-
-
-
-  /*================================================================================*/
-  /*=====                                                                      =====*/
-  /*=====   D A T A   S T R U C T U R E S                                      =====*/
-  /*=====                                                                      =====*/
-  /*================================================================================*/
-
-
-  struct PQData
-  {
-    TCoeff  absLevel;
-    int64_t deltaDist;
-  };
-
-
-  struct Decision
-  {
-    int64_t rdCost;
-    TCoeff  absLevel;
-    int     prevId;
-  };
-
-
-
-
-  /*================================================================================*/
-  /*=====                                                                      =====*/
-  /*=====   P R E - Q U A N T I Z E R                                          =====*/
-  /*=====                                                                      =====*/
-  /*================================================================================*/
-
-  class Quantizer
-  {
-  public:
-    Quantizer() {}
-    void  init                 ( int dqThrVal ) { m_DqThrVal = dqThrVal; }
-    void  dequantBlock         ( const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, CoeffBuf& recCoeff, bool enableScalingLists, int* piDequantCoef ) const;
-    void  initQuantBlock       ( const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, const double lambda, int gValue );
-    inline void   preQuantCoeff( const TCoeff absCoeff, PQData *pqData, int quanCoeff ) const;
-    inline TCoeff getLastThreshold() const { return m_thresLast; }
-    inline TCoeff getSSbbThreshold() const { return m_thresSSbb; }
-
-    inline int64_t getQScale()       const { return m_QScale; }
-  private:
-    // quantization
-    int               m_DqThrVal;
-    int               m_QShift;
-    int64_t           m_QAdd;
-    int64_t           m_QScale;
-    TCoeff            m_maxQIdx;
-    TCoeff            m_thresLast;
-    TCoeff            m_thresSSbb;
-    // distortion normalization
-    int               m_DistShift;
-    int64_t           m_DistAdd;
-    int64_t           m_DistStepAdd;
-    int64_t           m_DistOrgFact;
-  };
-
-  void Quantizer::initQuantBlock(const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, const double lambda, int gValue = -1)
+  void Quantizer::initQuantBlock(const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, const double lambda, int gValue)
   {
     CHECKD( lambda <= 0.0, "Lambda must be greater than 0" );
 
@@ -758,16 +551,33 @@ namespace DQIntern
     }
   }
 
-  inline void Quantizer::preQuantCoeff( const TCoeff absCoeff, PQData* pqData, int quanCoeff ) const
+  bool Quantizer::preQuantCoeff( const TCoeff absCoeff, PQData* pqData, int quanCoeff ) const
   {
     int64_t scaledOrg = int64_t( absCoeff ) * quanCoeff;
-    TCoeff  qIdx      = std::max<TCoeff>( 1, std::min<TCoeff>( m_maxQIdx, TCoeff( ( scaledOrg + m_QAdd ) >> m_QShift ) ) );
+    TCoeff  qIdx      = TCoeff( ( scaledOrg + m_QAdd ) >> m_QShift );
+    // raw (unclamped) index; a negative value selects the near-zero shortcut below
+    if( qIdx < 0 )
+    {
+      int64_t scaledAdd = m_DistStepAdd - scaledOrg * m_DistOrgFact; // same expression as the regular path, evaluated for qIdx == 1
+      PQData& pq_a      = pqData[1];
+      PQData& pq_b      = pqData[2];
+
+      pq_a.deltaDist    = ( ( scaledAdd + 0 * m_DistStepAdd ) * 1 + m_DistAdd ) >> m_DistShift;
+      pq_a.absLevel     = 1;
+
+      pq_b.deltaDist    = ( ( scaledAdd + 1 * m_DistStepAdd ) * 2 + m_DistAdd ) >> m_DistShift;
+      pq_b.absLevel     = 1;
+      // only pqData[1] and pqData[2] are written on this path; pqData[0] and pqData[3] stay untouched
+      return true;
+    }
+    // regular path: force the index to at least 1 and at most m_maxQIdx, fill all four candidates, return false
+    qIdx              = std::max<TCoeff>( 1, std::min<TCoeff>( m_maxQIdx, qIdx ) );
     int64_t scaledAdd = qIdx * m_DistStepAdd - scaledOrg * m_DistOrgFact;
 
-    PQData& pq_a      = pqData[ ( qIdx + 0 ) & 3 ];
-    PQData& pq_b      = pqData[ ( qIdx + 1 ) & 3 ];
-    PQData& pq_c      = pqData[ ( qIdx + 2 ) & 3 ];
-    PQData& pq_d      = pqData[ ( qIdx + 3 ) & 3 ];
+    PQData& pq_a      = pqData[( qIdx + 0 ) & 3];
+    PQData& pq_b      = pqData[( qIdx + 1 ) & 3];
+    PQData& pq_c      = pqData[( qIdx + 2 ) & 3];
+    PQData& pq_d      = pqData[( qIdx + 3 ) & 3];
 
     pq_a.deltaDist    = ( ( scaledAdd + 0 * m_DistStepAdd ) * ( qIdx + 0 ) + m_DistAdd ) >> m_DistShift;
     pq_a.absLevel     = ( qIdx + 1 ) >> 1;
@@ -780,13 +590,9 @@ namespace DQIntern
 
     pq_d.deltaDist    = ( ( scaledAdd + 3 * m_DistStepAdd ) * ( qIdx + 3 ) + m_DistAdd ) >> m_DistShift;
     pq_d.absLevel     = ( qIdx + 4 ) >> 1;
-  }
-
-
-
-
-
 
+    return false;
+  }
 
   /*================================================================================*/
   /*=====                                                                      =====*/
@@ -796,6 +602,13 @@ namespace DQIntern
 
   class State;
 
+  struct Decision // one trellis entry (DepQuant keeps them in m_trellis)
+  {
+    int64_t rdCost;   // cost of the decision; a candidate replaces it only when its own cost is smaller
+    TCoeff  absLevel; // absolute level chosen for the coefficient
+    int     prevId;   // m_stateId of the state whose candidate won the comparison
+  };
+
   struct SbbCtx
   {
     uint8_t*  sbbFlags;
@@ -834,7 +647,6 @@ namespace DQIntern
     uint8_t                     m_memory[ 8 * ( MAX_TB_SIZEY * MAX_TB_SIZEY + MLS_GRP_NUM ) ];
   };
 
-#define RICEMAX 32
   const int32_t g_goRiceBits[4][RICEMAX] =
   {
     { 32768,  65536,  98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
@@ -983,7 +795,7 @@ namespace DQIntern
       if( rdCostA < decisionA.rdCost )
       {
         decisionA.rdCost    = rdCostA;
-        decisionA.absLevel  = pqDataA.absLevel;
+        decisionA.absLevel  = 1;
         decisionA.prevId    = m_stateId;
       }
 
@@ -1085,7 +897,7 @@ namespace DQIntern
     : m_sbbFracBits     { { 0, 0 } }
     , m_stateId         ( stateId )
     , m_sigFracBitsArray( rateEst.sigFlagBits(stateId) )
-    , m_gtxFracBitsArray( rateEst.gtxFracBits(stateId) )
+    , m_gtxFracBitsArray( rateEst.gtxFracBits() )
     , m_commonCtx       ( commonCtx )
   {
   }
@@ -1119,7 +931,7 @@ namespace DQIntern
 
       if( decision.absLevel )
       {
-        m_sbb.absLevels[scanInfo.insidePos] = ( uint8_t ) std::min<TCoeff>( 255, decision.absLevel );
+        m_sbb.absLevels[scanInfo.insidePos] = ( uint8_t ) std::min<TCoeff>( 254 + ( decision.absLevel & 1 ), decision.absLevel ); // saturate to 8 bit but keep parity: odd levels cap at 255, even levels at 254
         
         if( scanInfo.currNbInfoSbb.numInv )
         {
@@ -1212,7 +1024,7 @@ namespace DQIntern
         ::memset( m_sbb.absLevels, 0, sizeof( m_sbb.absLevels ) );
       }
 
-      m_sbb.absLevels[ scanInfo.insidePos ] = (uint8_t)std::min<TCoeff>( 255, decision.absLevel );
+      m_sbb.absLevels[ scanInfo.insidePos ] = (uint8_t)std::min<TCoeff>( 254 + ( decision.absLevel & 1 ), decision.absLevel ); // saturate to 8 bit but keep parity: odd levels cap at 255, even levels at 254
 
       m_commonCtx.update( scanInfo, prvState, *this );
 
@@ -1295,21 +1107,17 @@ namespace DQIntern
     }
   }
 
-
-
   /*================================================================================*/
   /*=====                                                                      =====*/
   /*=====   T C Q                                                              =====*/
   /*=====                                                                      =====*/
   /*================================================================================*/
-  class DepQuant : private RateEstimator
+  class DepQuant : private RateEstimator, public DepQuantImpl
   {
   public:
     DepQuant( bool enc );
 
-    void    quant   ( TransformUnit& tu, const CCoeffBuf& srcCoeff, const ComponentID compID, const QpParam& cQP, const double lambda, const Ctx& ctx, TCoeff& absSum, bool enableScalingLists, int* quantCoeff );
-    void    dequant ( const TransformUnit& tu, CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP, bool enableScalingLists, int* quantCoeff );
-    void    init    ( int dqTrVal );
+    void quant( TransformUnit& tu, const CCoeffBuf& srcCoeff, const ComponentID compID, const QpParam& cQP, const double lambda, const Ctx& ctx, TCoeff& absSum, bool enableScalingLists, int* quantCoeff );
 
   private:
     void    xDecideAndUpdate  ( const TCoeff absCoeff, const ScanInfo& scanInfo, bool zeroOut, int quantCoeff);
@@ -1322,7 +1130,6 @@ namespace DQIntern
     State*      m_prevStates;
     State*      m_skipStates;
     State       m_startState;
-    Quantizer   m_quant;
     Decision    m_trellis[ MAX_TB_SIZEY * MAX_TB_SIZEY ][ 8 ];
     Rom         m_scansRom;
   };
@@ -1354,17 +1161,6 @@ namespace DQIntern
   }
 #undef TINIT
 
-
-  void DepQuant::dequant( const TransformUnit& tu,  CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP, bool enableScalingLists, int* piDequantCoef )
-  {
-    m_quant.dequantBlock( tu, compID, cQP, recCoeff, enableScalingLists, piDequantCoef );
-  }
-
-  void DepQuant::init( int dqTrVal )
-  {
-    m_quant.init( dqTrVal );
-  }
-
   void DepQuant::xDecide( const ScanInfo &scanInfo, const TCoeff absCoeff, const int lastOffset, Decision* decisions, bool zeroOut, int quanCoeff )
   {
     ::memcpy( decisions, startDec, 4*sizeof(Decision) );
@@ -1382,9 +1178,7 @@ namespace DQIntern
     }
 
     PQData  pqData[4];
-    m_quant.preQuantCoeff( absCoeff, pqData, quanCoeff );
-
-    bool near0 = pqData[1].deltaDist < pqData[2].deltaDist && pqData[1].absLevel == 1 && pqData[2].absLevel == 1;
+    bool near0 = m_quant.preQuantCoeff( absCoeff, pqData, quanCoeff ); // true: raw quantization index was negative and only pqData[1] / pqData[2] were filled
 
     if( near0 )
     {
@@ -1392,6 +1186,8 @@ namespace DQIntern
       m_prevStates[1].checkRdCostsOdd1( scanInfo.spt, pqData[2], decisions[0], decisions[2] );
       m_prevStates[2].checkRdCostsOdd1( scanInfo.spt, pqData[1], decisions[3], decisions[1] );
       m_prevStates[3].checkRdCostsOdd1( scanInfo.spt, pqData[1], decisions[1], decisions[3] );
+      // pqData[0] is not filled by preQuantCoeff() in the near-zero case, so only the pqData[2] start candidate is tested
+      m_startState.checkRdCostStart( lastOffset, pqData[2], decisions[2] );
     }
     else
     {
@@ -1410,6 +1206,9 @@ namespace DQIntern
       m_prevStates[1].checkRdCosts( scanInfo.spt, pqData[0], pqData[2], decisions[2], decisions[0] );
       m_prevStates[2].checkRdCosts( scanInfo.spt, pqData[3], pqData[1], decisions[1], decisions[3] );
       m_prevStates[3].checkRdCosts( scanInfo.spt, pqData[3], pqData[1], decisions[3], decisions[1] );
+      // all four pqData entries were filled on this path: test both start candidates
+      m_startState.checkRdCostStart( lastOffset, pqData[0], decisions[0] );
+      m_startState.checkRdCostStart( lastOffset, pqData[2], decisions[2] );
     }
 
     if( scanInfo.spt==SCAN_EOCSBB )
@@ -1419,9 +1218,6 @@ namespace DQIntern
         m_skipStates[2].checkRdCostSkipSbb( decisions[2] );
         m_skipStates[3].checkRdCostSkipSbb( decisions[3] );
     }
-
-    m_startState.checkRdCostStart( lastOffset, pqData[0], decisions[0] );
-    m_startState.checkRdCostStart( lastOffset, pqData[2], decisions[2] );
   }
 
   void DepQuant::xDecideAndUpdate( const TCoeff absCoeff, const ScanInfo& scanInfo, bool zeroOut, int quantCoeff )
@@ -1516,60 +1312,6 @@ namespace DQIntern
     {
       const TCoeff defaultTh = TCoeff( thres / ( defaultQuantisationCoefficient << 2 ) );
 
-#if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 )
-      // if more than one 4x4 coding subblock is available, use SIMD to find first subblock with coefficient larger than threshold
-      if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > x86_simd::SCALAR )
-      {
-        const int sbbSize = tuPars.m_sbbSize;
-        // move the pointer to the beginning of the current subblock
-        firstTestPos -= ( sbbSize - 1 );
-
-        const __m128i xdfTh = _mm_set1_epi32( defaultTh );
-
-        // for each subblock
-        for( ; firstTestPos >= 0; firstTestPos -= sbbSize )
-        {
-          // skip zeroed out blocks
-          // for 64-point transformation the coding order takes care of that
-          if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) )
-          {
-            continue;
-          }
-
-          // read first line of the subblock and check for coefficients larger than the threshold
-          // assumming the subblocks are dense 4x4 blocks in raster scan order with the stride of tuPars.m_width
-          int pos = tuPars.m_scanId2BlkPos[firstTestPos].idx;
-          __m128i xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) );
-          __m128i xdf = _mm_cmpgt_epi32( xl0, xdfTh );
-
-          // same for the next line in the subblock
-          pos += tuPars.m_width;
-          xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) );
-          xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
-
-          // and the third line
-          pos += tuPars.m_width;
-          xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) );
-          xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
-
-          // and the last line
-          pos += tuPars.m_width;
-          xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) );
-          xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
-
-          // if any of the 16 comparisons were true, break, because this subblock contains a coefficient larger than threshold
-          if( !_mm_testz_si128( xdf, xdf ) ) break;
-        }
-
-        if( firstTestPos >= 0 )
-        {
-          // if a coefficient was found, advance the pointer to the end of the current subblock
-          // for the subsequent coefficient-wise refinement (C-impl after endif)
-          firstTestPos += sbbSize - 1;
-        }
-      }
-
-#endif
       for( ; firstTestPos >= 0; firstTestPos-- )
       {
         if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) ) continue;
@@ -1640,23 +1382,36 @@ namespace DQIntern
 
     tu.lastPos[compID] = scanIdx - 1;
   }
-
 }; // namespace DQIntern
 
+void DepQuantImpl::dequant( const TransformUnit& tu,  CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP, bool enableScalingLists, int* piDequantCoef )
+{ // default implementation shared by all DepQuantImpl subclasses: forwards to the Quantizer member
+  m_quant.dequantBlock( tu, compID, cQP, recCoeff, enableScalingLists, piDequantCoef );
+}
 
-
+void DepQuantImpl::init( int dqTrVal )
+{ // default implementation: Quantizer::init() stores the value as its m_DqThrVal
+  m_quant.init( dqTrVal );
+}
 
 //===== interface class =====
 DepQuant::DepQuant( const Quant* other, bool enc, bool useScalingLists ) : QuantRDOQ2( other, useScalingLists )
 {
+#if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_QUANT
+  initDepQuantX86(); // NOTE(review): presumably installs a SIMD DepQuantImpl into p when the CPU supports it - confirm in the x86 sources
+#endif
+
   const DepQuant* dq = dynamic_cast<const DepQuant*>( other );
   CHECK( other && !dq, "The DepQuant cast must be successfull!" );
-  p = new DQIntern::DepQuant( enc );
+  if( !p ) // p starts as nullptr (in-class initializer); create the generic implementation unless one was installed earlier in this constructor
+  {
+    p = new DQIntern::DepQuant( enc );
+  }
 }
 
 DepQuant::~DepQuant()
 {
-  delete static_cast<DQIntern::DepQuant*>(p);
+  delete p; // DepQuantImpl declares a virtual destructor, so deleting through the base pointer destroys the concrete implementation
 }
 
 void DepQuant::quant( TransformUnit& tu, const ComponentID compID, const CCoeffBuf& pSrc, TCoeff& uiAbsSum, const QpParam& cQP, const Ctx& ctx )
diff --git a/source/Lib/CommonLib/DepQuant.h b/source/Lib/CommonLib/DepQuant.h
index 3e27b8128..ba8de5339 100644
--- a/source/Lib/CommonLib/DepQuant.h
+++ b/source/Lib/CommonLib/DepQuant.h
@@ -53,6 +53,200 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc {
 
+namespace DQIntern
+{
+  /*================================================================================*/
+  /*=====                                                                      =====*/
+  /*=====   R A T E   E S T I M A T O R                                        =====*/
+  /*=====                                                                      =====*/
+  /*================================================================================*/
+
+  struct NbInfoSbb // stored per scan position in ScanInfo::currNbInfoSbb
+  {
+    uint8_t   numInv;      // NOTE(review): presumably the number of valid entries in invInPos - confirm against Rom::xInitScanArrays()
+    uint8_t   invInPos[5];
+  };
+  struct NbInfoOut // table entries are handed out by Rom::getNbInfoOut()
+  {
+    uint16_t  maxDist;
+    uint16_t  num;        // NOTE(review): presumably the number of valid entries in outPos - confirm against Rom::xInitScanArrays()
+    uint16_t  outPos[5];
+  };
+  struct CoeffFracBits // RateEstimator keeps one of these per gtx context (m_gtxFracBits)
+  {
+    int32_t   bits[6];
+  };
+
+
+  enum ScanPosType : int8_t { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; // SCAN_EOCSBB enables the skip-sub-block checks in xDecide(); NOTE(review): names suggest inside / start-of / end-of coding sub-block - confirm
+
+  struct ScanInfo // per-scan-position record; TUParameters owns an array of these (m_scanInfo)
+  {
+    ScanInfo() {} // empty body: members are left uninitialised
+    short         numSbb;
+    short         scanIdx;
+    short         rasterPos;
+    short         sbbPos; // byte
+    short         nextSbbRight;
+    short         nextSbbBelow;
+    int8_t        sbbSize;
+    int8_t        insidePos;        // used as index into the per-sub-block absLevels array of the trellis states
+    int8_t        nextInsidePos;
+    ScanPosType   spt;              // position type; SCAN_EOCSBB enables the skip-sub-block checks in xDecide()
+    int8_t        posX;
+    int8_t        posY;
+    int8_t        sigCtxOffsetNext;
+    int8_t        gtxCtxOffsetNext;
+    NbInfoSbb     currNbInfoSbb;
+  };
+
+  class Rom;
+  struct TUParameters // scan parameters for one (width, height, channel type) combination; looked up through Rom::getTUPars()
+  {
+    TUParameters ( const Rom& rom, const unsigned width, const unsigned height, const ChannelType chType );
+    ~TUParameters()
+    {
+      delete [] m_scanInfo; // the only member released here; the other pointers are not freed by this struct
+    }
+
+    ChannelType       m_chType;
+    unsigned          m_width;
+    unsigned          m_height;
+    unsigned          m_numCoeff;
+    unsigned          m_numSbb;
+    unsigned          m_log2SbbWidth;
+    unsigned          m_log2SbbHeight;
+    unsigned          m_log2SbbSize;
+    unsigned          m_sbbSize;
+    unsigned          m_sbbMask;
+    unsigned          m_widthInSbb;
+    unsigned          m_heightInSbb;
+    const ScanElement *m_scanSbbId2SbbPos;
+    const ScanElement *m_scanId2BlkPos;    // copied into RateEstimator::m_scanId2Pos by RateEstimator::initCtx()
+    const NbInfoSbb*  m_scanId2NbInfoSbb;
+    const NbInfoOut*  m_scanId2NbInfoOut;
+    ScanInfo*         m_scanInfo;          // array owned by this struct (released with delete[] in the destructor)
+  private:
+    void xSetScanInfo( ScanInfo& scanInfo, int scanIdx );
+  };
+
+  class Rom // scan lookup tables; DQIntern::DepQuant holds one instance (m_scansRom)
+  {
+  public:
+    Rom() : m_scansInitialized(false) {} // the table arrays themselves are not touched here
+    ~Rom() { xUninitScanArrays(); }
+    void                init        ()                       { xInitScanArrays(); } // xInitScanArrays() tests m_scansInitialized first
+    const NbInfoSbb*    getNbInfoSbb( int hd, int vd ) const { return m_scanId2NbInfoSbbArray[hd][vd]; }
+    const NbInfoOut*    getNbInfoOut( int hd, int vd ) const { return m_scanId2NbInfoOutArray[hd][vd]; }
+    const TUParameters* getTUPars   ( const CompArea& area, const ComponentID compID ) const
+    {
+      return m_tuParameters[Log2(area.width)][Log2(area.height)][toChannelType(compID)]; // keyed by log2(width), log2(height) and channel type
+    }
+  private:
+    void  xInitScanArrays   ();
+    void  xUninitScanArrays ();
+  private:
+    bool          m_scansInitialized;
+    NbInfoSbb*    m_scanId2NbInfoSbbArray[ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ];
+    NbInfoOut*    m_scanId2NbInfoOutArray[ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ];
+    TUParameters* m_tuParameters         [ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ][ MAX_NUM_CH ];
+  };
+
+  class RateEstimator // per-TU tables of fractional bit costs; private base of DQIntern::DepQuant
+  {
+  public:
+    RateEstimator () {}
+    ~RateEstimator() {}
+    void initCtx  ( const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID, const FracBitsAccess& fracBitsAccess );
+    // read-only accessors for the tables below
+    inline const BinFracBits *sigSbbFracBits() const { return m_sigSbbFracBits; }
+    inline const BinFracBits *sigFlagBits(unsigned stateId) const
+    {
+      return m_sigFracBits[std::max(((int) stateId) - 1, 0)]; // state ids 0 and 1 share set 0, id 2 uses set 1, id 3 uses set 2
+    }
+    inline const CoeffFracBits *gtxFracBits() const { return m_gtxFracBits; } // one table for all states (no state id parameter)
+    inline int32_t              lastOffset(unsigned scanIdx) const
+    {
+      return m_lastBitsX[m_scanId2Pos[scanIdx].x] + m_lastBitsY[m_scanId2Pos[scanIdx].y]; // sum of the x and y parts for the block position of scanIdx
+    }
+
+  private:
+    void  xSetLastCoeffOffset ( const FracBitsAccess& fracBitsAccess, const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID );
+    void  xSetSigSbbFracBits  ( const FracBitsAccess& fracBitsAccess, ChannelType chType );
+    void  xSetSigFlagBits     ( const FracBitsAccess& fracBitsAccess, ChannelType chType );
+    void  xSetGtxFlagBits     ( const FracBitsAccess& fracBitsAccess, ChannelType chType );
+
+  public: // table dimensions, accessible to code outside the class
+    static const unsigned sm_numCtxSetsSig    = 3;
+    static const unsigned sm_numCtxSetsGtx    = 2;
+    static const unsigned sm_maxNumSigSbbCtx  = 2;
+    static const unsigned sm_maxNumSigCtx     = 12;
+    static const unsigned sm_maxNumGtxCtx     = 21;
+
+  private:
+    const ScanElement * m_scanId2Pos;    // set from tuPars.m_scanId2BlkPos in initCtx(); not owned
+    int32_t             m_lastBitsX      [ MAX_TB_SIZEY ];
+    int32_t             m_lastBitsY      [ MAX_TB_SIZEY ];
+    BinFracBits         m_sigSbbFracBits [ sm_maxNumSigSbbCtx ];
+    BinFracBits         m_sigFracBits    [ sm_numCtxSetsSig   ][ sm_maxNumSigCtx ];
+    CoeffFracBits       m_gtxFracBits                          [ sm_maxNumGtxCtx ];
+  };
+
+  /*================================================================================*/
+  /*=====                                                                      =====*/
+  /*=====   P R E - Q U A N T I Z E R                                          =====*/
+  /*=====                                                                      =====*/
+  /*================================================================================*/
+
+  struct PQData // one quantization candidate filled in by Quantizer::preQuantCoeff()
+  {
+    TCoeff  absLevel;  // absolute level of the candidate
+    int64_t deltaDist; // distortion term of the candidate as computed in preQuantCoeff()
+  };
+
+  class Quantizer // pre-quantizer state; held by DepQuantImpl as m_quant
+  {
+  public:
+    Quantizer() {} // empty body: members are left uninitialised until init() / initQuantBlock()
+    void   init            ( int dqThrVal ) { m_DqThrVal = dqThrVal; }
+    void   dequantBlock    ( const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, CoeffBuf& recCoeff, bool enableScalingLists, int* piDequantCoef ) const;
+    void   initQuantBlock  ( const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, const double lambda, int gValue = -1 );
+    bool   preQuantCoeff   ( const TCoeff absCoeff, PQData *pqData, int quanCoeff ) const; // returns true when only pqData[1] and pqData[2] were filled (negative raw index), false when all four entries were filled
+    TCoeff getLastThreshold() const { return m_thresLast; }
+    TCoeff getSSbbThreshold() const { return m_thresSSbb; }
+
+    int64_t getQScale      () const { return m_QScale; }
+    // NOTE(review): no access specifier follows, so the members below are public - presumably so SIMD code can read them; confirm
+    // quantization
+    int               m_DqThrVal;
+    int               m_QShift;
+    int64_t           m_QAdd;
+    int64_t           m_QScale;
+    TCoeff            m_maxQIdx;
+    TCoeff            m_thresLast;
+    TCoeff            m_thresSSbb;
+    // distortion normalization
+    int               m_DistShift;
+    int64_t           m_DistAdd;
+    int64_t           m_DistStepAdd;
+    int64_t           m_DistOrgFact;
+  };
+
+#define RICEMAX 32
+  extern const int32_t g_goRiceBits[4][RICEMAX]; // table is defined in DepQuant.cpp
+}
+
+class DepQuantImpl // implementation interface behind DepQuant::p; DQIntern::DepQuant derives from it
+{
+public:
+  virtual ~DepQuantImpl() {} // virtual so that DepQuant can delete the implementation through this base type
+  virtual void quant   ( TransformUnit& tu, const CCoeffBuf& srcCoeff, const ComponentID compID, const QpParam& cQP, const double lambda, const Ctx& ctx, TCoeff& absSum, bool enableScalingLists, int* quantCoeff ) = 0;
+  virtual void dequant ( const TransformUnit& tu,  CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP,                                                      bool enableScalingLists, int* quantCoeff ); // default forwards to m_quant.dequantBlock()
+  virtual void init    ( int dqTrVal ); // default forwards to m_quant.init()
+
+protected:
+  DQIntern::Quantizer  m_quant; // shared by the default dequant()/init() and by subclasses
+};
 
 class DepQuant : public QuantRDOQ2
 {
@@ -60,13 +254,19 @@ class DepQuant : public QuantRDOQ2
   DepQuant( const Quant* other, bool enc, bool useScalingLists );
   virtual ~DepQuant();
 
-  virtual void quant  ( TransformUnit& tu, const ComponentID compID, const CCoeffBuf& pSrc, TCoeff &uiAbsSum, const QpParam& cQP, const Ctx& ctx );
+  virtual void quant  (       TransformUnit& tu, const ComponentID compID, const CCoeffBuf& pSrc, TCoeff &uiAbsSum, const QpParam& cQP, const Ctx& ctx );
   virtual void dequant( const TransformUnit& tu, CoeffBuf& dstCoeff, const ComponentID compID, const QpParam& cQP );
   
   virtual void init   ( int rdoq = 0, bool useRDOQTS = false, int dqThrVal = 8 );
 
 private:
-  void* p;
+#ifdef TARGET_SIMD_X86
+  void initDepQuantX86();
+  template <X86_VEXT vext>
+  void _initDepQuantX86();
+#endif
+
+  DepQuantImpl* p = nullptr;
 };
 
 } // namespace vvenc
diff --git a/source/Lib/CommonLib/MCTF.cpp b/source/Lib/CommonLib/MCTF.cpp
index c5619c836..004390adf 100644
--- a/source/Lib/CommonLib/MCTF.cpp
+++ b/source/Lib/CommonLib/MCTF.cpp
@@ -937,6 +937,34 @@ void MCTF::filter( const std::deque<Picture*>& picFifo, int filterIdx )
           bilateralFilter( origBuf, srcFrameInfo, fltrBuf, overallStrength );
         }
       }
+      if (m_encCfg->m_forceScc <= 0)
+      {
+        bool forceSCC = false;
+        if (pic->gopEntry->m_isStartOfGop)
+        {
+          forceSCC = true;
+          for (int j = 0; j < QPA_MAX_NOISE_LEVELS; j++)
+          {
+            if (pic->m_picShared->m_minNoiseLevels[j] < 255 && pic->m_picShared->m_minNoiseLevels[j])
+            {
+              forceSCC = false;
+              break;
+            }
+          }
+          if (forceSCC)
+          {
+            for (int s = 0; s < mvErr.size(); s++)
+            {
+              if (int(mvErr[s]) == 0)
+              {
+                forceSCC = false;
+                break;
+              }
+            }
+          }
+        }
+        pic->m_picShared->m_forceSCC = forceSCC;
+      }
 
       if( !m_encCfg->m_blockImportanceMapping || !pic->useMCTF )
       {
diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp
index 25bdbc319..730863722 100644
--- a/source/Lib/CommonLib/Picture.cpp
+++ b/source/Lib/CommonLib/Picture.cpp
@@ -173,6 +173,7 @@ Picture::Picture()
     , ctsValid          ( false )
     , isPreAnalysis     ( false )
     , m_picShared       ( nullptr )
+    , gopAdaptedQP      ( 0 )
     , isMeanQPLimited   ( false )
     , picInitialQP      ( -1 )
     , picInitialLambda  ( -1.0 )
@@ -232,7 +233,7 @@ void Picture::reset()
   refCounter          = 0;
   poc                 = -1;
   TLayer              = std::numeric_limits<uint32_t>::max();
-
+  gopAdaptedQP        = 0;
   actualHeadBits      = 0;
   actualTotalBits     = 0;
 
diff --git a/source/Lib/CommonLib/Picture.h b/source/Lib/CommonLib/Picture.h
index 213082380..a6c052bdb 100644
--- a/source/Lib/CommonLib/Picture.h
+++ b/source/Lib/CommonLib/Picture.h
@@ -238,6 +238,7 @@ struct Picture : public UnitArea
 
   std::vector<double>           ctuQpaLambda;
   std::vector<int>              ctuAdaptedQP;
+  int                           gopAdaptedQP; // QP offset of GOP (delta relative to base QP)
   bool                          isMeanQPLimited;
   std::mutex                    wppMutex;
   int                           picInitialQP;
diff --git a/source/Lib/CommonLib/StatCounter.cpp b/source/Lib/CommonLib/StatCounter.cpp
index 1491a453a..3ef9cfbf1 100644
--- a/source/Lib/CommonLib/StatCounter.cpp
+++ b/source/Lib/CommonLib/StatCounter.cpp
@@ -238,7 +238,7 @@ std::ostream& StatCounters::report2D( std::ostream& os, const StatCounter2DSet<T
   {
     OUTSTR( 0, " " );
     OUTSTR( numSymbolsInMantissa, (int)(cntAccumDimVer[i] / (double)scalingFactor) );
-    OUTSTR( numSymbolsInExp, "." );
+    OUTSTR( numSymbolsInExp, " " );
   }
 
   OUTSTR( 0, "\r\n" );
diff --git a/source/Lib/CommonLib/TimeProfiler.cpp b/source/Lib/CommonLib/TimeProfiler.cpp
index 81460c9ea..1c7646917 100644
--- a/source/Lib/CommonLib/TimeProfiler.cpp
+++ b/source/Lib/CommonLib/TimeProfiler.cpp
@@ -79,13 +79,13 @@ TProfiler* timeProfilerCreate( const vvenc_config& encCfg )
 #if ENABLE_TIME_PROFILING_PIC_TYPES
   tp = new TProfiler( 3, 1, 1, profilerId );
 #elif ENABLE_TIME_PROFILING_TL
-  tp = new TProfiler( encCfg.m_log2GopSize + 2, 1, 1, profilerId );
+  tp = new TProfiler( floorLog2( encCfg.m_GOPSize ) + 2, 1, 1, profilerId );
 #elif ENABLE_TIME_PROFILING_CTUS_IN_PIC
   int   widthInCTU  = ( encCfg.m_PadSourceWidth % encCfg.m_CTUSize )  ? encCfg.m_PadSourceWidth/encCfg.m_CTUSize  + 1 : encCfg.m_PadSourceWidth/encCfg.m_CTUSize;
   int   heightInCTU = ( encCfg.m_PadSourceHeight % encCfg.m_CTUSize ) ? encCfg.m_PadSourceHeight/encCfg.m_CTUSize + 1 : encCfg.m_PadSourceHeight/encCfg.m_CTUSize;
-  tp = new TProfiler( widthInCTU, heightInCTU, 2, 1, profilerId );
+  tp = new TProfiler( widthInCTU, heightInCTU, 2, profilerId );
 #elif ENABLE_TIME_PROFILING_CU_SHAPES
-  tp = new TProfiler( Log2(encCfg.m_CTUSize) + 1, Log2(encCfg.m_CTUSize) + 1, 2, 1, profilerId );
+  tp = new TProfiler( Log2(encCfg.m_CTUSize) + 1, Log2(encCfg.m_CTUSize) + 1, 2, profilerId );
 #endif
   profilerId++;
 #endif
@@ -98,8 +98,6 @@ void timeProfilerResults( TProfiler* tp )
   if( tp )
   {
     std::cout << *tp;
-    delete tp;
-    tp = nullptr;
   }
 #else
   if( tp )
@@ -128,7 +126,7 @@ void timeProfilerResults( TProfiler* tp )
 #elif ENABLE_TIME_PROFILING_CTUS_IN_PIC
     for( int i = 0; i < tp->getCountersSet().size(); i++ )
     {
-      std::cout << "Run-time of selected encoder stages across CTUs of all pictures " << "(" << ( i == 0 ? "Intra": "Inter" << ")" ) << std::endl;
+      std::cout << "Run-time of selected encoder stages across CTUs of all pictures " << "(" << ( i == 0 ? "Intra": "Inter" ) << ")" << std::endl;
       StatCounters::report2D( std::cout, tp->getCountersSet()[i], false, true, false, true, true, -1 );
       if( i > 0 )
         tp->getCountersSet()[0] += tp->getCountersSet()[i];
@@ -151,8 +149,6 @@ void timeProfilerResults( TProfiler* tp )
       StatCounters::report2D( std::cout, tp->getCountersSet()[0],  true, true, false, true, true, -1 );
     }
 #endif
-    delete tp;
-    tp = nullptr;
   }
 #endif
 }
diff --git a/source/Lib/CommonLib/TimeProfiler.h b/source/Lib/CommonLib/TimeProfiler.h
index 25ec59168..c0a50e39d 100644
--- a/source/Lib/CommonLib/TimeProfiler.h
+++ b/source/Lib/CommonLib/TimeProfiler.h
@@ -132,12 +132,6 @@ class TimeProfiler
   time_point previous = clock::now();
   STAGE    m_eStage;
   const unsigned m_numStages = sizeof( stageNames ) / sizeof( stageNames[0] )/*P_STAGES + 1*/;
-  int      m_iLevel;
-  int      m_iExtData;
-  unsigned m_numBlkHor;
-  unsigned m_numBlkVer;
-  unsigned m_curWId;
-  unsigned m_curHId;
 
 public:
   const time_point start_time = previous;
@@ -293,8 +287,6 @@ class TimeProfiler2D
     m_curX     = x;
     m_curY     = y;
     m_curZ     = z;
-    //if( s == P_ALF )
-    //  printf( "prof=%d\n", m_id );
   }
   TimeProfiler2D& operator+=( const TimeProfiler2D& other ) 
   {
@@ -392,7 +384,7 @@ class StageTimeProfiler2D
 #define PROFILER_EXT_ACCUM_AND_START_NEW_SET_(cond,p,s,t,l,x,y,w,h)   PROF_EXT_ACCUM_AND_START_NEW_SET_COND(cond,p,s,w,h,t)
 #endif
 
-#define PROFILER_EXT_UPDATE(p,s,t)                              PROF_EXT_UPDATE(p,s,t)
+#define PROFILER_SCOPE_TOP_LEVEL_EXT2D(cond,p,s,cs)             PROFILER_SCOPE_AND_STAGE_EXT2D_(cond,p,s,!(cs)->slice->isIntra(), (cs)->slice->TLayer, 0, 0, 0, 0)
 #define PROFILER_SCOPE_AND_STAGE_EXT2D(cond,p,s,cs,ch)          PROFILER_SCOPE_AND_STAGE_EXT2D_(cond,p,s,!(cs)->slice->isIntra(), (cs)->slice->TLayer, BX_(cs,ch), BY_(cs,ch), BW_(cs,ch), BH_(cs,ch) )
 #define PROFILER_EXT_ACCUM_AND_START_NEW_SET(cond,p,s,cs,ch )   PROFILER_EXT_ACCUM_AND_START_NEW_SET_(cond,p,s,!(cs)->slice->isIntra(), (cs)->slice->TLayer, BX_(cs,ch), BY_(cs,ch), BW_(cs,ch), BH_(cs,ch) )
 #endif
@@ -401,12 +393,15 @@ class StageTimeProfiler2D
 #define PROFILER_ACCUM_AND_START_NEW_SET(cond,p,s)              (*(p))(s)
 #define PROFILER_EXT_ACCUM_AND_START_NEW_SET(cond,p,s,cs,ch)    (*(p))(s)
 #define PROFILER_SCOPE_AND_STAGE(cond,p,s)                      PROFILER_SCOPE_AND_STAGE_(cond,p,s)
+#define PROFILER_SCOPE_TOP_LEVEL_EXT(cond,p,s,cs)               PROFILER_SCOPE_AND_STAGE_(cond,p,s)
 #define PROFILER_SCOPE_AND_STAGE_EXT(cond,p,s,cs,ch)            PROFILER_SCOPE_AND_STAGE_(cond,p,s)
 #define PROFILER_EXT_UPDATE(p,s,t)
 typedef TimeProfiler TProfiler;
 #else  //ENABLE_TIME_PROFILING_EXTENDED
+#define PROFILER_EXT_UPDATE(p,s,t)                              PROF_EXT_UPDATE(p,s,t)
 #define PROFILER_ACCUM_AND_START_NEW_SET(cond,p,s)              PROF_EXT_ACCUM_AND_START_NEW_SET_COND(cond,p,s,0,0,0)
 #define PROFILER_SCOPE_AND_STAGE(cond,p,s)
+#define PROFILER_SCOPE_TOP_LEVEL_EXT(cond,p,s,cs)               PROFILER_SCOPE_TOP_LEVEL_EXT2D(cond,p,s,cs)
 #define PROFILER_SCOPE_AND_STAGE_EXT(cond,p,s,cs,ch)            PROFILER_SCOPE_AND_STAGE_EXT2D(cond,p,s,cs,ch)
 typedef TimeProfiler2D TProfiler;
 #endif
@@ -427,6 +422,7 @@ void       timeProfilerResults( TProfiler* tp );
 #define PROFILER_EXT_ACCUM_AND_START_NEW_SET(cond,p,s,cs,ch)
 #define PROFILER_SCOPE_AND_STAGE(cond,p,s)
 #define PROFILER_SCOPE_AND_STAGE_EXT(cond,p,s,cs,ch)
+#define PROFILER_SCOPE_TOP_LEVEL_EXT(cond,p,s,cs)
 #define PROFILER_EXT_UPDATE(p,s,t)
 #endif
 
diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h
index e2b19969e..5ee3fd5f9 100644
--- a/source/Lib/CommonLib/TypeDef.h
+++ b/source/Lib/CommonLib/TypeDef.h
@@ -147,7 +147,7 @@ namespace vvenc {
 
 
 #if defined( TARGET_SIMD_X86 ) && !defined( REAL_TARGET_X86 )
-#  define SIMD_EVERYWHERE_EXTENSION_LEVEL                 SSE42
+#  define SIMD_EVERYWHERE_EXTENSION_LEVEL                 AVX2
 #endif
 
 // End of SIMD optimizations
diff --git a/source/Lib/CommonLib/x86/DepQuantX86.h b/source/Lib/CommonLib/x86/DepQuantX86.h
new file mode 100644
index 000000000..2a8a494db
--- /dev/null
+++ b/source/Lib/CommonLib/x86/DepQuantX86.h
@@ -0,0 +1,1630 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or 
+other Intellectual Property Rights other than the copyrights concerning 
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+     * Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+
+     * Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+
+     * Neither the name of the copyright holder nor the names of its
+     contributors may be used to endorse or promote products derived from this
+     software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------------------------------------------------------------------------------- */
+
+#include "DepQuant.h"
+#include "TrQuant.h"
+#include "CodingStructure.h"
+#include "UnitTools.h"
+#ifdef TARGET_SIMD_X86
+#  include "x86/CommonDefX86.h"
+#  include <simde/x86/sse4.1.h>
+#if defined( USE_SSE41 ) || !defined( REAL_TARGET_X86 )
+#  include <simde/x86/sse4.2.h>
+#endif
+#endif
+
+#include <bitset>
+
+//! \ingroup CommonLib
+//! \{
+
+namespace vvenc {
+
+#if USE_SSE41 && defined( REAL_TARGET_X86 ) // SSE4.1 build on real x86: use the SIMDe implementation (presumably because the native 64-bit compare is an SSE4.2 instruction)
+#define _my_cmpgt_epi64( a, b ) simde_mm_cmpgt_epi64( a, b )
+#else // all other configurations use the _mm_cmpgt_epi64 name directly
+#define _my_cmpgt_epi64( a, b ) _mm_cmpgt_epi64( a, b )
+#endif
+
+
+namespace DQIntern
+{
+  /*================================================================================*/
+  /*=====                                                                      =====*/
+  /*=====   T C Q   S T A T E                                                  =====*/
+  /*=====                                                                      =====*/
+  /*================================================================================*/
+
+  static constexpr int64_t rdCostInit = std::numeric_limits<int64_t>::max() >> 1; // start value State::init writes into rdCost: half of the int64_t maximum (presumably headroom so that adding cost terms cannot overflow)
+
+  struct Decisions // decision for one scan position, stored as parallel arrays with one entry per state (4 states)
+  {
+    int64_t   rdCost[4];   // RD cost per state; copied into StateMem::rdCost by updateStates / updateStatesEOS
+    TCoeffSig absLevel[4]; // chosen absolute level per state
+    int8_t    prevId[4];   // predecessor: 0..3 = previous state, >= 4 = skip state (prevId - 4), -1 = no predecessor, <= -2 = entry ignored (level forced to 0)
+  };
+
+  template<X86_VEXT vext>
+  class State;
+
+  struct StateMem // data of all four states, interleaved so that one SIMD register covers the four states at once
+  {
+    uint8_t tpl[64]; // per position and state, index ( pos << 2 ) + stateId: ( number of non-zero neighbours << 5 ) | capped abs sum
+    uint8_t sum[64]; // same indexing: abs sum of neighbouring levels, saturated at 255
+    uint8_t val[64]; // same indexing: capped absolute level of the position itself
+
+    struct
+    {
+      uint8_t sig[4]; // per state: index into the state's sigFracBits table (see checkRdCosts)
+      uint8_t cff[4]; // per state: index into the gtxFracBits table (see checkRdCosts)
+    } ctx;
+
+    int64_t  rdCost[4]; // accumulated RD cost per state
+
+    int32_t  sbbBits0[4]; // per state: intBits[0] of the selected sub-block flag BinFracBits (set in CommonCtx::update)
+    int32_t  sbbBits1[4]; // per state: intBits[1] of the selected sub-block flag BinFracBits (set in CommonCtx::update)
+
+    uint8_t  numSig[4];      // per state: count of non-zero levels, cleared at the end of updateStatesEOS
+    int8_t   refSbbCtxId[4]; // per state: index of the SbbCtx entry to inherit from, -1 if none
+
+    int32_t  cffBits1[RateEstimator::sm_maxNumGtxCtx + 3];
+    int      remRegBins[4]; // per state: remaining bin budget; compared against 4 throughout
+
+    int      cffBitsCtxOffset; // set to scanInfo.gtxCtxOffsetNext after every update
+    bool     anyRemRegBinsLt4; // true if at least one state has remRegBins < 4
+    unsigned effWidth;
+    unsigned effHeight;
+    int      initRemRegBins; // remRegBins start value for a state without predecessor
+  };
+
+  struct SbbCtx // non-owning view into one chunk of CommonCtx::m_memory (pointers are set in CommonCtx::reset)
+  {
+    uint8_t*  sbbFlags; // start of the chunk: one flag byte per sub-block
+    uint8_t*  levels;   // directly behind the flags: one level byte per coefficient
+  };
+
+  template<X86_VEXT vext>
+  class CommonCtx // sub-block flags and coefficient levels per state, kept for the current and the previous sub-block
+  {
+  public:
+    CommonCtx() : m_currSbbCtx( m_allSbbCtx ), m_prevSbbCtx( m_currSbbCtx + 4 ) {} // entries 0..3 start as "current", entries 4..7 as "previous"
+
+    inline void swap() { std::swap(m_currSbbCtx, m_prevSbbCtx); } // exchange the roles of the two 4-entry halves
+
+    inline void reset( const TUParameters& tuPars, const RateEstimator &rateEst) // bind to a TU: cache tables and split m_memory into 8 chunks
+    {
+      m_nbInfo = tuPars.m_scanId2NbInfoOut;
+      ::memcpy( m_sbbFlagBits, rateEst.sigSbbFracBits(), 2*sizeof(BinFracBits) );
+      const int numSbb    = tuPars.m_numSbb;
+      const int chunkSize = numSbb + tuPars.m_numCoeff; // one chunk = numSbb flag bytes followed by numCoeff level bytes
+      uint8_t*  nextMem   = m_memory;
+      for( int k = 0; k < 8; k++, nextMem += chunkSize )
+      {
+        m_allSbbCtx[k].sbbFlags = nextMem;
+        m_allSbbCtx[k].levels   = nextMem + numSbb;
+      }
+    }
+
+    inline void update( const ScanInfo &scanInfo, const int prevId, int stateId, StateMem &curr ) // refresh one state's SbbCtx and seed curr.tpl / curr.sum for the next sub-block
+    {
+      uint8_t*    sbbFlags  = m_currSbbCtx[stateId].sbbFlags;
+      uint8_t*    levels    = m_currSbbCtx[stateId].levels;
+      uint16_t    maxDist   = m_nbInfo[ scanInfo.scanIdx - 1 ].maxDist;
+      uint16_t    sbbSize   = scanInfo.sbbSize;
+      std::size_t setCpSize = ( maxDist > sbbSize ? maxDist - sbbSize : 0 ) * sizeof(uint8_t); // bytes behind the current sub-block to carry over; 0 if maxDist stays inside it
+      if( prevId >= 0 ) // inherit flags and the needed levels from the predecessor's context
+      {
+        ::memcpy( sbbFlags, m_prevSbbCtx[prevId].sbbFlags, scanInfo.numSbb * sizeof( uint8_t ) );
+        ::memcpy( levels + scanInfo.scanIdx + sbbSize, m_prevSbbCtx[prevId].levels + scanInfo.scanIdx + sbbSize, setCpSize );
+      }
+      else // no predecessor: start from all-zero flags and levels
+      {
+        ::memset( sbbFlags, 0, scanInfo.numSbb * sizeof( uint8_t ) );
+        ::memset( levels + scanInfo.scanIdx + sbbSize, 0, setCpSize );
+      }
+      sbbFlags[scanInfo.sbbPos] = !!curr.numSig[stateId]; // flag the sub-block just finished if the state has any non-zero level
+
+      const int       sigNSbb   = ( ( scanInfo.nextSbbRight ? sbbFlags[scanInfo.nextSbbRight] : false ) || ( scanInfo.nextSbbBelow ? sbbFlags[scanInfo.nextSbbBelow] : false ) ? 1 : 0 ); // 1 if the right or below neighbour sub-block is flagged; index 0 means "no neighbour"
+      curr.refSbbCtxId[stateId] = stateId;
+      const BinFracBits sbbBits = m_sbbFlagBits[sigNSbb];
+
+      curr.sbbBits0[stateId] = sbbBits.intBits[0];
+      curr.sbbBits1[stateId] = sbbBits.intBits[1];
+
+      if( sigNSbb || ( ( scanInfo.nextSbbRight && scanInfo.nextSbbBelow ) ? sbbFlags[scanInfo.nextSbbBelow  + 1] : false ) ) // also checks the flag at nextSbbBelow + 1 (presumably the below-right sub-block)
+      {
+        const int         scanBeg   = scanInfo.scanIdx - scanInfo.sbbSize;
+        const NbInfoOut*  nbOut     = m_nbInfo + scanBeg;
+        const uint8_t*    absLevels = levels   + scanBeg;
+
+        for( int id = 0; id < scanInfo.sbbSize; id++, nbOut++ ) // one iteration per position of the next sub-block
+        {
+          const int idAddr = ( id << 2 ) + stateId; // interleaved StateMem index: position * 4 + state
+
+          if( nbOut->num )
+          {
+            TCoeff sumAbs = 0, sumAbs1 = 0, sumNum = 0;
+#define UPDATE(k) {TCoeff t=absLevels[nbOut->outPos[k]]; sumAbs+=t; sumAbs1+=std::min<TCoeff>(4+(t&1),t); sumNum+=!!t; }
+            switch( nbOut->num ) // intentional fall-through: processes entries num-1 .. 0
+            {
+            default:
+            case 5:
+              UPDATE(4);
+            case 4:
+              UPDATE(3);
+            case 3:
+              UPDATE(2);
+            case 2:
+              UPDATE(1);
+            case 1:
+              UPDATE(0);
+            }
+#undef UPDATE
+            curr.tpl[idAddr] = ( sumNum << 5 ) | sumAbs1; // count in the upper bits, capped sum in the lower 5 bits
+            curr.sum[idAddr] = ( uint8_t ) std::min( 255, sumAbs );
+          }
+        }
+      }
+    }
+
+    inline void updateAllLvls( const ScanInfo &scanInfo, const StateMem &curr ) // de-interleave curr.val into the four states' levels buffers at scanIdx
+    {
+      uint8_t *levels0 = m_currSbbCtx[0].levels + scanInfo.scanIdx;
+      uint8_t *levels1 = m_currSbbCtx[1].levels + scanInfo.scanIdx;
+      uint8_t *levels2 = m_currSbbCtx[2].levels + scanInfo.scanIdx;
+      uint8_t *levels3 = m_currSbbCtx[3].levels + scanInfo.scanIdx;
+
+      const int regSize = 16;
+      const int ctxSize = scanInfo.sbbSize << 2; // bytes of curr.val in use: 4 states per position
+
+      const __m128i vshuf0 = _mm_setr_epi8(  0,  4,  8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 ); // picks the state-0 byte of four consecutive positions
+      const __m128i vshuf1 = _mm_setr_epi8(  1,  5,  9, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
+      const __m128i vshuf2 = _mm_setr_epi8(  2,  6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
+      const __m128i vshuf3 = _mm_setr_epi8(  3,  7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
+
+      for( int i = 0, j = 0; i < ctxSize; i += regSize, j += 4 ) // 16 interleaved bytes in, 4 bytes out per state
+      {
+        __m128i in  = _mm_loadu_si128( ( const __m128i* ) &curr.val[i] );
+
+        _mm_storeu_si32( &levels0[j], _mm_shuffle_epi8( in, vshuf0 ) );
+        _mm_storeu_si32( &levels1[j], _mm_shuffle_epi8( in, vshuf1 ) );
+        _mm_storeu_si32( &levels2[j], _mm_shuffle_epi8( in, vshuf2 ) );
+        _mm_storeu_si32( &levels3[j], _mm_shuffle_epi8( in, vshuf3 ) );
+      }
+    }
+
+  private:
+    const NbInfoOut*            m_nbInfo;         // taken from tuPars.m_scanId2NbInfoOut in reset()
+    BinFracBits                 m_sbbFlagBits[2]; // copy of rateEst.sigSbbFracBits(), indexed by sigNSbb
+    SbbCtx                      m_allSbbCtx  [8]; // 4 current + 4 previous contexts
+    SbbCtx*                     m_currSbbCtx;
+    SbbCtx*                     m_prevSbbCtx;
+    uint8_t                     m_memory[ 8 * ( MAX_TB_SIZEY * MAX_TB_SIZEY + MLS_GRP_NUM ) ]; // backing storage for the 8 chunks
+  };
+
+  template<X86_VEXT vext>
+  class State
+  {
+    friend class CommonCtx<vext>;
+  public:
+    State( const RateEstimator& rateEst, CommonCtx<vext>& commonCtx, const int stateId ) // binds one state to its rate tables and the shared context
+      : m_stateId         ( stateId )                       // lane index used for all per-state arrays in StateMem
+      , m_sigFracBitsArray( rateEst.sigFlagBits(stateId) )  // table selected by stateId
+      , m_gtxFracBitsArray( rateEst.gtxFracBits() )         // table independent of stateId
+      , m_commonCtx       ( commonCtx )
+    {
+    }
+
+    static inline void updateStates( const ScanInfo &scanInfo, const Decisions &decisions, StateMem &prev, StateMem &curr ) // build curr for all four states from prev and the decisions of one position (inside a sub-block)
+    { // steps: capped levels -> inherit predecessor data -> bin budget -> propagate to dependent positions -> contexts of the next position
+      int8_t s[4] = { 0 }, t[4] = { 0 }, l[4] = { 0 }; // s: predecessor ids, t: contribution added to tpl, l: capped levels
+
+#if 1
+      __m128i v254_4 = _mm_setr_epi16( 254, 254, 254, 254,  4,  4,  4,  4 );
+      __m128i v01    = _mm_setr_epi16(   1,   1,   1,   1,  1,  1,  1,  1 );
+      __m128i v032   = _mm_setr_epi8 (   0,   0,   0,   0, 32, 32, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0 );
+      __m128i vn1    = _mm_set1_epi8 (  -1 );
+
+      static_assert( sizeof( curr.rdCost ) == sizeof( decisions.rdCost ), "Non-matching array size" );
+      memcpy( curr.rdCost, decisions.rdCost, sizeof( decisions.rdCost ) );
+
+      // in signalling, the coeffs are always max 16 bit!
+      __m128i v = _mm_loadu_si64( decisions.absLevel );
+      v = _mm_unpacklo_epi64( v, v ); // same four levels in both halves: low half for l, high half for t
+      __m128i p = _mm_loadu_si32( decisions.prevId );
+      _mm_storeu_si32( s, p ); // store previous state indexes
+      p = _mm_shuffle_epi32( p, 0 ); // broadcast the four prevId bytes to every 32-bit lane
+      __m128i n2  = _mm_cmplt_epi8( p, vn1 ); // mask: prevId < -1
+      __m128i a_1 = _mm_and_si128( v, v01 ); // parity of each level
+      __m128i a_m = _mm_min_epi16( v, _mm_add_epi16( v254_4, a_1 ) ); // low half: min( lvl, 254 + parity ), high half: min( lvl, 4 + parity )
+      a_m = _mm_packs_epi16( a_m, vn1 ); // 16 -> 8 bit with signed saturation
+      a_m = _mm_or_si128   ( a_m, _mm_sign_epi8( v032, a_m ) ); // bytes 4..7: set bit 5 (value 32) where the capped level is non-zero
+      a_m = _mm_andnot_si128( n2, a_m ); // zero everything for states with prevId < -1
+      _mm_storeu_si32( l, a_m ); // store abs value
+      a_m = _mm_shuffle_epi32( a_m, 1 );
+      _mm_storeu_si32( t, a_m ); // store capped abs value (with the count bit)
+#else // scalar reference, never compiled
+      for( int i = 0; i < 4; ++i )
+      {
+        s[ i ]               = decisions[ i ].prevId;
+        int min4_or_5        = std::min<TCoeff>( 4 + ( decisions[ i ].absLevel & 1 ), decisions[ i ].absLevel );
+        t[ i ]               = decisions[ i ].prevId > -2 ? min4_or_5 : 0;
+        t[ i ]              |= t[i] ? 32 : 0;
+        l[ i ]               = decisions[ i ].prevId > -2 ? std::min( decisions[i].absLevel, 255 ) : 0;
+        //all_above_minus_two &= decision[ i ].prevId > -2;
+      }
+#endif
+
+      {
+        const int ctxSize = 16 * 4;
+        const int regSize = 16;
+
+        __m128i vshuf     = _mm_loadu_si32 ( s );
+                vshuf     = _mm_shuffle_epi32( vshuf, 0 );
+        __m128i vshufmask = _mm_cmplt_epi8 ( vshuf, _mm_setzero_si128() ); // states without predecessor
+        vshuf             = _mm_add_epi8   ( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) ); // byte index of the predecessor inside each group of 4 states
+        vshuf             = _mm_blendv_epi8( vshuf, _mm_set1_epi8( -1 ), vshufmask ); // -1 makes _mm_shuffle_epi8 produce 0
+
+        for( int i = 0; i < ctxSize; i += regSize ) // each state inherits tpl / val / sum of its predecessor
+        {
+          __m128i vtpl = _mm_loadu_si128( ( const __m128i* ) &prev.tpl[i] );
+          vtpl = _mm_shuffle_epi8( vtpl, vshuf );
+          _mm_storeu_si128( ( __m128i* ) &curr.tpl[i], vtpl );
+
+          __m128i vval = _mm_loadu_si128( ( const __m128i* ) &prev.val[i] );
+          vval = _mm_shuffle_epi8( vval, vshuf );
+          _mm_storeu_si128( ( __m128i* ) &curr.val[i], vval );
+
+          __m128i vsum = _mm_loadu_si128( ( const __m128i* ) &prev.sum[i] );
+          vsum = _mm_shuffle_epi8( vsum, vshuf );
+          _mm_storeu_si128( ( __m128i* ) &curr.sum[i], vsum );
+        }
+
+        __m128i numSig = _mm_loadu_si32( prev.numSig );
+        numSig = _mm_shuffle_epi8( numSig, vshuf );
+        __m128i lvls   = _mm_loadu_si32( l );
+        lvls   = _mm_cmpgt_epi8( lvls, _mm_setzero_si128() ); // -1 where the level is positive
+        numSig = _mm_subs_epi8( numSig, lvls ); // subtracting -1 increments the count (saturating)
+        _mm_storeu_si32( curr.numSig, numSig );
+
+        __m128i rsc = _mm_loadu_si32( prev.refSbbCtxId );
+        rsc         = _mm_shuffle_epi8( rsc, vshuf );
+        rsc         = _mm_blendv_epi8( rsc, vshuf, vshuf ); // no predecessor: take vshuf itself, i.e. -1
+        _mm_storeu_si32( curr.refSbbCtxId, rsc );
+
+        vshuf = _mm_cvtepi8_epi32( vshuf ); // from here on vshuf is rebuilt as a 32-bit lane selector for remRegBins
+        vshuf = _mm_shuffle_epi8( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) );
+        vshuf = _mm_slli_epi32( vshuf, 2 );
+        vshuf = _mm_add_epi8( vshuf,
+                              _mm_blendv_epi8( _mm_setr_epi8( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ),
+                                               _mm_setzero_si128(),
+                                               vshuf ) );
+
+        __m128i rrb = _mm_loadu_si128( ( const __m128i* ) prev.remRegBins );
+        rrb = _mm_shuffle_epi8( rrb, vshuf );
+        rrb = _mm_sub_epi32( rrb, _mm_blendv_epi8( _mm_set1_epi32( 1 ), _mm_setzero_si128(), vshuf ) ); // minus 1 for states that have a predecessor
+        __m128i mlvl = _mm_loadu_si32( l );
+        rrb = _mm_blendv_epi8( rrb, _mm_set1_epi32( curr.initRemRegBins ), vshuf ); // no predecessor: restart from initRemRegBins
+        // additional cost: 1 for level 1, 3 for larger levels, charged only while remRegBins >= 4
+        __m128i mbins = _mm_cvtepi8_epi32( mlvl );
+        __m128i madd  = _mm_cmpeq_epi32( mbins, _mm_set1_epi32( 1 ) );
+        __m128i mmore = _mm_and_si128( _mm_cmpgt_epi32( mbins, _mm_set1_epi32( 1 ) ), _mm_set1_epi32( 3 ) );
+        madd = _mm_sub_epi32( madd, mmore );
+        madd = _mm_blendv_epi8( madd, _mm_setzero_si128(), _mm_cmplt_epi32(rrb, _mm_set1_epi32(4)));
+        rrb  = _mm_add_epi32( rrb, madd );
+        _mm_storeu_si128( ( __m128i* ) curr.remRegBins, rrb );
+        rrb = _mm_cmplt_epi32( rrb, _mm_set1_epi32( 4 ) );
+
+        curr.anyRemRegBinsLt4 = !_mm_test_all_zeros( rrb, rrb );
+
+        __m128i lvl1 = _mm_loadu_si32( l );
+
+        if( scanInfo.currNbInfoSbb.numInv )
+        {
+          //auto adds8 = []( uint8_t a, uint8_t b )
+          //{
+          //  uint8_t c = a + b;
+          //  if( c < a ) c = -1;
+          //  return c;
+          //};
+          //
+          //auto update_deps_scalar = [&]( int k )
+          //{
+          //  for( int i = 0; i < 4; i++ )
+          //  {
+          //    int addr = ( scanInfo.currNbInfoSbb.invInPos[k] << 2 ) + i;
+          //    curr.sum[addr] = adds8( curr.sum[addr], decisions[i].absLevel );
+          //  }
+          //};
+
+          auto update_deps_vec = [&]( int k ) // saturating-add the four levels to sum[] of dependent position k
+          {
+            int addr = scanInfo.currNbInfoSbb.invInPos[k] << 2;
+            __m128i msum = _mm_loadu_si32( &curr.sum[addr] );
+            msum = _mm_adds_epu8( msum, mlvl );
+            _mm_storeu_si32( &curr.sum[addr], msum );
+          };
+
+          switch( scanInfo.currNbInfoSbb.numInv ) // intentional fall-through: entries numInv-1 .. 0
+          {
+          default:
+          case 5:
+            update_deps_vec( 4 );
+          case 4:
+            update_deps_vec( 3 );
+          case 3:
+            update_deps_vec( 2 );
+          case 2:
+            update_deps_vec( 1 );
+          case 1:
+            update_deps_vec( 0 );
+          }
+        }
+
+        int addr = ( scanInfo.insidePos << 2 );
+        _mm_storeu_si32( &curr.val[addr], lvl1 ); // record the capped levels of this position
+      }
+      if( scanInfo.currNbInfoSbb.numInv ) // same guard as for the sum[] update above: without it numInv == 0 would hit default: and run all five updates
+      {
+        __m128i tpl1 = _mm_loadu_si32( t );
+
+        auto update_deps = [&]( int k ) // add the packed contribution (count bit | capped level) to tpl[] of dependent position k
+        {
+          int addr = scanInfo.currNbInfoSbb.invInPos[k] << 2;
+          __m128i tpl = _mm_loadu_si32( &curr.tpl[addr] );
+          tpl = _mm_add_epi8( tpl, tpl1 );
+          _mm_storeu_si32( &curr.tpl[addr], tpl );
+        };
+
+        switch( scanInfo.currNbInfoSbb.numInv ) // intentional fall-through: entries numInv-1 .. 0
+        {
+        default:
+        case 5:
+          update_deps( 4 );
+        case 4:
+          update_deps( 3 );
+        case 3:
+          update_deps( 2 );
+        case 2:
+          update_deps( 1 );
+        case 1:
+          update_deps( 0 );
+        }
+      }
+
+      { // derive the context indices of the next position from its packed template value
+        __m128i ones    = _mm_set1_epi32( 1 );
+        __m128i tplAcc  = _mm_loadu_si128( ( __m128i * ) &curr.tpl[ ( scanInfo.nextInsidePos << 2 ) ] );
+        tplAcc          = _mm_cvtepu8_epi32( tplAcc ); // only the low 4 bytes (one per state) are used
+
+        __m128i sumAbs1 = _mm_and_si128 ( tplAcc, _mm_set1_epi32( 31 ) ); // low 5 bits: capped abs sum
+        __m128i sumNum  = _mm_srli_epi32( tplAcc, 5 ); // upper bits: number of non-zero neighbours
+        __m128i sumGt1  = _mm_sub_epi32 ( sumAbs1, sumNum );
+        sumGt1  = _mm_min_epi32( sumGt1, _mm_set1_epi32( 4 ) );
+        sumGt1  = _mm_add_epi32( _mm_set1_epi32( scanInfo.gtxCtxOffsetNext ), sumGt1 );
+
+        sumAbs1 = _mm_add_epi32( sumAbs1, ones );
+        sumAbs1 = _mm_srai_epi32( sumAbs1, 1 );
+        sumAbs1 = _mm_min_epi32( sumAbs1, _mm_set1_epi32( 3 ) );
+
+        sumAbs1 = _mm_add_epi32( _mm_set1_epi32( scanInfo.sigCtxOffsetNext ), sumAbs1 );
+        sumAbs1 = _mm_packs_epi32( sumAbs1, sumAbs1 );
+        sumAbs1 = _mm_packs_epi16( sumAbs1, sumAbs1 );
+        _mm_storeu_si32( curr.ctx.sig, sumAbs1 );
+
+        sumGt1  = _mm_packs_epi32( sumGt1, sumGt1 );
+        sumGt1  = _mm_packs_epi16( sumGt1, sumGt1 );
+        _mm_storeu_si32( curr.ctx.cff, sumGt1 );
+
+        curr.cffBitsCtxOffset = scanInfo.gtxCtxOffsetNext;
+      }
+    }
+
+    static inline void updateStatesEOS(const ScanInfo &scanInfo, const Decisions &decisions, StateMem& prev, const StateMem& skip, StateMem& curr, CommonCtx<vext> &commonCtx) // variant of updateStates that also flushes the levels into commonCtx and restarts val / tpl / sum; predecessors may come from prev or skip
+    {
+      bool rem_reg_all_gte_4 = true;
+
+      int8_t s[4] = { 0 }, l[4] = { 0 }; // s: predecessor ids used for the shuffle, l: capped levels
+
+      for( int i = 0; i < 4; ++i )
+      {
+        s[i]              = decisions.prevId[i] >= 4 ? -2 : decisions.prevId[i]; // skip-state predecessors (>= 4) inherit nothing from prev
+        l[i]              = s[i] > -2 ? std::min<int>( decisions.absLevel[i], 254 + ( decisions.absLevel[i] & 1 ) ) : 0; // cap at 254 / 255 keeping parity
+        curr.rdCost[i]    = decisions.rdCost[i];
+      }
+
+      {
+        const int ctxSize = 16 * 4;
+        const int regSize = 16;
+
+        __m128i vshuf     = _mm_loadu_si32( s );
+                vshuf     = _mm_shuffle_epi32( vshuf, 0 );
+        __m128i vshufmask = _mm_cmplt_epi8 ( vshuf, _mm_setzero_si128() );
+        vshuf             = _mm_add_epi8   ( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) ); // byte index of the predecessor inside each group of 4 states
+        vshuf             = _mm_blendv_epi8( vshuf, _mm_set1_epi8( -1 ), vshufmask ); // -1 makes _mm_shuffle_epi8 produce 0
+
+        for( int i = 0; i < ctxSize; i += regSize ) // only val is inherited here; tpl and sum are cleared further down
+        {
+          __m128i vval = _mm_loadu_si128( ( const __m128i* ) &prev.val[i] );
+          vval = _mm_shuffle_epi8( vval, vshuf );
+          _mm_storeu_si128( ( __m128i* ) &curr.val[i], vval );
+        }
+
+        __m128i numSig = _mm_loadu_si32( prev.numSig );
+        numSig = _mm_shuffle_epi8( numSig, vshuf );
+        __m128i lvls   = _mm_loadu_si32( l );
+        lvls   = _mm_cmpgt_epi8( lvls, _mm_setzero_si128() ); // NOTE(review): l[] is int8_t and this compare is signed, so a capped level >= 128 is not counted - confirm this is intended
+        numSig = _mm_subs_epi8( numSig, lvls ); // subtracting -1 increments the count (saturating)
+        _mm_storeu_si32( curr.numSig, numSig );
+      }
+
+      {
+        __m128i lvl1 = _mm_loadu_si32( l );
+        int addr = ( scanInfo.insidePos << 2 );
+        _mm_storeu_si32( &curr.val[addr], lvl1 ); // record the capped levels of this position
+      }
+
+      commonCtx.updateAllLvls( scanInfo, curr ); // copy the finished val[] into the per-state levels buffers
+
+      memset( curr.val, 0, sizeof( curr.val ) ); // start the next sub-block from empty per-position data
+      memset( curr.tpl, 0, sizeof( curr.tpl ) );
+      memset( curr.sum, 0, sizeof( curr.sum ) );
+
+      for( int i = 0; i < 4; i++ )
+      {
+        int prevId = decisions.prevId[i];
+        int level  = decisions.absLevel[i];
+
+        if( prevId > -2 ) // entries with prevId <= -2 are left untouched
+        {
+          int remRegBins = 0;
+
+          if( prevId  >= 4 ) // predecessor is a skip state: take its budget unchanged
+          {
+            CHECKD( level != 0, "cannot happen" );
+            remRegBins = skip.remRegBins[prevId - 4];
+          }
+          else if( prevId >= 0 ) // regular predecessor: one bin, plus 1 or 3 more depending on the level
+          {
+            remRegBins = prev.remRegBins[prevId] - 1;
+            if( remRegBins >= 4 )
+            {
+              remRegBins -= ( level < 2 ? level : 3 );
+            }
+          }
+          else // prevId == -1: no predecessor, restart from the initial budget
+          {
+            remRegBins = curr.initRemRegBins;
+            if( remRegBins >= 4 )
+            {
+              remRegBins -= ( level < 2 ? level : 3 );
+            }
+          }
+
+          curr.remRegBins[i] = remRegBins;
+
+          const int refId = prevId < 0 ? -1 : ( prevId < 4 ? prev.refSbbCtxId[prevId] : prevId - 4 ); // which previous SbbCtx to inherit from, -1 for none
+          commonCtx.update( scanInfo, refId, i, curr ); // reads curr.numSig[i], so it must run before the memset below
+
+          rem_reg_all_gte_4 &= remRegBins >= 4;
+        }
+      }
+
+      curr.anyRemRegBinsLt4 = !rem_reg_all_gte_4;
+      memset( curr.numSig, 0, sizeof( curr.numSig ) ); // counts restart for the next sub-block
+
+      { // derive the context indices of the next position from its packed template value
+        __m128i ones    = _mm_set1_epi32( 1 );
+        __m128i tplAcc  = _mm_loadu_si128( ( __m128i * ) &curr.tpl[ ( scanInfo.nextInsidePos << 2 ) ] );
+        tplAcc          = _mm_cvtepu8_epi32( tplAcc ); // only the low 4 bytes (one per state) are used
+
+        __m128i sumAbs1 = _mm_and_si128 ( tplAcc, _mm_set1_epi32( 31 ) ); // low 5 bits: capped abs sum
+        __m128i sumNum  = _mm_srli_epi32( tplAcc, 5 ); // upper bits: number of non-zero neighbours
+        __m128i sumGt1  = _mm_sub_epi32 ( sumAbs1, sumNum );
+        sumGt1  = _mm_min_epi32( sumGt1, _mm_set1_epi32( 4 ) );
+        sumGt1  = _mm_add_epi32( _mm_set1_epi32( scanInfo.gtxCtxOffsetNext ), sumGt1 );
+
+        sumAbs1 = _mm_add_epi32( sumAbs1, ones );
+        sumAbs1 = _mm_srai_epi32( sumAbs1, 1 );
+        sumAbs1 = _mm_min_epi32( sumAbs1, _mm_set1_epi32( 3 ) );
+
+        sumAbs1 = _mm_add_epi32( _mm_set1_epi32( scanInfo.sigCtxOffsetNext ), sumAbs1 );
+        sumAbs1 = _mm_packs_epi32( sumAbs1, sumAbs1 );
+        sumAbs1 = _mm_packs_epi16( sumAbs1, sumAbs1 );
+        _mm_storeu_si32( curr.ctx.sig, sumAbs1 );
+
+        sumGt1  = _mm_packs_epi32( sumGt1, sumGt1 );
+        sumGt1  = _mm_packs_epi16( sumGt1, sumGt1 );
+        _mm_storeu_si32( curr.ctx.cff, sumGt1 );
+
+        curr.cffBitsCtxOffset = scanInfo.gtxCtxOffsetNext;
+      }
+    }
+
+    // Resets this state's entries in the shared StateMem and its own Rice parameters.
+    inline void init( StateMem &state )
+    {
+      const int id = m_stateId;
+      m_goRiceZero            = 0;
+      m_goRicePar             = 0;
+      state.cffBitsCtxOffset  = 0; // single field, not indexed by state id
+      state.remRegBins [id]   = 4;
+      state.refSbbCtxId[id]   = -1;
+      state.numSig     [id]   = 0;
+      state.ctx.sig    [id]   = 0;
+      state.ctx.cff    [id]   = 0;
+      state.rdCost     [id]   = rdCostInit;
+    }
+
+    // Scores three continuations of this state: level A and the zero level compete for
+    // decisions[idxAZ], level B for decisions[idxB]; only a strictly cheaper candidate overwrites an entry.
+    void checkRdCosts( const ScanPosType spt, const PQData& pqDataA, const PQData& pqDataB, Decisions& decisions, int idxAZ, int idxB, const StateMem& state ) const
+    {
+      const int       id       = m_stateId;
+      const int32_t*  riceBits = g_goRiceBits[m_goRicePar];
+      int64_t         costZ    = state.rdCost[id];
+      int64_t         costA    = state.rdCost[id] + pqDataA.deltaDist;
+      int64_t         costB    = state.rdCost[id] + pqDataB.deltaDist;
+
+      if( state.remRegBins[id] < 4 )
+      {
+        // fewer than 4 regular bins left: fixed ( 1 << SCALE_BITS ) term plus a Rice table lookup
+        auto riceIdx = [this]( const PQData& pq )
+        {
+          return pq.absLevel <= m_goRiceZero ? pq.absLevel - 1 : std::min<int>( pq.absLevel, RICEMAX - 1 );
+        };
+        costA += ( 1 << SCALE_BITS ) + riceBits[ riceIdx( pqDataA ) ];
+        costB += ( 1 << SCALE_BITS ) + riceBits[ riceIdx( pqDataB ) ];
+        costZ += riceBits[ m_goRiceZero ];
+      }
+      else
+      {
+        const CoeffFracBits &gtxBits = m_gtxFracBitsArray[state.ctx.cff[id]];
+        const BinFracBits    sigBits = m_sigFracBitsArray[state.ctx.sig[id]];
+        // level bits of one candidate; levels >= 4 additionally pay for a Rice-coded remainder
+        auto addLevelBits = [&]( int64_t& cost, const PQData& pq )
+        {
+          if( pq.absLevel < 4 )
+          {
+            cost += gtxBits.bits[ pq.absLevel ];
+            return;
+          }
+          const unsigned rem = ( pq.absLevel - 4 ) >> 1;
+          cost += gtxBits.bits[ pq.absLevel - ( rem << 1 ) ] + riceBits[ std::min<unsigned>( rem, RICEMAX - 1 ) ];
+        };
+        addLevelBits( costA, pqDataA );
+        addLevelBits( costB, pqDataB );
+
+        if( spt == SCAN_SOCSBB )
+        {
+          costA += state.sbbBits1[id] + sigBits.intBits[ 1 ];
+          costB += state.sbbBits1[id] + sigBits.intBits[ 1 ];
+          costZ += state.sbbBits1[id] + sigBits.intBits[ 0 ];
+        }
+        else if( spt == SCAN_ISCSBB || state.numSig[id] )
+        {
+          costA += sigBits.intBits[ 1 ];
+          costB += sigBits.intBits[ 1 ];
+          costZ += sigBits.intBits[ 0 ];
+        }
+        else
+        {
+          costZ = rdCostInit; // the zero candidate is priced out in this case
+        }
+      }
+
+      // A takes the shared slot only if it beats both the zero candidate and the stored cost
+      if( costA < costZ && costA < decisions.rdCost[idxAZ] )
+      {
+        decisions.prevId  [idxAZ] = m_stateId;
+        decisions.absLevel[idxAZ] = pqDataA.absLevel;
+        decisions.rdCost  [idxAZ] = costA;
+      }
+      else if( costZ < decisions.rdCost[idxAZ] )
+      {
+        decisions.prevId  [idxAZ] = m_stateId;
+        decisions.absLevel[idxAZ] = 0;
+        decisions.rdCost  [idxAZ] = costZ;
+      }
+
+      if( costB < decisions.rdCost[idxB] )
+      {
+        decisions.prevId  [idxB] = m_stateId;
+        decisions.absLevel[idxB] = pqDataB.absLevel;
+        decisions.rdCost  [idxB] = costB;
+      }
+    }
+
+    // Vectorised evaluation of the A, B and zero candidates of all four states in one pass. Every decisions entry is overwritten without being read, so this has to be the first check (no decision made yet).
+    static void checkAllRdCosts( const ScanPosType spt, State* states, const PQData* pqData, Decisions& decisions, const StateMem& state )
+    {
+      // State mapping ("X from s" = candidate X continuing state s, pqN = pqData[N])
+      // decision 0: either A from 0 (pq0), or B from 1 (pq2), or 0 from 0
+      // decision 1: either A from 2 (pq3), or B from 3 (pq1), or 0 from 2
+      // decision 2: either A from 1 (pq0), or B from 0 (pq2), or 0 from 1
+      // decision 3: either A from 3 (pq3), or B from 2 (pq1), or 0 from 3
+
+      __m128i mrd01 = _mm_loadu_si128( ( const __m128i* ) &state.rdCost[0] );
+      __m128i mrd23 = _mm_loadu_si128( ( const __m128i* ) &state.rdCost[2] );
+
+      // scalar equivalent per state:  rdCostA = state.rdCost[m_stateId] + pqDataA.deltaDist;
+      //                               rdCostB = state.rdCost[m_stateId] + pqDataB.deltaDist;
+      //                               rdCostZ = state.rdCost[m_stateId];
+      __m128i rdCostZ01 = _mm_unpacklo_epi64( mrd01, mrd23 ); // { rdCost[0], rdCost[2] }: zero candidates of decisions 0, 1
+      __m128i rdCostZ23 = _mm_unpackhi_epi64( mrd01, mrd23 ); // { rdCost[1], rdCost[3] }: zero candidates of decisions 2, 3
+      __m128i deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[2].deltaDist ), _mm_loadu_si64( &pqData[1].deltaDist ) );
+      __m128i rdCostB01 = _mm_add_epi64( rdCostZ23, deltaDist ); // B of decisions 0, 1 continues states 1, 3
+      __m128i rdCostB23 = _mm_add_epi64( rdCostZ01, deltaDist ); // B of decisions 2, 3 continues states 0, 2
+              deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[0].deltaDist ), _mm_loadu_si64( &pqData[3].deltaDist ) );
+      __m128i rdCostA01 = _mm_add_epi64( rdCostZ01, deltaDist ); // A of decisions 0, 1 continues states 0, 2
+      __m128i rdCostA23 = _mm_add_epi64( rdCostZ23, deltaDist ); // A of decisions 2, 3 continues states 1, 3
+
+      // scalar equivalent (see checkRdCosts): cffBits = m_gtxFracBitsArray[state.ctx.cff[m_stateId]], sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]]
+      //   rdCostA += cffBits.bits[ pqDataA.absLevel ];
+      //   rdCostB += cffBits.bits[ pqDataB.absLevel ];
+      // NOTE(review): unlike checkRdCosts() there is no remRegBins test and no Rice term for levels >= 4 here; presumably the caller only takes this path when neither applies - confirm.
+      // The 64 bits loaded per state below are presumably intBits[0] and intBits[1] of its BinFracBits entry - confirm against the BinFracBits layout.
+      __m128i sgbts02   = _mm_unpacklo_epi64( _mm_loadu_si64( &states[0].m_sigFracBitsArray[state.ctx.sig[0]] ),
+                                              _mm_loadu_si64( &states[2].m_sigFracBitsArray[state.ctx.sig[2]] ) );
+      __m128i sgbts13   = _mm_unpacklo_epi64( _mm_loadu_si64( &states[1].m_sigFracBitsArray[state.ctx.sig[1]] ),
+                                              _mm_loadu_si64( &states[3].m_sigFracBitsArray[state.ctx.sig[3]] ) );
+
+      {
+        __m128i sgbts02_0 = _mm_shuffle_epi32( sgbts02, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
+        __m128i sgbts02_1 = _mm_shuffle_epi32( sgbts02, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
+        __m128i sgbts13_0 = _mm_shuffle_epi32( sgbts13, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
+        __m128i sgbts13_1 = _mm_shuffle_epi32( sgbts13, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
+
+        sgbts02 = _mm_unpacklo_epi64( sgbts02_0, sgbts02_1 ); // 32-bit lanes: { first dword (s0), first dword (s2), second dword (s0), second dword (s2) }
+        sgbts13 = _mm_unpacklo_epi64( sgbts13_0, sgbts13_1 ); // same layout for states 1 and 3
+      }
+
+      {
+        // coeff context is independent of state: the table of states[0] is used for all four lanes
+        auto &base = states->m_gtxFracBitsArray;
+
+        int32_t cffBitsArr[4] =
+        {
+          base[state.ctx.cff[1]].bits[pqData[2].absLevel],
+          base[state.ctx.cff[3]].bits[pqData[1].absLevel],
+          base[state.ctx.cff[0]].bits[pqData[2].absLevel],
+          base[state.ctx.cff[2]].bits[pqData[1].absLevel],
+        };
+
+        __m128i cffBits = _mm_loadu_si128( ( const __m128i* ) cffBitsArr );
+        __m128i add     = _mm_cvtepi32_epi64( cffBits ); // sign-extend the low two entries to 64 bit
+        rdCostB01 = _mm_add_epi64( rdCostB01, add );
+        add       = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) ); // high two entries
+        rdCostB23 = _mm_add_epi64( rdCostB23, add );
+      }
+
+      {
+        // coeff context is independent of state: the table of states[0] is used for all four lanes
+        auto &base = states->m_gtxFracBitsArray;
+
+        int32_t cffBitsArr[4] =
+        {
+          base[state.ctx.cff[0]].bits[pqData[0].absLevel],
+          base[state.ctx.cff[2]].bits[pqData[3].absLevel],
+          base[state.ctx.cff[1]].bits[pqData[0].absLevel],
+          base[state.ctx.cff[3]].bits[pqData[3].absLevel],
+        };
+
+        __m128i cffBits = _mm_loadu_si128( ( const __m128i* ) cffBitsArr );
+        __m128i add     = _mm_cvtepi32_epi64( cffBits );
+        rdCostA01 = _mm_add_epi64( rdCostA01, add );
+        add       = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) );
+        rdCostA23 = _mm_add_epi64( rdCostA23, add );
+      }
+
+      if( spt == SCAN_ISCSBB )
+      {
+        //  rdCostZ += sigBits.intBits[ 0 ];
+        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
+        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
+
+        sgbts02   = _mm_unpackhi_epi64( sgbts02, sgbts02 ); // move the second dwords into the low half
+        sgbts13   = _mm_unpackhi_epi64( sgbts13, sgbts13 );
+
+        //  rdCostB += sigBits.intBits[ 1 ];
+        rdCostB01 = _mm_add_epi64( rdCostB01, _mm_cvtepi32_epi64( sgbts13 ) );
+        rdCostB23 = _mm_add_epi64( rdCostB23, _mm_cvtepi32_epi64( sgbts02 ) );
+
+        //  rdCostA += sigBits.intBits[ 1 ];
+        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts02 ) );
+        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts13 ) );
+      }
+      else if( spt == SCAN_SOCSBB )
+      {
+        //  rdCostA += state.sbbBits1[m_stateId] + sigBits.intBits[ 1 ];
+        //  rdCostB += state.sbbBits1[m_stateId] + sigBits.intBits[ 1 ];
+        //  rdCostZ += state.sbbBits1[m_stateId] + sigBits.intBits[ 0 ];
+        __m128i sbbBits = _mm_loadu_si128( ( const __m128i* ) state.sbbBits1 );
+        sbbBits = _mm_shuffle_epi32( sbbBits, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); // { sbbBits1[0], sbbBits1[2], sbbBits1[1], sbbBits1[3] }
+
+        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
+        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
+
+        __m128i add = _mm_cvtepi32_epi64( sbbBits ); // entries of states 0 and 2
+        rdCostB23 = _mm_add_epi64( rdCostB23, add );
+        rdCostA01 = _mm_add_epi64( rdCostA01, add );
+        rdCostZ01 = _mm_add_epi64( rdCostZ01, add );
+        add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( sbbBits, sbbBits ) ); // entries of states 1 and 3
+        rdCostB01 = _mm_add_epi64( rdCostB01, add );
+        rdCostA23 = _mm_add_epi64( rdCostA23, add );
+        rdCostZ23 = _mm_add_epi64( rdCostZ23, add );
+
+        sgbts02   = _mm_unpackhi_epi64( sgbts02, sgbts02 );
+        sgbts13   = _mm_unpackhi_epi64( sgbts13, sgbts13 );
+        rdCostB01 = _mm_add_epi64( rdCostB01, _mm_cvtepi32_epi64( sgbts13 ) );
+        rdCostB23 = _mm_add_epi64( rdCostB23, _mm_cvtepi32_epi64( sgbts02 ) );
+
+        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts02 ) );
+        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts13 ) );
+      }
+      else
+      {
+        // scalar equivalent of this branch:
+        //if( state.numSig[m_stateId] )
+        //{
+        //  rdCostA += sigBits.intBits[ 1 ];
+        //  rdCostB += sigBits.intBits[ 1 ];
+        //  rdCostZ += sigBits.intBits[ 0 ];
+        //}
+        //else
+        //{
+        //  rdCostZ = rdCostInit;
+        //}
+
+        __m128i numSig = _mm_loadu_si32( state.numSig ); // four bytes, one per state
+
+        rdCostZ01 = _mm_add_epi64(  rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
+        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
+
+        __m128i mask13 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3 ) ); // numSig[1] -> low qword, numSig[3] -> high qword
+        mask13    = _mm_cmpgt_epi8( mask13, _mm_setzero_si128() ); // all-ones qword where the (signed) byte is > 0
+        __m128i mask02 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2 ) ); // numSig[0] -> low qword, numSig[2] -> high qword
+        mask02    = _mm_cmpgt_epi8( mask02, _mm_setzero_si128() );
+
+        sgbts02   = _mm_unpackhi_epi64( sgbts02, sgbts02 );
+        sgbts13   = _mm_unpackhi_epi64( sgbts13, sgbts13 );
+
+        rdCostB01 = _mm_add_epi64( rdCostB01, _mm_and_si128( mask13, _mm_cvtepi32_epi64( sgbts13 ) ) ); // sig bits are added only where the mask is set
+        rdCostB23 = _mm_add_epi64( rdCostB23, _mm_and_si128( mask02, _mm_cvtepi32_epi64( sgbts02 ) ) );
+
+        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_and_si128( mask02, _mm_cvtepi32_epi64( sgbts02 ) ) );
+        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_and_si128( mask13, _mm_cvtepi32_epi64( sgbts13 ) ) );
+
+        __m128i rdMax = _mm_loadu_si64( &rdCostInit );
+        rdMax = _mm_unpacklo_epi64( rdMax, rdMax ); // rdCostInit in both qwords
+
+        rdCostZ01 = _mm_blendv_epi8( rdMax, rdCostZ01, mask02 ); // keep Z where the mask is set, rdCostInit elsewhere
+        rdCostZ23 = _mm_blendv_epi8( rdMax, rdCostZ23, mask13 );
+      }
+
+      // decision 0: either A from 0 (pq0), or B from 1 (pq2), or 0 from 0
+      // decision 1: either A from 2 (pq3), or B from 3 (pq1), or 0 from 2
+      // decision 2: either A from 1 (pq0), or B from 0 (pq2), or 0 from 1
+      // decision 3: either A from 3 (pq3), or B from 2 (pq1), or 0 from 3
+      // order of evaluation per decision (initial best first, a later candidate replaces it only via the greater-than compare): Z0, or A0, or B0
+      // Z1, or A1, or B1
+      // B2, or Z2, or A2
+      // B3, or Z3, or A3
+
+      __m128i rdBest01 = rdCostZ01;
+      __m128i rdBest23 = rdCostB23;
+
+      __m128i valBest = _mm_setr_epi32(                  0,                  0, pqData[2].absLevel, pqData[1].absLevel );
+      __m128i valCand = _mm_setr_epi32( pqData[0].absLevel, pqData[3].absLevel,                  0,                  0 );
+
+      __m128i idxBest = _mm_setr_epi32( 0, 2, 0, 2 ); // source state of the initial best, in decision order
+      __m128i idxCand = _mm_setr_epi32( 0, 2, 1, 3 ); // source state of the first challenger
+
+      __m128i chng01 = _my_cmpgt_epi64( rdBest01, rdCostA01 ); // presumably a per-qword signed "best > candidate" mask - helper defined outside this view
+      __m128i chng23 = _my_cmpgt_epi64( rdBest23, rdCostZ23 );
+      __m128i chng   = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // imm8 = 0xCC (bits LSB first: 00110011): 32-bit lanes 1 and 3 come from chng23
+      chng           = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); // swap lanes 1 and 2 -> one 32-bit mask per decision, in decision order
+
+      rdBest01 = _mm_blendv_epi8( rdBest01, rdCostA01, chng01 );
+      rdBest23 = _mm_blendv_epi8( rdBest23, rdCostZ23, chng23 );
+
+      valBest = _mm_blendv_epi8( valBest, valCand, chng );
+      idxBest = _mm_blendv_epi8( idxBest, idxCand, chng );
+
+      // second round: B challenges decisions 0, 1 and A challenges decisions 2, 3
+      valCand = _mm_setr_epi32( pqData[2].absLevel, pqData[1].absLevel, pqData[0].absLevel, pqData[3].absLevel );
+      idxCand = _mm_setr_epi32( 1, 3, 1, 3 );
+
+      chng01 = _my_cmpgt_epi64( rdBest01, rdCostB01 );
+      chng23 = _my_cmpgt_epi64( rdBest23, rdCostA23 );
+      chng   = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // imm8 = 0xCC, as above
+      chng   = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
+
+      rdBest01 = _mm_blendv_epi8( rdBest01, rdCostB01, chng01 );
+      rdBest23 = _mm_blendv_epi8( rdBest23, rdCostA23, chng23 );
+
+      valBest = _mm_blendv_epi8( valBest, valCand, chng );
+      idxBest = _mm_blendv_epi8( idxBest, idxCand, chng );
+
+      // narrow with signed saturation: levels to 4 x 16 bit (8 bytes stored), state ids to 4 x 8 bit (4 bytes stored)
+      valBest = _mm_packs_epi32( valBest, _mm_setzero_si128() );
+      idxBest = _mm_packs_epi32( idxBest, _mm_setzero_si128() );
+      idxBest = _mm_packs_epi16( idxBest, _mm_setzero_si128() );
+
+
+      _mm_storeu_si128( ( __m128i* ) &decisions.rdCost[0], rdBest01 );
+      _mm_storeu_si128( ( __m128i* ) &decisions.rdCost[2], rdBest23 );
+
+      _mm_storeu_si64( decisions.absLevel, valBest ); // assumes absLevel holds four 16-bit entries - TODO confirm against Decisions
+      _mm_storeu_si32( decisions.prevId,   idxBest ); // assumes prevId holds four 8-bit entries - TODO confirm against Decisions
+    }
+
+    // Variant of checkRdCosts() for a single non-zero candidate whose absolute level is 1 (see the
+    // CHECKD below): that candidate competes for decisions[idxA], the zero level for decisions[idxZ].
+    void checkRdCostsOdd1( const ScanPosType spt, const PQData& pqDataA, Decisions& decisions, int idxA, int idxZ, const StateMem& state ) const
+    {
+      CHECKD( pqDataA.absLevel != 1, "" );
+
+      const int id    = m_stateId;
+      int64_t   costZ = state.rdCost[id];
+      int64_t   costA = state.rdCost[id] + pqDataA.deltaDist;
+
+      if( state.remRegBins[id] < 4 )
+      {
+        // fewer than 4 regular bins left: both candidates are priced from the Rice table
+        const int32_t* riceBits = g_goRiceBits[m_goRicePar];
+
+        costA += ( 1 << SCALE_BITS ) + riceBits[0];
+        costZ += riceBits[m_goRiceZero];
+      }
+      else
+      {
+        const BinFracBits sigBits = m_sigFracBitsArray[state.ctx.sig[id]];
+
+        costA += m_gtxFracBitsArray[state.ctx.cff[id]].bits[1];
+
+        if( spt == SCAN_SOCSBB )
+        {
+          costA += state.sbbBits1[id] + sigBits.intBits[ 1 ];
+          costZ += state.sbbBits1[id] + sigBits.intBits[ 0 ];
+        }
+        else if( spt == SCAN_ISCSBB || state.numSig[id] )
+        {
+          costA += sigBits.intBits[ 1 ];
+          costZ += sigBits.intBits[ 0 ];
+        }
+        else
+        {
+          costZ = rdCostInit; // the zero candidate is priced out in this case
+        }
+      }
+
+      // the two candidates target different decision slots, so they are tested independently
+      if( costA < decisions.rdCost[idxA] )
+      {
+        decisions.prevId  [idxA] = m_stateId;
+        decisions.absLevel[idxA] = pqDataA.absLevel;
+        decisions.rdCost  [idxA] = costA;
+      }
+
+      if( costZ < decisions.rdCost[idxZ] )
+      {
+        decisions.prevId  [idxZ] = m_stateId;
+        decisions.absLevel[idxZ] = 0;
+        decisions.rdCost  [idxZ] = costZ;
+      }
+    }
+
+    // Vectorised evaluation of the "level 1" (A) and zero candidates of all four states; only pqData[1] and pqData[2] are read and the stored level is the constant 1. Every decisions entry is overwritten without being read, so this has to be the first check.
+    static void checkAllRdCostsOdd1( const ScanPosType spt, State* states, const PQData* pqData, Decisions& decisions, const StateMem& state )
+    {
+      // State mapping ("X from s" = candidate X continuing state s)
+      // decision 0: either 1 from 1 (pqData[2]), or 0 from 0
+      // decision 1: either 1 from 3 (pqData[1]), or 0 from 2
+      // decision 2: either 1 from 0 (pqData[2]), or 0 from 1
+      // decision 3: either 1 from 2 (pqData[1]), or 0 from 3
+
+      __m128i mrd01 = _mm_loadu_si128( ( const __m128i* ) &state.rdCost[0] );
+      __m128i mrd23 = _mm_loadu_si128( ( const __m128i* ) &state.rdCost[2] );
+
+      // scalar equivalent per state:  rdCostA = state.rdCost[m_stateId] + pqDataA.deltaDist;
+      //                               rdCostZ = state.rdCost[m_stateId];
+      __m128i rdCostZ01 = _mm_unpacklo_epi64( mrd01, mrd23 ); // { rdCost[0], rdCost[2] }: zero candidates of decisions 0, 1
+      __m128i rdCostZ23 = _mm_unpackhi_epi64( mrd01, mrd23 ); // { rdCost[1], rdCost[3] }: zero candidates of decisions 2, 3
+      __m128i deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[2].deltaDist ), _mm_loadu_si64( &pqData[1].deltaDist ) );
+      __m128i rdCostA01 = _mm_add_epi64( rdCostZ23, deltaDist ); // A of decisions 0, 1 continues states 1, 3
+      __m128i rdCostA23 = _mm_add_epi64( rdCostZ01, deltaDist ); // A of decisions 2, 3 continues states 0, 2
+
+      // scalar equivalent (see checkRdCostsOdd1): sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]];
+      //   rdCostA += m_gtxFracBitsArray[state.ctx.cff[m_stateId]].bits[1];
+      // NOTE(review): there is no remRegBins test here, unlike checkRdCostsOdd1(); presumably the caller only takes this path when all states have remRegBins >= 4 - confirm.
+      // The 64 bits loaded per state below are presumably intBits[0] and intBits[1] of its BinFracBits entry - confirm against the BinFracBits layout.
+      __m128i sgbts02   = _mm_unpacklo_epi64( _mm_loadu_si64( &states[0].m_sigFracBitsArray[state.ctx.sig[0]] ),
+                                              _mm_loadu_si64( &states[2].m_sigFracBitsArray[state.ctx.sig[2]] ) );
+      __m128i sgbts13   = _mm_unpacklo_epi64( _mm_loadu_si64( &states[1].m_sigFracBitsArray[state.ctx.sig[1]] ),
+                                              _mm_loadu_si64( &states[3].m_sigFracBitsArray[state.ctx.sig[3]] ) );
+
+      {
+        __m128i sgbts02_0 = _mm_shuffle_epi32( sgbts02, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
+        __m128i sgbts02_1 = _mm_shuffle_epi32( sgbts02, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
+        __m128i sgbts13_0 = _mm_shuffle_epi32( sgbts13, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) );
+        __m128i sgbts13_1 = _mm_shuffle_epi32( sgbts13, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) );
+
+        sgbts02 = _mm_unpacklo_epi64( sgbts02_0, sgbts02_1 ); // 32-bit lanes: { first dword (s0), first dword (s2), second dword (s0), second dword (s2) }
+        sgbts13 = _mm_unpacklo_epi64( sgbts13_0, sgbts13_1 ); // same layout for states 1 and 3
+      }
+
+      {
+#if USE_AVX2
+        __m128i cffidx = _mm_cvtepi8_epi32( _mm_loadu_si32( &state.ctx.cff ) ); // four context bytes widened to 32-bit lanes
+        cffidx = _mm_shuffle_epi32( cffidx, ( 1 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) ); // lane order: states 1, 3, 0, 2
+        cffidx = _mm_sub_epi8( cffidx, _mm_set1_epi32( state.cffBitsCtxOffset ) ); // make the index relative to the loaded window
+        __m256i cffBits256 = _mm256_loadu_si256( ( const __m256i* ) &state.cffBits1[state.cffBitsCtxOffset] ); // reads eight 32-bit entries starting at the offset
+        cffBits256 = _mm256_permutevar8x32_epi32( cffBits256, _mm256_castsi128_si256( cffidx ) ); // only the low four result lanes are used
+        __m128i cffBits = _mm256_castsi256_si128( cffBits256 );
+#else
+        __m128i cffBits;
+        __m128i bits0123 = _mm_loadu_si128( ( const __m128i* ) &state.cffBits1[state.cffBitsCtxOffset + 0] ); // entries 0..3 relative to the offset
+        __m128i bits4    = _mm_loadu_si32 (                    &state.cffBits1[state.cffBitsCtxOffset + 4] ); // entry 4
+        __m128i cfCtxIdx = _mm_loadu_si32 (                    &state.ctx.cff );
+        cfCtxIdx = _mm_cvtepi8_epi32( cfCtxIdx );
+        cfCtxIdx = _mm_sub_epi8( cfCtxIdx, _mm_set1_epi32(      state.cffBitsCtxOffset ) ); // relative entry index v per state
+        cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_slli_si128( cfCtxIdx, 1 ) ); // these two steps replicate v into all four bytes of its lane
+        cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_slli_si128( cfCtxIdx, 2 ) );
+        cfCtxIdx = _mm_slli_epi32( cfCtxIdx, 2 ); // v -> 4 * v in every byte (byte offset of the entry)
+        cfCtxIdx = _mm_add_epi8( cfCtxIdx, _mm_setr_epi8( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ) ); // byte k of entry v -> index 4 * v + k
+        cffBits  = _mm_shuffle_epi8( bits4, _mm_sub_epi8( cfCtxIdx, _mm_set1_epi8( 16 ) ) ); // indices 16..19 pick entry 4, lower indices go negative and yield zero
+        cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_cmpgt_epi8( cfCtxIdx, _mm_set1_epi8( 15 ) ) ); // force indices > 15 to 0xFF so the next shuffle yields zero for them
+        cffBits  = _mm_or_si128( cffBits, _mm_shuffle_epi8( bits0123, cfCtxIdx ) );
+        cffBits  = _mm_shuffle_epi32( cffBits, ( 1 << 0 ) + ( 3 << 2 ) +( 0 << 4 ) + ( 2 << 6 ) ); // lane order: states 1, 3, 0, 2
+#endif
+        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( cffBits ) );
+        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) ) );
+      }
+
+      if( spt == SCAN_ISCSBB )
+      {
+        //  rdCostZ += sigBits.intBits[ 0 ];
+        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
+        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
+
+        sgbts02   = _mm_unpackhi_epi64( sgbts02, sgbts02 ); // move the second dwords into the low half
+        sgbts13   = _mm_unpackhi_epi64( sgbts13, sgbts13 );
+
+        //  rdCostA += sigBits.intBits[ 1 ];
+        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts13 ) );
+        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts02 ) );
+      }
+      else if( spt == SCAN_SOCSBB )
+      {
+        //  rdCostZ += state.sbbBits1[m_stateId] + sigBits.intBits[ 0 ];
+        //  rdCostA += state.sbbBits1[m_stateId] + sigBits.intBits[ 1 ];
+        __m128i sbbBits = _mm_loadu_si128( ( const __m128i* ) state.sbbBits1 );
+        sbbBits = _mm_shuffle_epi32( sbbBits, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); // { sbbBits1[0], sbbBits1[2], sbbBits1[1], sbbBits1[3] }
+
+        rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
+        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
+        
+        __m128i add = _mm_cvtepi32_epi64( sbbBits ); // entries of states 0 and 2
+        rdCostA23 = _mm_add_epi64( rdCostA23, add );
+        rdCostZ01 = _mm_add_epi64( rdCostZ01, add );
+        add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( sbbBits, sbbBits ) ); // entries of states 1 and 3
+        rdCostA01 = _mm_add_epi64( rdCostA01, add );
+        rdCostZ23 = _mm_add_epi64( rdCostZ23, add );
+
+        sgbts02   = _mm_unpackhi_epi64( sgbts02, sgbts02 );
+        sgbts13   = _mm_unpackhi_epi64( sgbts13, sgbts13 );
+        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts13 ) );
+        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts02 ) );
+      }
+      else
+      {
+        // scalar equivalent of this branch:
+        //if( state.numSig[m_stateId] )
+        //{
+        //  rdCostA += sigBits.intBits[ 1 ];
+        //  rdCostZ += sigBits.intBits[ 0 ];
+        //}
+        //else
+        //{
+        //  rdCostZ = rdCostInit;
+        //}
+
+        __m128i numSig = _mm_loadu_si32( state.numSig ); // four bytes, one per state
+
+        rdCostZ01 = _mm_add_epi64(  rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) );
+        rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) );
+
+        __m128i mask01 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3 ) ); // numSig of states 1, 3 (the sources of rdCostA01)
+        mask01    = _mm_cmpgt_epi8( mask01, _mm_setzero_si128() ); // all-ones qword where the (signed) byte is > 0
+        __m128i mask23 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2 ) ); // numSig of states 0, 2 (the sources of rdCostA23)
+        mask23    = _mm_cmpgt_epi8( mask23, _mm_setzero_si128() );
+        sgbts02   = _mm_unpackhi_epi64( sgbts02, sgbts02 );
+        sgbts13   = _mm_unpackhi_epi64( sgbts13, sgbts13 );
+        rdCostA01 = _mm_add_epi64( rdCostA01, _mm_and_si128( mask01, _mm_cvtepi32_epi64( sgbts13 ) ) ); // sig bits are added only where the mask is set
+        rdCostA23 = _mm_add_epi64( rdCostA23, _mm_and_si128( mask23, _mm_cvtepi32_epi64( sgbts02 ) ) );
+
+        __m128i rdMax = _mm_loadu_si64( &rdCostInit );
+        rdMax = _mm_unpacklo_epi64( rdMax, rdMax ); // rdCostInit in both qwords
+
+        rdCostZ01 = _mm_blendv_epi8( rdMax, rdCostZ01, mask23 ); // Z of decisions 0, 1 comes from states 0, 2, hence mask23
+        rdCostZ23 = _mm_blendv_epi8( rdMax, rdCostZ23, mask01 ); // Z of decisions 2, 3 comes from states 1, 3, hence mask01
+      }
+
+      //// decision 0: either 1 from 1 (pqData[2]), or 0 from 0
+      //// decision 1: either 1 from 3 (pqData[1]), or 0 from 2
+      //// decision 2: either 1 from 0 (pqData[2]), or 0 from 1
+      //// decision 3: either 1 from 2 (pqData[1]), or 0 from 3
+
+      // d0: Z0, or A0   (initial best first; the second candidate replaces it only via the greater-than compare)
+      // d1: Z1, or A1
+      // d2: A2, or Z2
+      // d3: A3, or Z3
+
+      __m128i rdBest01 = rdCostZ01;
+      __m128i rdBest23 = rdCostA23;
+
+      __m128i valBest = _mm_setr_epi32( 0, 0, 1, 1 );
+      __m128i valCand = _mm_setr_epi32( 1, 1, 0, 0 );
+
+      __m128i idxBest = _mm_setr_epi32( 0, 2, 0, 2 ); // source state of the initial best, in decision order
+      __m128i idxCand = _mm_setr_epi32( 1, 3, 1, 3 ); // source state of the challenger
+
+      __m128i chng01 = _my_cmpgt_epi64( rdBest01, rdCostA01 ); // presumably a per-qword signed "best > candidate" mask - helper defined outside this view
+      __m128i chng23 = _my_cmpgt_epi64( rdBest23, rdCostZ23 );
+      __m128i chng   = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // imm8 = 0xCC (bits LSB first: 00110011): 32-bit lanes 1 and 3 come from chng23
+              chng   = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); // swap lanes 1 and 2 -> one 32-bit mask per decision, in decision order
+
+      rdBest01 = _mm_blendv_epi8( rdBest01, rdCostA01, chng01 );
+      rdBest23 = _mm_blendv_epi8( rdBest23, rdCostZ23, chng23 );
+
+      _mm_storeu_si128( ( __m128i* ) &decisions.rdCost[0], rdBest01 );
+      _mm_storeu_si128( ( __m128i* ) &decisions.rdCost[2], rdBest23 );
+
+      valBest = _mm_packs_epi32( _mm_blendv_epi8( valBest, valCand, chng ), _mm_setzero_si128() ); // narrow to 4 x 16 bit with signed saturation
+      idxBest = _mm_packs_epi32( _mm_blendv_epi8( idxBest, idxCand, chng ), _mm_setzero_si128() );
+      idxBest = _mm_packs_epi16( idxBest, _mm_setzero_si128() ); // narrow further to 4 x 8 bit
+
+      _mm_storeu_si64( decisions.absLevel, valBest ); // assumes absLevel holds four 16-bit entries - TODO confirm against Decisions
+      _mm_storeu_si32( decisions.prevId,   idxBest ); // assumes prevId holds four 8-bit entries - TODO confirm against Decisions
+    }
+
+    // Candidate without a predecessor state (prevId = -1): distortion delta + lastOffset + level bits taken from context 0 and Rice row 0.
+    inline void checkRdCostStart(int32_t lastOffset, const PQData &pqData, Decisions &decisions, int idx ) const
+    {
+      const CoeffFracBits &gtxBits = m_gtxFracBitsArray[0];
+      int64_t              cost    = pqData.deltaDist + lastOffset;
+      if( pqData.absLevel >= 4 )
+      {
+        const unsigned rem = (pqData.absLevel - 4) >> 1;
+        cost += gtxBits.bits[pqData.absLevel - (rem << 1)] + g_goRiceBits[0][rem < RICEMAX ? rem : RICEMAX-1];
+      }
+      else
+      {
+        cost += gtxBits.bits[pqData.absLevel];
+      }
+
+      if( cost < decisions.rdCost[idx] )
+      {
+        decisions.prevId  [idx] = -1;
+        decisions.absLevel[idx] = pqData.absLevel;
+        decisions.rdCost  [idx] = cost;
+      }
+    }
+
+    // Zero-level candidate costing state.rdCost + state.sbbBits0; it is recorded with prevId = 4 | state id when strictly cheaper.
+    inline void checkRdCostSkipSbb(Decisions &decisions, int idx, const StateMem& state) const
+    {
+      const int64_t skipCost = state.rdCost[m_stateId] + state.sbbBits0[m_stateId];
+      if( skipCost >= decisions.rdCost[idx] ) return; // stored decision is at least as good
+
+      decisions.prevId  [idx] = 4 | m_stateId;
+      decisions.absLevel[idx] = 0;
+      decisions.rdCost  [idx] = skipCost;
+    }
+
+    inline void checkRdCostSkipSbbZeroOut(Decisions &decisions, int idx, const StateMem& state) const
+    { // same candidate as checkRdCostSkipSbb(), but written unconditionally (no cost comparison)
+      const int id            = m_stateId;
+      decisions.prevId  [idx] = 4 | id;
+      decisions.absLevel[idx] = 0;
+      decisions.rdCost  [idx] = state.rdCost[id] + state.sbbBits0[id];
+    }
+
+    // Updates m_goRicePar from state.sum when remRegBins < 4 or ge4 is set; m_goRiceZero is refreshed only in the remRegBins < 4 case.
+    inline void setRiceParam( const ScanInfo& scanInfo, const StateMem& state, bool ge4 )
+    {
+      const bool fewBinsLeft = state.remRegBins[m_stateId] < 4;
+      if( !fewBinsLeft && !ge4 ) return;
+
+      const TCoeff sumAbs = state.sum[ ( scanInfo.insidePos << 2 ) + m_stateId ];
+      int          sumAll = ( int ) sumAbs - ( fewBinsLeft ? 0 : 4 * 5 );
+      sumAll              = sumAll > 31 ? 31 : sumAll; // clamp to [0, 31], upper bound first
+      sumAll              = sumAll <  0 ?  0 : sumAll;
+      m_goRicePar         = g_auiGoRiceParsCoeff[sumAll];
+      if( fewBinsLeft )
+      {
+        m_goRiceZero      = g_auiGoRicePosCoeff0( m_stateId, m_goRicePar );
+      }
+    }
+
+  private:
+
+    int8_t                    m_goRicePar;         // row index into g_goRiceBits; set by setRiceParam()
+    int8_t                    m_goRiceZero;        // Rice-table index used to price the zero level when remRegBins < 4
+    const int8_t              m_stateId;           // index of this state's entries in the StateMem / Decisions arrays
+    const BinFracBits*const   m_sigFracBitsArray;  // indexed by state.ctx.sig[m_stateId]
+    const CoeffFracBits*const m_gtxFracBitsArray;  // indexed by state.ctx.cff[m_stateId]
+    CommonCtx<vext>&          m_commonCtx;         // presumably the single CommonCtx owned by DepQuantSimd (see TINIT) - confirm against the constructor
+  };
+
+  /*================================================================================*/
+  /*=====                                                                      =====*/
+  /*=====   T C Q                                                              =====*/
+  /*=====                                                                      =====*/
+  /*================================================================================*/
+  template<X86_VEXT vext>
+  class DepQuantSimd : private RateEstimator, public DepQuantImpl
+  {
+  public:
+    const Decisions startDec[2] = // two initial decision sets; the constructor copies both into every m_trellis entry
+    { // NOTE(review): the initialisers presumably follow the field order rdCost, absLevel, prevId - confirm against Decisions
+      Decisions
+      {
+        { rdCostInit >> 2, rdCostInit >> 2, rdCostInit >> 2, rdCostInit >> 2 },
+        { -1, -1, -1, -1 },
+        { -2, -2, -2, -2 }, // if this is prevId: -2 fails the "prevId > -2" test used when decisions are consumed
+      }, 
+      Decisions
+      {
+        { rdCostInit >> 2, rdCostInit >> 2, rdCostInit >> 2, rdCostInit >> 2 },
+        { 0, 0, 0, 0 },
+        { 4, 5, 6, 7 }, // if this is prevId: 4 | state id, the encoding written by checkRdCostSkipSbb()
+      }
+    };
+
+#define TINIT(x) {*this,m_commonCtx,x} // brace-initialiser for one m_allStates element; x is presumably the state id (0..3) - confirm against the State constructor
+    DepQuantSimd()
+      : RateEstimator ()
+      , m_commonCtx   ()
+      , m_allStates   {TINIT(0),TINIT(1),TINIT(2),TINIT(3),TINIT(0),TINIT(1),TINIT(2),TINIT(3),TINIT(0),TINIT(1),TINIT(2),TINIT(3)} // three groups of four states
+      , m_currStates  (  m_allStates      ) // first group
+      , m_prevStates  (  m_currStates + 4 ) // second group
+      , m_skipStates  (  m_prevStates + 4 ) // third group
+    {
+      m_scansRom.init();
+
+      for( int t = 0; t < ( MAX_TB_SIZEY * MAX_TB_SIZEY ); t++ ) // pre-fill every trellis entry with both startDec decision sets
+      {
+        memcpy( m_trellis[t], startDec, sizeof( startDec ) ); // assumes m_trellis[t] is at least sizeof( startDec ) bytes - TODO confirm against its declaration
+      }
+    }
+#undef TINIT
+
+    ~DepQuantSimd() // empty body: no explicit cleanup is performed here
+    {
+    }
+
+    void init( int dqTrVal ) // forwards dqTrVal unchanged to m_quant.init()
+    {
+      m_quant.init( dqTrVal );
+    }
+
+    void quant( TransformUnit &tu, const CCoeffBuf &srcCoeff, const ComponentID compID, const QpParam &cQP, const double lambda, const Ctx &ctx, TCoeff &absSum, bool enableScalingLists, int *quantCoeff )
+    { // Trellis search for one transform block: writes the chosen levels into tu's coefficient buffer for compID, sums their magnitudes in absSum and sets tu.lastPos[compID] (-1 when no level is coded)
+      //===== reset / pre-init =====
+      const TUParameters& tuPars  = *m_scansRom.getTUPars( tu.blocks[compID], compID );
+      m_quant.initQuantBlock    ( tu, compID, cQP, lambda );
+      TCoeffSig*    qCoeff      = tu.getCoeffs( compID ).buf;
+      const TCoeff* tCoeff      = srcCoeff.buf;
+      const int     numCoeff    = tu.blocks[compID].area();
+      ::memset( qCoeff, 0x00, numCoeff * sizeof( TCoeffSig ) ); // start from an all-zero block; only positions on the chosen path are overwritten below
+      absSum                    = 0;
+
+      const CompArea& area      = tu.blocks[ compID ];
+      const uint32_t  width     = area.width;
+      const uint32_t  height    = area.height;
+      const uint32_t  lfnstIdx  = tu.cu->lfnstIdx;
+      //===== scaling matrix ====
+      //const int         qpDQ = cQP.Qp + 1;
+      //const int         qpPer = qpDQ / 6;
+      //const int         qpRem = qpDQ - 6 * qpPer;
+
+      //TCoeff thresTmp = thres;
+      bool zeroOut = false;
+      bool zeroOutforThres = false;
+      int effWidth = tuPars.m_width, effHeight = tuPars.m_height;
+      if( ( tu.mtsIdx[compID] > MTS_SKIP || ( tu.cs->sps->MTS && tu.cu->sbtInfo != 0 && tuPars.m_height <= 32 && tuPars.m_width <= 32 ) ) && compID == COMP_Y )
+      { // COMP_Y only: a dimension of exactly 32 is reduced to an effective 16; zeroOut is set when that reduction happened
+        effHeight = ( tuPars.m_height == 32 ) ? 16 : tuPars.m_height;
+        effWidth  = ( tuPars.m_width  == 32 ) ? 16 : tuPars.m_width;
+        zeroOut   = ( effHeight < tuPars.m_height || effWidth < tuPars.m_width );
+      }
+      zeroOutforThres = zeroOut || ( 32 < tuPars.m_height || 32 < tuPars.m_width ); // blocks with a dimension above 32 also skip positions during the threshold scan
+      //===== find first test position =====
+      int firstTestPos = std::min<int>( tuPars.m_width, JVET_C0024_ZERO_OUT_TH ) * std::min<int>( tuPars.m_height, JVET_C0024_ZERO_OUT_TH ) - 1;
+      if( lfnstIdx > 0 && tu.mtsIdx[compID] != MTS_SKIP && width >= 4 && height >= 4 )
+      { // lfnstIdx set: only the first 8 (4x4 and 8x8 blocks) or 16 scan positions are tested
+        firstTestPos = ( ( width == 4 && height == 4 ) || ( width == 8 && height == 8 ) )  ? 7 : 15 ;
+      }
+
+      const TCoeff defaultQuantisationCoefficient = (TCoeff)m_quant.getQScale();
+      const TCoeff thres = m_quant.getLastThreshold();
+      const int zeroOutWidth  = ( tuPars.m_width  == 32 && zeroOut ) ? 16 : 32;
+      const int zeroOutHeight = ( tuPars.m_height == 32 && zeroOut ) ? 16 : 32;
+
+      if( enableScalingLists )
+      {
+        for( ; firstTestPos >= 0; firstTestPos-- )
+        {
+          if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) ) continue;
+
+          const TCoeff thresTmp = TCoeff( thres / ( 4 * quantCoeff[tuPars.m_scanId2BlkPos[firstTestPos].idx] ) ); // threshold derived from the quantCoeff entry of this position
+
+          if( abs( tCoeff[tuPars.m_scanId2BlkPos[firstTestPos].idx] ) > thresTmp ) break;
+        }
+      }
+      else
+      {
+        const TCoeff defaultTh = TCoeff( thres / ( defaultQuantisationCoefficient << 2 ) ); // single threshold used for every position of the block
+
+#if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 )
+        // if more than one 4x4 coding subblock is available, use SIMD to find first subblock with coefficient larger than threshold
+        if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > x86_simd::SCALAR )
+        {
+          const int sbbSize = tuPars.m_sbbSize;
+          // move the pointer to the beginning of the current subblock
+          firstTestPos -= ( sbbSize - 1 );
+
+          const __m128i xdfTh = _mm_set1_epi32( defaultTh );
+
+          // for each subblock
+          for( ; firstTestPos >= 0; firstTestPos -= sbbSize )
+          {
+            // skip zeroed out blocks
+            // for 64-point transformation the coding order takes care of that
+            if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) )
+            {
+              continue;
+            }
+
+            // read first line of the subblock and check for coefficients larger than the threshold
+            // assuming the subblocks are dense 4x4 blocks in raster scan order with the stride of tuPars.m_width
+            int pos = tuPars.m_scanId2BlkPos[firstTestPos].idx;
+            __m128i xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) );
+            __m128i xdf = _mm_cmpgt_epi32( xl0, xdfTh );
+
+            // same for the next line in the subblock
+            pos += tuPars.m_width;
+            xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) );
+            xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
+
+            // and the third line
+            pos += tuPars.m_width;
+            xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) );
+            xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
+
+            // and the last line
+            pos += tuPars.m_width;
+            xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) );
+            xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
+
+            // if any of the 16 comparisons were true, break, because this subblock contains a coefficient larger than threshold
+            if( !_mm_testz_si128( xdf, xdf ) ) break;
+          }
+
+          if( firstTestPos >= 0 )
+          {
+            // if a coefficient was found, advance the pointer to the end of the current subblock
+            // for the subsequent coefficient-wise refinement (C-impl after endif)
+            firstTestPos += sbbSize - 1;
+          }
+        }
+
+#endif
+        for( ; firstTestPos >= 0; firstTestPos-- ) // coefficient-wise scan; also refines inside the subblock located by the SIMD search above
+        {
+          if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) ) continue;
+          if( abs( tCoeff[tuPars.m_scanId2BlkPos[firstTestPos].idx] ) > defaultTh ) break;
+        }
+      }
+
+      if( firstTestPos < 0 )
+      { // no coefficient exceeded its threshold: the output block stays all-zero
+        tu.lastPos[compID] = -1;
+        return;
+      }
+
+      //===== real init =====
+      RateEstimator::initCtx( tuPars, tu, compID, ctx.getFracBitsAcess() );
+      m_commonCtx.reset( tuPars, *this );
+      for( int k = 0; k < 12; k++ )
+      { // states 0-3, 4-7 and 8-11 are each initialised with one shared StateMem
+        m_allStates[k].init( m_state_mem[k>>2] );
+      }
+
+      const int numCtx = isLuma( compID ) ? 21 : 11; // number of cffBits entries copied below
+      const CoeffFracBits* const cffBits = gtxFracBits();
+      for( int i = 0; i < numCtx; i++ )
+      { // all three state memories receive the same bits[1] values
+        m_state_mem[0].cffBits1[i] = cffBits[i].bits[1];
+        m_state_mem[1].cffBits1[i] = cffBits[i].bits[1];
+        m_state_mem[2].cffBits1[i] = cffBits[i].bits[1];
+      }
+
+      int effectWidth  = std::min( 32, effWidth ); // the state memories work with dimensions clamped to 32
+      int effectHeight = std::min( 32, effHeight );
+      for (int k = 0; k < 3; k++)
+      {
+        m_state_mem[k].effWidth         = effectWidth;
+        m_state_mem[k].effHeight        = effectHeight;
+        m_state_mem[k].initRemRegBins   = ( effectWidth * effectHeight * MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT ) / 16;
+        m_state_mem[k].anyRemRegBinsLt4 = true; // for the first coeff use the scalar impl., because it checks against the init state, which
+                                                // prohibits some paths
+      }
+
+      //===== populate trellis =====
+      for( int scanIdx = firstTestPos; scanIdx >= 0; scanIdx-- ) // from the first tested position down to scan index 0
+      {
+        const ScanInfo& scanInfo = tuPars.m_scanInfo[ scanIdx ];
+        if( enableScalingLists )
+        {
+          m_quant.initQuantBlock( tu, compID, cQP, lambda, quantCoeff[scanInfo.rasterPos] ); // quantizer set up again with this position's quantCoeff entry
+          xDecideAndUpdate( abs( tCoeff[scanInfo.rasterPos] ), scanInfo, zeroOut && ( scanInfo.posX >= effWidth || scanInfo.posY >= effHeight ), quantCoeff[scanInfo.rasterPos] );
+        }
+        else
+          xDecideAndUpdate( abs( tCoeff[scanInfo.rasterPos] ), scanInfo, zeroOut && ( scanInfo.posX >= effWidth || scanInfo.posY >= effHeight ), defaultQuantisationCoefficient );
+      }
+
+      //===== find best path =====
+      int       prevId      = -1;
+      int64_t   minPathCost =  0; // a state is only selected if its cost is below 0; otherwise prevId stays -1 and no level is written
+      for( int8_t stateId = 0; stateId < 4; stateId++ )
+      {
+        int64_t pathCost = m_trellis[0][0].rdCost[stateId];
+        if( pathCost < minPathCost )
+        {
+          prevId      = stateId;
+          minPathCost = pathCost;
+        }
+      }
+
+      //===== backward scanning =====
+      int scanIdx = 0;
+      for( ; prevId >= 0; scanIdx++ ) // follow the prevId links upwards from scan index 0 until a negative id ends the chain
+      {
+        TCoeffSig absLevel = m_trellis[scanIdx][prevId >> 2].absLevel[prevId & 3]; // prevId >> 2 picks one of the two Decisions entries, prevId & 3 the state within it
+        int32_t blkpos     = tuPars.m_scanId2BlkPos[scanIdx].idx;
+        qCoeff[ blkpos ]   = TCoeffSig( tCoeff[blkpos] < 0 ? -absLevel : absLevel ); // the level takes the sign of the input coefficient
+        absSum            += absLevel;
+        prevId             = m_trellis[scanIdx][prevId >> 2].prevId[prevId & 3];
+      }
+
+      tu.lastPos[compID] = scanIdx - 1; // scanIdx is one past the last visited position (yields -1 when no path was selected)
+    }
+
+  private:
+
+    void xDecideAndUpdate( const TCoeff absCoeff, const ScanInfo &scanInfo, bool zeroOut, int quantCoeff )
+    { // Makes the decisions for one scan position (xDecide) and then advances the per-state bookkeeping
+      Decisions *decisions = &m_trellis[scanInfo.scanIdx][0]; // first of the two Decisions entries stored for this position
+
+      std::swap( m_prevStates, m_currStates ); // states written at the previous position become the predecessors of this one
+      std::swap( m_prevStateI, m_currStateI ); // keep the StateMem index in step with the State pointer
+
+      xDecide( scanInfo, absCoeff, lastOffset(scanInfo.scanIdx), *decisions, zeroOut, quantCoeff );
+
+      if( scanInfo.scanIdx ) // scan index 0 is the last position processed by quant(), so no state update follows it
+      {
+        if( scanInfo.insidePos == 0 ) // NOTE(review): presumably the last-processed position inside a subblock - confirm against ScanInfo
+        {
+          m_commonCtx.swap();
+          State<vext>::updateStatesEOS( scanInfo, *decisions, m_state_mem[m_prevStateI], m_state_mem[m_skipStateI], m_state_mem[m_currStateI], m_commonCtx );
+          ::memcpy( decisions + 1, decisions, sizeof( Decisions ) ); // duplicate the decisions into the second entry of this position
+        }
+        else if( !zeroOut ) // zeroed-out positions inside a subblock leave the states untouched
+        {
+          State<vext>::updateStates( scanInfo, *decisions, m_state_mem[m_prevStateI], m_state_mem[m_currStateI] );
+        }
+
+        if( scanInfo.spt == SCAN_SOCSBB )
+        { // exchange the previous and the skip state sets (pointers and StateMem indices together)
+          std::swap( m_prevStates, m_skipStates );
+          std::swap( m_prevStateI, m_skipStateI );
+        }
+      }
+    }
+
+    void xDecide( const ScanInfo &scanInfo, const TCoeff absCoeff, const int lastOffset, Decisions &decisions, bool zeroOut, int quantCoeff )
+    { // Fills 'decisions' for one scan position by evaluating candidate levels against the previous states and, at SCAN_EOCSBB, the skip states
+      ::memcpy( &decisions, startDec, sizeof( Decisions ) ); // begin from the default decisions
+
+      StateMem& prev = m_state_mem[m_prevStateI];
+      StateMem& skip = m_state_mem[m_skipStateI];
+
+      if( zeroOut )
+      { // zeroed-out position: only the skip check at SCAN_EOCSBB is run, no level candidates are evaluated
+        if( scanInfo.spt==SCAN_EOCSBB )
+        {
+          m_skipStates[0].checkRdCostSkipSbbZeroOut( decisions, 0, skip );
+          m_skipStates[1].checkRdCostSkipSbbZeroOut( decisions, 1, skip );
+          m_skipStates[2].checkRdCostSkipSbbZeroOut( decisions, 2, skip );
+          m_skipStates[3].checkRdCostSkipSbbZeroOut( decisions, 3, skip );
+        }
+        return;
+      }
+
+      PQData  pqData[4];
+      //bool near0 = m_quant.preQuantCoeff( absCoeff, pqData, quantCoeff );
+
+      /// start inline prequant
+      int64_t scaledOrg = int64_t( absCoeff ) * quantCoeff; // product computed in 64 bit
+      TCoeff  qIdx      = TCoeff( ( scaledOrg + m_quant.m_QAdd ) >> m_quant.m_QShift );
+
+      if( qIdx < 0 )
+      { // negative index: only pqData[1] and pqData[2] are set up, both with absLevel 1
+        int64_t scaledAdd = m_quant.m_DistStepAdd - scaledOrg * m_quant.m_DistOrgFact;
+        PQData& pq_a      = pqData[1];
+        PQData& pq_b      = pqData[2];
+
+        pq_a.deltaDist    = ( ( scaledAdd + 0 * m_quant.m_DistStepAdd ) * 1 + m_quant.m_DistAdd ) >> m_quant.m_DistShift;
+        pq_a.absLevel     = 1;
+
+        pq_b.deltaDist    = ( ( scaledAdd + 1 * m_quant.m_DistStepAdd ) * 2 + m_quant.m_DistAdd ) >> m_quant.m_DistShift;
+        pq_b.absLevel     = 1;
+        /// stop inline prequant
+
+        if( prev.anyRemRegBinsLt4 )
+        { // state-by-state evaluation
+          m_prevStates[0].setRiceParam( scanInfo, prev, false );
+          m_prevStates[0].checkRdCostsOdd1( scanInfo.spt, pqData[2], decisions, 2, 0, prev );
+
+          m_prevStates[1].setRiceParam( scanInfo, prev, false );
+          m_prevStates[1].checkRdCostsOdd1( scanInfo.spt, pqData[2], decisions, 0, 2, prev );
+
+          m_prevStates[2].setRiceParam( scanInfo, prev, false );
+          m_prevStates[2].checkRdCostsOdd1( scanInfo.spt, pqData[1], decisions, 3, 1, prev );
+
+          m_prevStates[3].setRiceParam( scanInfo, prev, false ); 
+          m_prevStates[3].checkRdCostsOdd1( scanInfo.spt, pqData[1], decisions, 1, 3, prev );
+        }
+        else
+        {
+          // has to be called as a first check, assumes no decision has been made yet
+          State<vext>::checkAllRdCostsOdd1( scanInfo.spt, m_prevStates, pqData, decisions, prev );
+        }
+
+        m_prevStates->checkRdCostStart( lastOffset, pqData[2], decisions, 2 ); // NOTE(review): presumably evaluates starting the path at this position - confirm in State::checkRdCostStart
+      }
+      else
+      {
+        /// start inline prequant
+        qIdx              = std::max<TCoeff>( 1, std::min<TCoeff>( m_quant.m_maxQIdx, qIdx ) ); // clamp the index to [1, m_maxQIdx]
+        int64_t scaledAdd = qIdx * m_quant.m_DistStepAdd - scaledOrg * m_quant.m_DistOrgFact;
+
+        PQData& pq_a      = pqData[( qIdx + 0 ) & 3]; // the four consecutive indices qIdx..qIdx+3 are each stored at slot (index & 3)
+        PQData& pq_b      = pqData[( qIdx + 1 ) & 3];
+        PQData& pq_c      = pqData[( qIdx + 2 ) & 3];
+        PQData& pq_d      = pqData[( qIdx + 3 ) & 3];
+
+        pq_a.deltaDist    = ( ( scaledAdd + 0 * m_quant.m_DistStepAdd ) * ( qIdx + 0 ) + m_quant.m_DistAdd ) >> m_quant.m_DistShift;
+        pq_a.absLevel     = ( qIdx + 1 ) >> 1;
+
+        pq_b.deltaDist    = ( ( scaledAdd + 1 * m_quant.m_DistStepAdd ) * ( qIdx + 1 ) + m_quant.m_DistAdd ) >> m_quant.m_DistShift;
+        pq_b.absLevel     = ( qIdx + 2 ) >> 1;
+
+        pq_c.deltaDist    = ( ( scaledAdd + 2 * m_quant.m_DistStepAdd ) * ( qIdx + 2 ) + m_quant.m_DistAdd ) >> m_quant.m_DistShift;
+        pq_c.absLevel     = ( qIdx + 3 ) >> 1;
+
+        pq_d.deltaDist    = ( ( scaledAdd + 3 * m_quant.m_DistStepAdd ) * ( qIdx + 3 ) + m_quant.m_DistAdd ) >> m_quant.m_DistShift;
+        pq_d.absLevel     = ( qIdx + 4 ) >> 1;
+        /// stop inline prequant
+
+        bool cff02ge4 = pqData[0].absLevel >= 4/* || pqData[2].absLevel >= 4 */;
+        bool cff13ge4 = /* pqData[1].absLevel >= 4 || */ pqData[3].absLevel >= 4;
+
+        if( cff02ge4 || cff13ge4 || prev.anyRemRegBinsLt4 )
+        { // state-by-state evaluation when a tested candidate level reaches 4 or prev.anyRemRegBinsLt4 is set
+          if( prev.anyRemRegBinsLt4 || cff02ge4 )
+          {
+            m_prevStates[0].setRiceParam( scanInfo, prev, cff02ge4 );
+            m_prevStates[1].setRiceParam( scanInfo, prev, cff02ge4 );
+          }
+
+          if( prev.anyRemRegBinsLt4 || cff13ge4 )
+          {
+            m_prevStates[2].setRiceParam( scanInfo, prev, cff13ge4 );
+            m_prevStates[3].setRiceParam( scanInfo, prev, cff13ge4 ); 
+          }
+
+          m_prevStates[0].checkRdCosts( scanInfo.spt, pqData[0], pqData[2], decisions, 0, 2, prev );
+          m_prevStates[1].checkRdCosts( scanInfo.spt, pqData[0], pqData[2], decisions, 2, 0, prev );
+          m_prevStates[2].checkRdCosts( scanInfo.spt, pqData[3], pqData[1], decisions, 1, 3, prev );
+          m_prevStates[3].checkRdCosts( scanInfo.spt, pqData[3], pqData[1], decisions, 3, 1, prev );
+        }
+        else
+        {
+          // has to be called as a first check, assumes no decision has been made yet
+          State<vext>::checkAllRdCosts( scanInfo.spt, m_prevStates, pqData, decisions, prev );
+        }
+
+        m_prevStates->checkRdCostStart( lastOffset, pqData[0], decisions, 0 );
+        m_prevStates->checkRdCostStart( lastOffset, pqData[2], decisions, 2 );
+      }
+
+      if( scanInfo.spt==SCAN_EOCSBB )
+      { // at SCAN_EOCSBB positions the four skip states are evaluated as well
+        m_skipStates[0].checkRdCostSkipSbb( decisions, 0, skip );
+        m_skipStates[1].checkRdCostSkipSbb( decisions, 1, skip );
+        m_skipStates[2].checkRdCostSkipSbb( decisions, 2, skip );
+        m_skipStates[3].checkRdCostSkipSbb( decisions, 3, skip );
+      }
+    }
+
+  private:
+    CommonCtx<vext> m_commonCtx;
+    State<vext>     m_allStates[ 12 ]; // states k share m_state_mem[k>>2], i.e. three groups of four (see the init loop in quant)
+    State<vext>*    m_currStates;      // the three State pointers are rotated with std::swap in xDecideAndUpdate
+    State<vext>*    m_prevStates;
+    State<vext>*    m_skipStates;
+    Quantizer       m_quant;
+    Decisions       m_trellis[MAX_TB_SIZEY * MAX_TB_SIZEY][2]; // two Decisions entries per scan position
+    Rom             m_scansRom;
+
+    StateMem        m_state_mem[3];
+
+    int m_currStateI = 0; // indices into m_state_mem; swapped together with the matching State pointers
+    int m_prevStateI = 1;
+    int m_skipStateI = 2;
+  };
+}; // namespace DQIntern
+
+template<X86_VEXT vext>
+void DepQuant::_initDepQuantX86()
+{ // NOTE(review): p is overwritten without releasing any previous object - confirm the owner frees or never pre-allocates it
+  p = new DQIntern::DepQuantSimd<vext>();
+}
+template void DepQuant::_initDepQuantX86<SIMDX86>(); // explicit instantiation for the SIMDX86 level
+
+} // namespace vvenc
+
+//! \}
+
+;
\ No newline at end of file
diff --git a/source/Lib/CommonLib/x86/InitX86.cpp b/source/Lib/CommonLib/x86/InitX86.cpp
index e0c598598..f3eb6ec5c 100644
--- a/source/Lib/CommonLib/x86/InitX86.cpp
+++ b/source/Lib/CommonLib/x86/InitX86.cpp
@@ -374,6 +374,25 @@ void Quant::initQuantX86()
   }
 }
 
+void DepQuant::initDepQuantX86()
+{ // picks the _initDepQuantX86 instantiation that matches the value returned by read_x86_extension_flags()
+  auto vext = read_x86_extension_flags();
+  switch (vext){
+  case AVX512: // no separate AVX512 variant: falls through to the AVX2 one
+  case AVX2:
+    _initDepQuantX86<AVX2>();
+    break;
+  case AVX: // AVX falls through to the SSE42 variant
+  case SSE42:
+    _initDepQuantX86<SSE42>();
+    break;
+  case SSE41:
+    _initDepQuantX86<SSE41>();
+    break;
+  default: // any other level: nothing is installed here
+    break;
+  }
+}
 
 #endif
 
diff --git a/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h b/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h
index ab174d349..d3bbb0678 100644
--- a/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h
+++ b/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h
@@ -105,7 +105,7 @@ void offsetBlock_SIMD( const int     channelBitDepth,
 
 #ifdef USE_AVX2
       // AVX2
-      if ((width>8) && (vext >= AVX2))
+      if( ( width & 15 ) == 0 && vext >= AVX2 )
       {
         __m256i vsrca,vsrcal,vsrcar;
         __m256i vbaseoffset = _mm256_set1_epi16(2) ;
@@ -224,7 +224,7 @@ void offsetBlock_SIMD( const int     channelBitDepth,
     }
 #ifdef USE_AVX2
     // AVX2
-    if ((width>8) && (vext >= AVX2))
+    if( ( width & 15 ) == 0 && ( vext >= AVX2 ) )
     {
       __m256i vsrca,vsrcat,vsrcab;
 
@@ -329,7 +329,7 @@ void offsetBlock_SIMD( const int     channelBitDepth,
       }
 #ifdef USE_AVX2
       // AVX2
-      if ((width>8) && (vext >= AVX2))
+      if( ( width & 15 ) == 0 && vext >= AVX2 )
       {
         __m256i vsrca,vsrcat,vsrcab;
 
@@ -504,7 +504,7 @@ void offsetBlock_SIMD( const int     channelBitDepth,
       }
 #ifdef USE_AVX2
       // AVX2
-      if ((width>8) && (vext >= AVX2))
+      if( ( width & 15 ) == 0 && vext >= AVX2 )
       {
         __m256i vsrca,vsrcat,vsrcab;
         __m256i vbaseoffset = _mm256_set1_epi16(2) ;
@@ -644,7 +644,7 @@ void offsetBlock_SIMD( const int     channelBitDepth,
     }
 #ifdef USE_AVX2
     // AVX2
-    if ((width>8) && (vext >= AVX2))
+    if( ( width & 15 ) == 0 && vext >= AVX2 )
     {
       __m256i vsrc;
       __m256i vbaseoffset = _mm256_set1_epi16(startIdx) ;
diff --git a/source/Lib/CommonLib/x86/avx/AdaptiveLoopFilter_avx.cpp b/source/Lib/CommonLib/x86/avx/AdaptiveLoopFilter_avx.cpp
deleted file mode 100644
index 1a1bf7d77..000000000
--- a/source/Lib/CommonLib/x86/avx/AdaptiveLoopFilter_avx.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../AdaptiveLoopFilterX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/AffineGradientSearch_avx.cpp b/source/Lib/CommonLib/x86/avx/AffineGradientSearch_avx.cpp
deleted file mode 100644
index 6932a17a3..000000000
--- a/source/Lib/CommonLib/x86/avx/AffineGradientSearch_avx.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../AffineGradientSearchX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/InterPred_avx.cpp b/source/Lib/CommonLib/x86/avx/InterPred_avx.cpp
deleted file mode 100644
index efc1edbfc..000000000
--- a/source/Lib/CommonLib/x86/avx/InterPred_avx.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../InterPredX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/InterpolationFilter_avx.cpp b/source/Lib/CommonLib/x86/avx/InterpolationFilter_avx.cpp
deleted file mode 100644
index b8430ec4a..000000000
--- a/source/Lib/CommonLib/x86/avx/InterpolationFilter_avx.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../InterpolationFilterX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/IntraPred_avx.cpp b/source/Lib/CommonLib/x86/avx/IntraPred_avx.cpp
deleted file mode 100644
index de7ea6063..000000000
--- a/source/Lib/CommonLib/x86/avx/IntraPred_avx.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../IntraPredX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/LoopFilter_avx.cpp b/source/Lib/CommonLib/x86/avx/LoopFilter_avx.cpp
deleted file mode 100644
index d4218924b..000000000
--- a/source/Lib/CommonLib/x86/avx/LoopFilter_avx.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../LoopFilterX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/RdCost_avx.cpp b/source/Lib/CommonLib/x86/avx/RdCost_avx.cpp
deleted file mode 100644
index 53ca22885..000000000
--- a/source/Lib/CommonLib/x86/avx/RdCost_avx.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../RdCostX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/SampleAdaptiveOffset_avx.cpp b/source/Lib/CommonLib/x86/avx/SampleAdaptiveOffset_avx.cpp
deleted file mode 100644
index eafe24887..000000000
--- a/source/Lib/CommonLib/x86/avx/SampleAdaptiveOffset_avx.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../SampleAdaptiveOffsetX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/Trafo_avx.cpp b/source/Lib/CommonLib/x86/avx/Trafo_avx.cpp
deleted file mode 100644
index 67fa5dac8..000000000
--- a/source/Lib/CommonLib/x86/avx/Trafo_avx.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../TrafoX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/MCTF_avx.cpp b/source/Lib/CommonLib/x86/avx2/DepQuant_avx2.cpp
similarity index 98%
rename from source/Lib/CommonLib/x86/avx/MCTF_avx.cpp
rename to source/Lib/CommonLib/x86/avx2/DepQuant_avx2.cpp
index 83a8f4442..5a2498cb8 100644
--- a/source/Lib/CommonLib/x86/avx/MCTF_avx.cpp
+++ b/source/Lib/CommonLib/x86/avx2/DepQuant_avx2.cpp
@@ -40,4 +40,4 @@ POSSIBILITY OF SUCH DAMAGE.
 
 ------------------------------------------------------------------------------------------- */
 
-#include "../MCTFX86.h"
+#include "../DepQuantX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/Quant_avx.cpp b/source/Lib/CommonLib/x86/sse41/DepQuant_sse41.cpp
similarity index 98%
rename from source/Lib/CommonLib/x86/avx/Quant_avx.cpp
rename to source/Lib/CommonLib/x86/sse41/DepQuant_sse41.cpp
index b3d681789..5a2498cb8 100644
--- a/source/Lib/CommonLib/x86/avx/Quant_avx.cpp
+++ b/source/Lib/CommonLib/x86/sse41/DepQuant_sse41.cpp
@@ -40,4 +40,4 @@ POSSIBILITY OF SUCH DAMAGE.
 
 ------------------------------------------------------------------------------------------- */
 
-#include "../QuantX86.h"
+#include "../DepQuantX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/Buffer_sse42.cpp b/source/Lib/CommonLib/x86/sse42/Buffer_sse42.cpp
deleted file mode 100644
index 07563b572..000000000
--- a/source/Lib/CommonLib/x86/sse42/Buffer_sse42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../BufferX86.h"
diff --git a/source/Lib/CommonLib/x86/avx/Buffer_avx.cpp b/source/Lib/CommonLib/x86/sse42/DepQuant_sse42.cpp
similarity index 98%
rename from source/Lib/CommonLib/x86/avx/Buffer_avx.cpp
rename to source/Lib/CommonLib/x86/sse42/DepQuant_sse42.cpp
index 07563b572..5a2498cb8 100644
--- a/source/Lib/CommonLib/x86/avx/Buffer_avx.cpp
+++ b/source/Lib/CommonLib/x86/sse42/DepQuant_sse42.cpp
@@ -40,4 +40,4 @@ POSSIBILITY OF SUCH DAMAGE.
 
 ------------------------------------------------------------------------------------------- */
 
-#include "../BufferX86.h"
+#include "../DepQuantX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/InterPred_sse42.cpp b/source/Lib/CommonLib/x86/sse42/InterPred_sse42.cpp
deleted file mode 100644
index efc1edbfc..000000000
--- a/source/Lib/CommonLib/x86/sse42/InterPred_sse42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../InterPredX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/InterpolationFilter_sse42.cpp b/source/Lib/CommonLib/x86/sse42/InterpolationFilter_sse42.cpp
deleted file mode 100644
index b8430ec4a..000000000
--- a/source/Lib/CommonLib/x86/sse42/InterpolationFilter_sse42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../InterpolationFilterX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/IntraPred_sse42.cpp b/source/Lib/CommonLib/x86/sse42/IntraPred_sse42.cpp
deleted file mode 100644
index de7ea6063..000000000
--- a/source/Lib/CommonLib/x86/sse42/IntraPred_sse42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../IntraPredX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/LoopFilter_sse42.cpp b/source/Lib/CommonLib/x86/sse42/LoopFilter_sse42.cpp
deleted file mode 100644
index d4218924b..000000000
--- a/source/Lib/CommonLib/x86/sse42/LoopFilter_sse42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../LoopFilterX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/MCTF_avx42.cpp b/source/Lib/CommonLib/x86/sse42/MCTF_avx42.cpp
deleted file mode 100644
index 83a8f4442..000000000
--- a/source/Lib/CommonLib/x86/sse42/MCTF_avx42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../MCTFX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/Quant_sse42.cpp b/source/Lib/CommonLib/x86/sse42/Quant_sse42.cpp
deleted file mode 100644
index b3d681789..000000000
--- a/source/Lib/CommonLib/x86/sse42/Quant_sse42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../QuantX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/RdCost_sse42.cpp b/source/Lib/CommonLib/x86/sse42/RdCost_sse42.cpp
deleted file mode 100644
index 53ca22885..000000000
--- a/source/Lib/CommonLib/x86/sse42/RdCost_sse42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../RdCostX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/SampleAdaptiveOffset_sse42.cpp b/source/Lib/CommonLib/x86/sse42/SampleAdaptiveOffset_sse42.cpp
deleted file mode 100644
index eafe24887..000000000
--- a/source/Lib/CommonLib/x86/sse42/SampleAdaptiveOffset_sse42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../SampleAdaptiveOffsetX86.h"
diff --git a/source/Lib/CommonLib/x86/sse42/Trafo_sse42.cpp b/source/Lib/CommonLib/x86/sse42/Trafo_sse42.cpp
deleted file mode 100644
index 67fa5dac8..000000000
--- a/source/Lib/CommonLib/x86/sse42/Trafo_sse42.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -----------------------------------------------------------------------------
-The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
-the Software are granted under this license.
-
-The Clear BSD License
-
-Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted (subject to the limitations in the disclaimer below) provided that
-the following conditions are met:
-
-     * Redistributions of source code must retain the above copyright notice,
-     this list of conditions and the following disclaimer.
-
-     * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-     * Neither the name of the copyright holder nor the names of its
-     contributors may be used to endorse or promote products derived from this
-     software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-------------------------------------------------------------------------------------------- */
-
-#include "../TrafoX86.h"
diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp
index fd2b6b4bf..8e1e10425 100644
--- a/source/Lib/EncoderLib/EncCu.cpp
+++ b/source/Lib/EncoderLib/EncCu.cpp
@@ -295,7 +295,6 @@ EncCu::~EncCu()
 
 void EncCu::encodeCtu( Picture* pic, int (&prevQP)[MAX_NUM_CH], uint32_t ctuXPosInCtus, uint32_t ctuYPosInCtus )
 {
-  PROFILER_SCOPE_AND_STAGE_EXT( 1, _TPROF, P_COMPRESS_CU, pic->cs, CH_L );
   CodingStructure&     cs          = *pic->cs;
   Slice*               slice       = cs.slice;
   const PreCalcValues& pcv         = *cs.pcv;
@@ -390,6 +389,7 @@ void EncCu::xCompressCtu( CodingStructure& cs, const UnitArea& area, const unsig
   cs.initSubStructure( *tempCS, partitioner->chType, partitioner->currArea(), false, orgBuffer, rspBuffer );
   cs.initSubStructure( *bestCS, partitioner->chType, partitioner->currArea(), false, orgBuffer, rspBuffer );
   m_CABACEstimator->determineNeighborCus( *tempCS, partitioner->currArea(), partitioner->chType, partitioner->treeType );
+  PROFILER_SCOPE_AND_STAGE_EXT( 1, _TPROF, P_COMPRESS_CU, tempCS, CH_L );
 
   // copy the relevant area
   UnitArea clippedArea = clipArea( partitioner->currArea(), cs.area );
@@ -2252,7 +2252,7 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC
 
 void EncCu::xCheckRDCostMergeGeo(CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &pm, const EncTestMode &encTestMode)
 {
-  PROFILER_SCOPE_AND_STAGE_EXT( 1, _TPROF, P_INTER_GPM, tempCS, partitioner.chType );
+  PROFILER_SCOPE_AND_STAGE_EXT( 1, _TPROF, P_INTER_GPM, tempCS, pm.chType );
 
   const Slice &slice = *tempCS->slice;
   if ((m_pcEncCfg->m_Geo > 1) && (slice.TLayer <= 1))
diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp
index 73af5cee0..78b498021 100644
--- a/source/Lib/EncoderLib/EncGOP.cpp
+++ b/source/Lib/EncoderLib/EncGOP.cpp
@@ -211,6 +211,7 @@ void EncGOP::init( const VVEncCfg& encCfg, const GOPCfg* gopCfg, RateCtrl& rateC
   {
     m_ticksPerFrameMul4 = (int)((int64_t)4 *(int64_t)m_pcEncCfg->m_TicksPerSecond * (int64_t)m_pcEncCfg->m_FrameScale/(int64_t)m_pcEncCfg->m_FrameRate);
   }
+  m_forceSCC = false;
 }
 
 
@@ -1376,8 +1377,14 @@ void EncGOP::xInitPicsInCodingOrder( const PicList& picList )
 
     CHECK( m_lastCodingNum == -1 && ! pic->gopEntry->m_isStartOfIntra, "encoding should start with an I-Slice" );
 
+    xForceScc( *pic );
+
     // initialize slice header
     pic->encTime.startTimer();
+    if( pic->gopEntry->m_isStartOfGop )
+    {
+      xInitGopQpCascade( *pic, picList );
+    }
     xInitFirstSlice( *pic, picList, false );
     pic->encTime.stopTimer();
 
@@ -1444,6 +1451,115 @@ void EncGOP::xGetProcessingLists( std::list<Picture*>& procList, std::list<Pictu
   CHECK( ! rcUpdateList.empty() && m_gopEncListOutput.empty(), "first picture in RC update and in output list have to be the same" );
 }
 
+void EncGOP::xInitGopQpCascade( Picture& keyPic, const PicList& picList ) // derive one QP offset for keyPic's GOP and store it as gopAdaptedQP in keyPic and in every picture of picList with the same GOP number
+{
+  uint32_t gopMotEstCount = 0, gopMotEstError = 0;
+  uint32_t gopSpVisCount  = 0, gopSpVisActLum = 0, gopSpVisActChr = 0;
+  const double resRatio4K = double (m_pcEncCfg->m_SourceWidth * m_pcEncCfg->m_SourceHeight) / (3840.0 * 2160.0); // picture area relative to 3840x2160
+  const bool isHighRes    = (std::min (m_pcEncCfg->m_SourceWidth, m_pcEncCfg->m_SourceHeight) > 1280); // shorter picture side above 1280 samples
+  const int poc0Offset    = (m_pcEncCfg->m_poc0idr ? -1 : 0); // place leading poc 0 idr in GOP -1
+  const int gopNum = keyPic.gopEntry->m_gopNum + (keyPic.poc == 0 ? poc0Offset : 0);
+  int dQP = 0; // resulting GOP QP offset; stays 0 unless the estimated bit count exceeds the cap below
+  double qpStart = 24.0;
+  unsigned num = 0, sum = 0;
+  uint8_t gopMinNoiseLevels[QPA_MAX_NOISE_LEVELS];
+
+  // skip QP adaptation (gopAdaptedQP is left untouched) if no max bit-rate is set, or if two-pass RC, look-ahead or a target bit-rate is configured
+  if( m_pcEncCfg->m_RCMaxBitrate <= 0
+      || m_pcEncCfg->m_RCMaxBitrate == INT32_MAX
+      || m_pcEncCfg->m_RCNumPasses == 2
+      || m_pcEncCfg->m_LookAhead > 0
+      || m_pcEncCfg->m_RCTargetBitrate != 0 )
+  {
+    return;
+  }
+
+  std::fill_n (gopMinNoiseLevels, QPA_MAX_NOISE_LEVELS, 255u); // 255 marks a range for which no level has been recorded (see the < 255 test below)
+  // pass 1: gather motion-error and noise statistics of this GOP, plus the activities of the preceding GOP's start-of-GOP picture
+  for (auto pic : picList)
+  {
+    const int picGopNum = pic->gopEntry->m_gopNum + (pic->poc == 0 ? poc0Offset : 0);
+
+    if (picGopNum == gopNum && pic->m_picShared->m_picMotEstError > 0)
+    {
+      CHECK( pic->isInitDone, "try to modify GOP qp of picture, which has already been initialized" );
+      // summarize motion errors of all MCTF filtered pictures in GOP
+      gopMotEstCount++;
+      gopMotEstError += pic->m_picShared->m_picMotEstError;
+      // go through ranges, search per-range minimum in GOP
+      for (int i = 0; i < QPA_MAX_NOISE_LEVELS; i++)
+      {
+        gopMinNoiseLevels[i] = std::min<uint8_t> (gopMinNoiseLevels[i], pic->m_picShared->m_minNoiseLevels[i]);
+      }
+    }
+    else if (picGopNum + 1 == gopNum && pic->gopEntry->m_isStartOfGop /*&& !keyPic.gopEntry->m_isStartOfIntra*/) // disabled for start of Intra segments, for segment parallel encoding
+    {
+      // store activities of previous start-of-GOP picture
+      gopSpVisCount  = 1;
+      gopSpVisActLum = pic->m_picShared->m_picSpVisAct[CH_L];
+      gopSpVisActChr = pic->m_picShared->m_picSpVisAct[CH_C];
+    }
+  }
+
+  gopSpVisCount++;  // add current TL-0 spatial activities
+  gopSpVisActLum += keyPic.m_picShared->m_picSpVisAct[CH_L];
+  gopSpVisActChr += keyPic.m_picShared->m_picSpVisAct[CH_C];
+  // rounded averages; the motion error uses a divisor of 1 when no picture contributed, gopSpVisCount is at least 1 here
+  gopMotEstError = (gopMotEstError + (gopMotEstCount >> 1)) / std::max (1u, gopMotEstCount);
+  gopSpVisActLum = (gopSpVisActLum + (gopSpVisCount  >> 1)) / gopSpVisCount;
+  gopSpVisActChr = (gopSpVisActChr + (gopSpVisCount  >> 1)) / gopSpVisCount;
+
+  for (int i = 0; i < QPA_MAX_NOISE_LEVELS; i++) // go through ranges again, find overall min-average in GOP
+  {
+    if (gopMinNoiseLevels[i] < 255)
+    {
+      num++;
+      sum += gopMinNoiseLevels[i];
+    }
+  }
+  // adjust the start QP from the mean of the recorded per-range noise minima, if any
+  if (num > 0 && sum > 0)
+  {
+    qpStart += 0.5 * (6.0 * log ((double) sum / (double) num) / log (2.0) - 1.0 - 24.0); // see RateCtrl.cpp
+  }
+  qpStart += log (resRatio4K) / log (2.0); // ICIP23 paper
+
+  // TODO hlm, henkel: adapt GOP's QP offset (capped CQF, adaptive QP cascade)
+  const int bDepth = m_pcEncCfg->m_internalBitDepth[CH_L];
+  const int intraP = Clip3 (m_pcEncCfg->m_GOPSize, 4 * VVENC_MAX_GOP, m_pcEncCfg->m_IntraPeriod);
+  const int visAct = std::max (uint16_t (gopSpVisActLum >> (12 - bDepth)), keyPic.picVisActY); // when vaY=0; NOTE(review): shift count is negative for bDepth > 12 - confirm the bit depth range
+  const double apa = sqrt ((m_pcEncCfg->m_usePerceptQPATempFiltISlice || !keyPic.gopEntry->m_isStartOfIntra ? 32.0 : 16.0) * double (1 << (2 * bDepth - 10)) / sqrt (resRatio4K));
+  const int auxOff = (m_pcEncCfg->m_blockImportanceMapping && !keyPic.m_picShared->m_ctuBimQpOffset.empty() ? keyPic.m_picShared->m_picAuxQpOffset : 0);
+  const int iFrmQP = m_pcEncCfg->m_QP + (keyPic.gopEntry->m_isStartOfIntra ? m_pcEncCfg->m_intraQPOffset : 0) + auxOff + int (floor (3.0 * log (visAct / apa) / log (2.0) + 0.5)); // NOTE(review): log argument is 0 if visAct is 0 - confirm this cannot occur
+  const int qp32BC = int (16384.0 + 7.21875 * pow ((double) gopSpVisActLum, 4.0/3.0) + 1.46875 * pow ((double) gopSpVisActChr, 4.0/3.0)) * (isHighRes ? 96 : 24); // TODO hlm
+  const int iFrmBC = int (0.5 + qp32BC * pow (2.0, (33 - iFrmQP) / 5.0) * sqrt (resRatio4K)); // * HD tuning
+  const int  shift = (gopMotEstError < 32 ? 5 - (gopMotEstError >> 4) : 3); // 5 or 4 for averaged errors below 32, otherwise 3
+  if (keyPic.m_picShared->m_picMotEstError >= 256) gopMotEstError >>= 2; else // avoid too much capping at cuts
+  if (gopMotEstError >= 120) /*TODO tune this*/ gopMotEstError >>= 1;
+  const int bFrmBC = int ((4.0 * iFrmBC * intraP) / sqrt((double)gopSpVisActLum) * std::max (int (gopMotEstError * gopMotEstError) >> (bDepth / 2), (keyPic.picVisActTL0 - visAct) >> shift) * pow(2.0, -1.0 * bDepth)); // NOTE(review): divides by sqrt(gopSpVisActLum), which is 0 when the averaged luma activity is 0 - confirm
+
+  const double fac = double (m_pcEncCfg->m_FrameScale * intraP) / m_pcEncCfg->m_FrameRate; // presumably the duration of one intra period in seconds - confirm units of FrameScale/FrameRate
+  const double mBC = (m_pcEncCfg->m_RCMaxBitrate > 0 && m_pcEncCfg->m_RCMaxBitrate != INT32_MAX ? m_pcEncCfg->m_RCMaxBitrate * fac : 0.0); // the ternary condition always holds here, given the early return above
+
+  if (mBC > 0.0 && iFrmBC + bFrmBC > mBC)  // max. I-period bit-count exceeded
+  {
+    const double d = std::max (0, iFrmQP) - (105.0 / 128.0) * sqrt ((double) std::max (1, iFrmQP)) * log (mBC / double (iFrmBC + bFrmBC)) / log (2.0);
+
+    dQP = Clip3 (0, MAX_QP, int (0.5 + d + 0.5 * std::max (0.0, qpStart - d))) - std::max (0, iFrmQP); // offset relative to the non-negative I-frame QP
+  }
+
+  for (auto pic : picList) // store in all pictures of GOP
+  {
+    const int picGopNum = pic->gopEntry->m_gopNum + (pic->poc == 0 ? poc0Offset : 0);
+
+    if (picGopNum == gopNum)
+    {
+      pic->gopAdaptedQP = dQP;
+    }
+  }
+  keyPic.gopAdaptedQP = dQP; // TODO: add any additional key-frame offset here
+
 void EncGOP::xInitFirstSlice( Picture& pic, const PicList& picList, bool isEncodeLtRef )
 {
   memset( pic.cs->alfAps, 0, sizeof(pic.cs->alfAps));
@@ -2613,6 +2729,20 @@ void EncGOP::xPrintPictureInfo( const Picture& pic, AccessUnitList& accessUnit,
   }
 }
 
+void EncGOP::xForceScc( Picture& pic )
+{
+  // the force-SCC request is latched at every start-of-GOP picture and kept for the pictures handled afterwards
+  if( pic.gopEntry->m_isStartOfGop ) m_forceSCC = pic.m_picShared->m_forceSCC;
+  // nothing to do unless forcing is active and at least one of the two SCC flags is still unset
+  if( !m_forceSCC || ( pic.isSccStrong && pic.isSccWeak ) )
+  {
+    return;
+  }
+  pic.isSccStrong = true;
+  pic.isSccWeak   = true;
+  pic.setSccFlags( m_pcEncCfg );
+}
+
 } // namespace vvenc
 
 //! \}
diff --git a/source/Lib/EncoderLib/EncGOP.h b/source/Lib/EncoderLib/EncGOP.h
index 796b21bac..633182318 100644
--- a/source/Lib/EncoderLib/EncGOP.h
+++ b/source/Lib/EncoderLib/EncGOP.h
@@ -150,6 +150,7 @@ class EncGOP : public EncStage
   std::deque<PicApsGlobal*> m_globalApsList;
 
   std::vector<int>          m_globalCtuQpVector;
+  bool                      m_forceSCC;
 
 public:
   EncGOP( MsgLog& msglog );
@@ -190,6 +191,7 @@ class EncGOP : public EncStage
   void xSetupPicAps                   ( Picture* pic );
   void xInitPicsInCodingOrder         ( const PicList& picList );
   void xGetProcessingLists            ( std::list<Picture*>& procList, std::list<Picture*>& rcUpdateList, const bool lockStepMode );
+  void xInitGopQpCascade              ( Picture& keyPic, const PicList& picList );
   void xInitFirstSlice                ( Picture& pic, const PicList& picList, bool isEncodeLtRef );
   void xInitSliceTMVPFlag             ( PicHeader* picHeader, const Slice* slice );
   void xUpdateRPRtmvp                 ( PicHeader* picHeader, Slice* slice );
@@ -223,6 +225,8 @@ class EncGOP : public EncStage
     std::lock_guard<std::mutex> lock( m_gopEncMutex );
     return ( int ) m_freePicEncoderList.size() >= std::max(1, m_pcEncCfg->m_maxParallelFrames); 
   }
+  void xForceScc                      ( Picture& pic );
+
 };// END CLASS DEFINITION EncGOP
 
 } // namespace vvenc
diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp
index f189a84f3..c8280ee4c 100644
--- a/source/Lib/EncoderLib/EncLib.cpp
+++ b/source/Lib/EncoderLib/EncLib.cpp
@@ -131,10 +131,11 @@ void EncLib::initEncoderLib( const vvenc_config& encCfg )
 #endif
 
 #if ENABLE_TIME_PROFILING
-  if( g_timeProfiler == nullptr )
+  if( g_timeProfiler )
   {
-    g_timeProfiler = timeProfilerCreate( encCfg );
+    delete g_timeProfiler;
   }
+  g_timeProfiler = timeProfilerCreate( encCfg );
 #endif
 }
 
@@ -166,12 +167,17 @@ void EncLib::uninitEncoderLib()
 
 #if ENABLE_TIME_PROFILING
 #if ENABLE_TIME_PROFILING_MT_MODE
-  for( auto& p : m_threadPool->getProfilers() )
+  if( m_threadPool )
   {
-    *g_timeProfiler += *p;
+    for(auto& p : m_threadPool->getProfilers())
+    {
+      *g_timeProfiler += *p;
+    }
   }
 #endif
   timeProfilerResults( g_timeProfiler );
+  delete g_timeProfiler;
+  g_timeProfiler = nullptr;
 #endif
   xUninitLib();
 }
@@ -423,7 +429,7 @@ void EncLib::encodePicture( bool flush, const vvencYUVBuffer* yuvInBuf, AccessUn
       }
     }
 
-    PROFILER_EXT_UPDATE( g_timeProfiler, P_TOP_LEVEL, pic->TLayer );
+    PROFILER_EXT_UPDATE( g_timeProfiler, P_TOP_LEVEL, 0 );
 
     // trigger stages
     isQueueEmpty = m_picsRcvd > 0 || ( m_picsRcvd <= 0 && flush );
diff --git a/source/Lib/EncoderLib/EncPicture.cpp b/source/Lib/EncoderLib/EncPicture.cpp
index fa1fd7592..fc0600797 100644
--- a/source/Lib/EncoderLib/EncPicture.cpp
+++ b/source/Lib/EncoderLib/EncPicture.cpp
@@ -83,7 +83,7 @@ void EncPicture::init( const VVEncCfg& encCfg,
 
 void EncPicture::compressPicture( Picture& pic, EncGOP& gopEncoder )
 {
-  PROFILER_SCOPE_AND_STAGE_EXT( 1, g_timeProfiler, P_TOP_LEVEL, pic.cs, CH_L );
+  PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_TOP_LEVEL, pic.cs );
   ITT_TASKSTART( itt_domain_picEncoder, itt_handle_start );
 
   pic.encTime.startTimer();
@@ -122,7 +122,7 @@ void EncPicture::compressPicture( Picture& pic, EncGOP& gopEncoder )
 
 void EncPicture::finalizePicture( Picture& pic )
 {
-  PROFILER_SCOPE_AND_STAGE_EXT( 1, g_timeProfiler, P_TOP_LEVEL, pic.cs, CH_L );
+  PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_TOP_LEVEL, pic.cs );
   CodingStructure& cs = *(pic.cs);
   Slice* slice        = pic.slices[0];
   // ALF
diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp
index d8b4d074c..722c3d19d 100644
--- a/source/Lib/EncoderLib/EncSlice.cpp
+++ b/source/Lib/EncoderLib/EncSlice.cpp
@@ -390,6 +390,8 @@ void EncSlice::xInitSliceLambdaQP( Slice* slice )
   slice->chromaQpAdjEnabled = slice->pps->chromaQpOffsetListLen > 0;
 }
 
+static const int highTL[6] = { -1, 0, 0, 2, 4, 5 }; // additive QP term per temporal layer, read as highTL[slice->TLayer] for non-intra slices in xGetQPForPicture() when m_usePerceptQPA is set; NOTE(review): only 6 entries - confirm TLayer never exceeds 5
+
 int EncSlice::xGetQPForPicture( const Slice* slice )
 {
   const int lumaQpBDOffset = slice->sps->qpBDOffset[ CH_L ];
@@ -401,11 +403,13 @@ int EncSlice::xGetQPForPicture( const Slice* slice )
   }
   else
   {
-    const SliceType sliceType = slice->sliceType;
-
-    qp = m_pcEncCfg->m_QP;
+    qp = m_pcEncCfg->m_QP + slice->pic->gopAdaptedQP;
 
-    if( sliceType == VVENC_I_SLICE )
+    if (m_pcEncCfg->m_usePerceptQPA)
+    {
+      qp = (slice->isIntra() ? std::min (qp, ((qp - std::min (3, floorLog2 (m_pcEncCfg->m_GOPSize) - 4/*TODO 3 with JVET-AC0149?*/)) * 15 + 3) >> 4) : highTL[slice->TLayer] + ((qp * (16 + std::min (2u, slice->TLayer))) >> 4) + 0/*TODO +-1?*/);
+    }
+    else if( slice->isIntra() )
     {
       qp += m_pcEncCfg->m_intraQPOffset;
     }
@@ -735,7 +739,7 @@ void EncSlice::finishCompressSlice( Picture* pic, Slice& slice )
 
 void EncSlice::xProcessCtus( Picture* pic, const unsigned startCtuTsAddr, const unsigned boundingCtuTsAddr )
 {
-  PROFILER_SCOPE_AND_STAGE_EXT( 1, g_timeProfiler, P_IGNORE, pic->cs, CH_L );
+  PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_IGNORE, pic->cs );
   CodingStructure& cs      = *pic->cs;
   Slice&           slice   = *cs.slice;
   const PreCalcValues& pcv = *cs.pcv;
diff --git a/source/Lib/EncoderLib/EncStage.h b/source/Lib/EncoderLib/EncStage.h
index caf267088..a3f396a8c 100644
--- a/source/Lib/EncoderLib/EncStage.h
+++ b/source/Lib/EncoderLib/EncStage.h
@@ -64,6 +64,7 @@ class PicShared
   GOPEntry         m_gopEntry;
   bool             m_isSccWeak;
   bool             m_isSccStrong;
+  bool             m_forceSCC;
   uint16_t         m_picVisActTL0;
   uint16_t         m_picVisActY;
   uint16_t         m_picSpVisAct[MAX_NUM_CH];
@@ -88,6 +89,7 @@ class PicShared
   PicShared()
   : m_isSccWeak     ( false )
   , m_isSccStrong   ( false )
+  , m_forceSCC      ( false )
   , m_picVisActTL0  ( 0 )
   , m_picVisActY    ( 0 )
   , m_picMemorySTA  ( 0 )
@@ -137,6 +139,7 @@ class PicShared
 
     m_isSccWeak    = false;
     m_isSccStrong  = false;
+    m_forceSCC     = false;
     m_picVisActTL0 = 0;
     m_picVisActY   = 0;
     m_picMemorySTA = 0;
diff --git a/source/Lib/EncoderLib/PreProcess.cpp b/source/Lib/EncoderLib/PreProcess.cpp
index a8aca551c..e7b6a69a2 100644
--- a/source/Lib/EncoderLib/PreProcess.cpp
+++ b/source/Lib/EncoderLib/PreProcess.cpp
@@ -299,7 +299,7 @@ void PreProcess::xGetVisualActivity( Picture* pic, const PicList& picList ) cons
   uint16_t picVisActTL0 = 0;
   uint16_t picVisActY   = 0;
 
-  if( m_doVisAct && ! m_doVisActQpa ) // for the time being qpa activity done on ctu basis in applyQPAdaptationSlice(), which for now sums up luma activity
+  if( ( m_doVisAct && !m_doVisActQpa ) || ( cappedCRF && m_encCfg->m_usePerceptQPA && pic->gopEntry->m_temporalId == 0 ) ) // for the time being qpa activity done on ctu basis in applyQPAdaptationSlice(), which for now sums up luma activity
   {
     // find previous pictures
     const Picture* prevPics[ NUM_QPA_PREV_FRAMES ];
@@ -441,121 +441,153 @@ void PreProcess::xDisableTempDown( Picture* pic, const PicList& picList )
 
 void PreProcess::xDetectScc( Picture* pic ) const
 {
-  CPelUnitBuf yuvOrgBuf = pic->getOrigBuf();
-
-  bool isSccWeak   = false;
-  bool isSccStrong = false;
-
-  const int SIZE_BL = 4;
-  const int minLevel = 1 << (m_encCfg->m_internalBitDepth[CH_L] - (!m_encCfg->m_videoFullRangeFlag ? 4 : 6)); // 1/16th or 1/64th of range
-  const int K_SC = 25;
-  const Pel* piSrc = yuvOrgBuf.Y().buf;
-  const uint32_t uiStride = yuvOrgBuf.Y().stride;
-  const uint32_t uiWidth  = yuvOrgBuf.Y().width;
-  const uint32_t uiHeight = yuvOrgBuf.Y().height;
-  int size = SIZE_BL;
-  unsigned   hh, ww;
-  int SizeS = SIZE_BL << 1;
+  if( m_encCfg->m_forceScc > 0 )
+  {
+    pic->isSccStrong = pic->m_picShared->m_isSccStrong = m_encCfg->m_forceScc >= 3;
+    pic->isSccWeak   = pic->m_picShared->m_isSccWeak   = m_encCfg->m_forceScc >= 2;
+    return;
+  }
+
+  CPelBuf yuvOrgBuf = pic->getOrigBuf().Y();
+
+  // blocksize and threshold
+  static constexpr int SIZE_BL =  4;
+  static constexpr int K_SC    = 23;
+  static constexpr int K_noSC  =  8;
+
+  // mean and variance fixed point accuracy
+  static constexpr int accM = 4;
+  static constexpr int accV = 2;
+
+  static_assert( accM <= 4 && accV <= 4, "Maximum Mean and Variance accuracy of 4 allowed!" );
+  static constexpr int shfM = 4 - accM;
+  static constexpr int shfV = 4 + accM - accV;
+  static constexpr int addM = 1 << shfM >> 1;
+  static constexpr int addV = 1 << shfV >> 1;
+
+  static constexpr int SizeS = SIZE_BL << 1;
+
+  const int minLevel = 1 << ( m_encCfg->m_internalBitDepth[CH_L] - ( m_encCfg->m_videoFullRangeFlag ? 6 : 4 ) ); // 1/16th or 1/64th of range
+
+  const Pel*     piSrc    = yuvOrgBuf.buf;
+  const uint32_t uiStride = yuvOrgBuf.stride;
+  const uint32_t uiWidth  = yuvOrgBuf.width;
+  const uint32_t uiHeight = yuvOrgBuf.height;
+
+  CHECK( ( uiWidth & 7 ) != 0 || ( uiHeight & 7 ) != 0, "Width and height have to be multiples of 8!" );
+
+  const int amountBlock = ( uiWidth >> 2 ) * ( uiHeight >> 2 );
+
   int sR[4] = { 0, 0, 0, 0 }; // strong SCC data
   int zR[4] = { 0, 0, 0, 0 }; // zero input data
-  const int amountBlock = (uiWidth >> 2) * (uiHeight >> 2);
-  for( hh = 0; hh < uiHeight; hh += SizeS )
+
+  for( int hh = 0; hh < uiHeight; hh += SizeS )
   {
-    for( ww = 0; ww < uiWidth; ww += SizeS )
+    for( int ww = 0; ww < uiWidth; ww += SizeS )
     {
-      int Rx = ww >= (uiWidth  >> 1) ? 1 : 0;
-      int Ry = hh >= (uiHeight >> 1) ? 1 : 0;
-      Ry = Ry << 1 | Rx;
+      int Rx = ww >= ( uiWidth  >> 1 ) ? 1 : 0;
+      int Ry = hh >= ( uiHeight >> 1 ) ? 2 : 0;
+      Ry = Ry | Rx;
 
-      int i = ww;
-      int j = hh;
       int n = 0;
       int Var[4];
-      for( j = hh; (j < hh + SizeS) && (j < uiHeight); j += size )
+
+      for( int j = hh; j < hh + SizeS; j += SIZE_BL )
       {
-        for( i = ww; (i < ww + SizeS) && (i < uiWidth); i += size )
+        for( int i = ww; i < ww + SizeS; i += SIZE_BL )
         {
-          int sum = 0;
+          const Pel *p0 = &piSrc[j * uiStride + i];
+
           int Mit = 0;
-          int V = 0;
-          int h = j;
-          int w = i;
-          for( h = j; (h < j + size) && (h < uiHeight); h++ )
+          int V   = 0;
+
+          for( int h = 0; h < SIZE_BL; h++, p0 += uiStride )
           {
-            for( w = i; (w < i + size) && (w < uiWidth); w++ )
+            for( int w = 0; w < SIZE_BL; w++ )
             {
-              sum += int(piSrc[h * uiStride + w]);
+              Mit += p0[w];
             }
           }
-          int sizeEnd = ((h - j) * (w - i));
-          Mit = sum / sizeEnd;
-          for( h = j; (h < j + size) && (h < uiHeight); h++ )
+
+          Mit = ( Mit + addM ) >> shfM;
+
+          p0 = &piSrc[j * uiStride + i];
+
+          for( int h = 0; h < SIZE_BL; h++, p0 += uiStride )
           {
-            for( w = i; (w < i + size) && (w < uiWidth); w++ )
+            for( int w = 0; w < SIZE_BL; w++ )
             {
-              V += abs(Mit - int(piSrc[h * uiStride + w]));
+              V += abs( Mit - ( int( p0[w] ) << accM ) );
             }
           }
-          // Variance in Block (SIZE_BL*SIZE_BL)
-          if (V < sizeEnd && Mit <= minLevel)
+
+          // if variance is lower than 1 and mean is lower/equal to minLevel
+          if( V < ( 1 << ( accM + 4 ) ) && Mit <= ( minLevel << accM ) )
           {
             Var[n] = -1;
           }
           else
           {
-            Var[n] = V / sizeEnd;
+            Var[n] = ( V + addV ) >> shfV;
           }
+
           n++;
         }
       }
+
       for( int i = 0; i < 2; i++ )
       {
-        if( Var[i] == Var[i + 2] )
+        const int var0 = Var[ i];
+        const int var1 = Var[ i + 2];
+        const int var2 = Var[ i << 1];
+        const int var3 = Var[(i << 1) + 1];
+
+        if( var0 < 0 && var1 < 0 && zR[Ry] * 20 < amountBlock )
         {
-          if( Var[i] < 0 && zR[Ry] * 20 < amountBlock )
-          {
-            zR[Ry]++;
-          }
-          else
-          {
-            sR[Ry]++;
-          }
+          zR[Ry]++;
         }
-        if( Var[i << 1] == Var[(i << 1) + 1] )
+        else if( var0 == var1 )
         {
-          if( Var[i << 1] < 0 && zR[Ry] * 20 < amountBlock )
-          {
-            zR[Ry]++;
-          }
-          else
-          {
-            sR[Ry]++;
-          }
+          sR[Ry]++;
+        }
+
+        if( var2 < 0 && var3 < 0 && zR[Ry] * 20 < amountBlock )
+        {
+          zR[Ry]++;
+        }
+        else if( var2 == var3 )
+        {
+          sR[Ry]++;
         }
       }
     }
   }
-  int s = 0;
-  isSccStrong = true;
-  size = 0;
+
+  bool isSccWeak     = false;
+  bool isSccStrong   = false;
+  bool isNoSccStrong = false;
+
+  int numAll   = 0;
+  int numMin   = amountBlock, numMax = 0;
+  int numBelow = 0;
+
   for( int r = 0; r < 4; r++ )
   {
-    s += sR[r];
-    if (size < sR[r]) // find peak quarter
-    {
-      size = sR[r];
-    }
-    if ((sR[r] * 100 / (amountBlock >> 2)) <= K_SC)
-    {
-      isSccStrong = false;
-    }
-  }
-  isSccWeak = ((s * 100 / amountBlock) > K_SC);
-  if (isSccWeak && (size * 93 / (amountBlock >> 1)) > K_SC)
-  {
-    isSccStrong = true; // peak quarter is above 2.15*K_SC threshold
+    numAll   += sR[r];
+    numMax    = std::max( numMax, sR[r] );
+    numMin    = std::min( numMin, sR[r] );
+    numBelow += sR[r] * 100 <= K_SC * ( amountBlock >> 2 ) ? 1 : 0;
   }
 
+  // lowest quarter is above K_SC threshold
+  isSccStrong   = numMin * 100 >  K_SC *   ( amountBlock >> 2 );
+  // lowest quarter is below K_noSC threshold and theres more than one quarter below K_SC threshold
+  isNoSccStrong = numMin * 100 <= K_noSC * ( amountBlock >> 2 ) && numBelow > 1;
+  // overall is above K_SC threshold
+  isSccWeak     = numAll * 100 >  K_SC *     amountBlock;
+  // peak quarter is above 2.15*K_SC threshold
+  isSccStrong  |= isSccWeak && !isNoSccStrong && numMax * 186 > K_SC * amountBlock;
+
   PicShared* picShared     = pic->m_picShared;
   pic->isSccWeak           = isSccWeak;
   pic->isSccStrong         = isSccStrong;
diff --git a/source/Lib/EncoderLib/RateCtrl.cpp b/source/Lib/EncoderLib/RateCtrl.cpp
index 43b2b6c3b..8f261371b 100644
--- a/source/Lib/EncoderLib/RateCtrl.cpp
+++ b/source/Lib/EncoderLib/RateCtrl.cpp
@@ -1285,7 +1285,7 @@ void RateCtrl::initRateControlPic( Picture& pic, Slice* slice, int& qp, double&
           tmpVal = updateQPstartModelVal() + log (sqrOfResRatio) / log (2.0); // GOP's QPstart
           d /= (double)it->numBits;
           d = firstPassSliceQP - ( 105.0 / 128.0 ) * sqrt( (double)std::max( 1, firstPassSliceQP ) ) * log( d ) / log( 2.0 );
-          sliceQP = int( 0.5 + d + ( it->isIntra && m_pcEncCfg->m_HdrMode != vvencHDRMode::VVENC_HDR_OFF ? 0.375 : 0.5 ) * std::max( 0.0, tmpVal - d ) + encRCSeq->qpCorrection[ frameLevel ] );
+          sliceQP = int( 0.5 + d + ( it->isIntra ? 0.375 : 0.5 ) * std::max( 0.0, tmpVal - d ) + encRCSeq->qpCorrection[ frameLevel ] );
 
           encRcPic->clipTargetQP( getPicList(), ( m_pcEncCfg->m_LookAhead ? getBaseQP() : m_pcEncCfg->m_QP ) + ( it->isIntra ? m_pcEncCfg->m_intraQPOffset : 0 ), 5 - budgetRelaxScale,
                                   ( it->poc < encRCSeq->gopSize ? 0 : ( m_pcEncCfg->m_maxTLayer + 1 ) >> 1 ), sqrOfResRatio, sliceQP, &encRCSeq->lastAverageQP );
diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h
index b6f51f0f9..7f28a2936 100644
--- a/source/Lib/apputils/VVEncAppCfg.h
+++ b/source/Lib/apputils/VVEncAppCfg.h
@@ -858,6 +858,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr )
     ("AddGOP32refPics",                                 c->m_addGOP32refPics,                                "Use different QP offsets and reference pictures in GOP structure")
     ("NumRefPics",                                      c->m_numRefPics,                                     "Number of reference pictures in RPL (0: default for RPL, <10: apply for all temporal layers, >=10: each decimal digit specifies the number for a temporal layer, last digit applying to the highest TL)" )
     ("NumRefPicsSCC",                                   c->m_numRefPicsSCC,                                  "Number of reference pictures in RPL for SCC pictures (semantic analogue to NumRefPics, -1: equal to NumRefPics)" )
+    ("ForceSCC",                                        c->m_forceScc,                                       "Force SCC treatment, instead of detection (<=0: use detection, 1: treat all frames as not SCC, 2: treat all frames as weak SCC, 3: treat all frames as strong SCC)" )
     ;
 
     opts.setSubSection("Low-level QT-BTT partitioning options");
diff --git a/source/Lib/vvenc/CMakeLists.txt b/source/Lib/vvenc/CMakeLists.txt
index b224d9f76..3cdbca21c 100644
--- a/source/Lib/vvenc/CMakeLists.txt
+++ b/source/Lib/vvenc/CMakeLists.txt
@@ -29,8 +29,8 @@ if( VVENC_ENABLE_X86_SIMD )
   # get x86 include files
   file( GLOB X86_INC_FILES "../CommonLib/x86/*.h" )
 
-  ## get avx source files
-  #file( GLOB AVX_SRC_FILES "../CommonLib/x86/avx/*.cpp" )
+  # get avx source files
+  file( GLOB AVX_SRC_FILES "../CommonLib/x86/avx/*.cpp" )
 
   # get avx2 source files
   file( GLOB AVX2_SRC_FILES "../CommonLib/x86/avx2/*.cpp" )
@@ -38,8 +38,8 @@ if( VVENC_ENABLE_X86_SIMD )
   # get sse4.1 source files
   file( GLOB SSE41_SRC_FILES "../CommonLib/x86/sse41/*.cpp" )
 
-  ## get sse4.2 source files
-  #file( GLOB SSE42_SRC_FILES "../CommonLib/x86/sse42/*.cpp" )
+  # get sse4.2 source files
+  file( GLOB SSE42_SRC_FILES "../CommonLib/x86/sse42/*.cpp" )
 endif()
 
 if( VVENC_ENABLE_ARM_SIMD )
@@ -93,31 +93,30 @@ set( CMAKE_VISIBILITY_INLINES_HIDDEN TRUE )
 if( VVENC_ENABLE_X86_SIMD )
   # set needed compile definitions
   set_property( SOURCE ${SSE41_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_SSE41 )
-  #set_property( SOURCE ${SSE42_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_SSE42 )
-  #set_property( SOURCE ${AVX_SRC_FILES}   APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX )
+  set_property( SOURCE ${SSE42_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_SSE42 )
+  set_property( SOURCE ${AVX_SRC_FILES}   APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX )
   set_property( SOURCE ${AVX2_SRC_FILES}  APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX2 )
   # set needed compile flags
   if( MSVC )
-    #set_property( SOURCE ${AVX_SRC_FILES}   APPEND PROPERTY COMPILE_FLAGS "/arch:AVX" )
+    set_property( SOURCE ${AVX_SRC_FILES}   APPEND PROPERTY COMPILE_FLAGS "/arch:AVX" )
     set_property( SOURCE ${AVX2_SRC_FILES}  APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" )
   elseif( UNIX OR MINGW )
     include( vvencCompilerSupport )
 
     set_if_compiler_supports_flag( FLAG_mxsave -mxsave  )
     set_if_compiler_supports_flag( FLAG_msse41 -msse4.1 )
-    #set_if_compiler_supports_flag( FLAG_msse42 -msse4.2 )
-    #set_if_compiler_supports_flag( FLAG_mavx   -mavx    )
+    set_if_compiler_supports_flag( FLAG_msse42 -msse4.2 )
+    set_if_compiler_supports_flag( FLAG_mavx   -mavx    )
     set_if_compiler_supports_flag( FLAG_mavx2  -mavx2   )
 
     set_property( SOURCE ${X86_SRC_FILES}   APPEND PROPERTY COMPILE_FLAGS ${FLAG_mxsave} )
     set_property( SOURCE ${SSE41_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_msse41}" )
-    #set_property( SOURCE ${SSE42_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_msse42}" )
-    #set_property( SOURCE ${AVX_SRC_FILES}   APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx}"   )
+    set_property( SOURCE ${SSE42_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_msse42}" )
+    set_property( SOURCE ${AVX_SRC_FILES}   APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx}"   )
     set_property( SOURCE ${AVX2_SRC_FILES}  APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx2}"  )
   endif()
 
-  #add_library( ${LIB_NAME}_x86_simd OBJECT ${SSE41_SRC_FILES} ${SSE42_SRC_FILES} ${AVX_SRC_FILES} ${AVX2_SRC_FILES} )
-  add_library( ${LIB_NAME}_x86_simd OBJECT ${SSE41_SRC_FILES} ${AVX2_SRC_FILES} )
+  add_library( ${LIB_NAME}_x86_simd OBJECT ${SSE41_SRC_FILES} ${SSE42_SRC_FILES} ${AVX_SRC_FILES} ${AVX2_SRC_FILES} )
   # disble LTO for the files compiled with special architecture flags
   set_target_properties( ${LIB_NAME}_x86_simd PROPERTIES
                                               INTERPROCEDURAL_OPTIMIZATION                OFF
diff --git a/source/Lib/vvenc/vvencCfg.cpp b/source/Lib/vvenc/vvencCfg.cpp
index 982d2b9e2..eb98284af 100644
--- a/source/Lib/vvenc/vvencCfg.cpp
+++ b/source/Lib/vvenc/vvencCfg.cpp
@@ -687,8 +687,9 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c )
 
   c->m_FirstPassMode                           = 0;
 
+  c->m_forceScc                                = 0;
+
   c->m_reservedFlag                            = false;
-  c->m_reservedInt                             = 0;
   memset( c->m_reservedDouble, 0, sizeof(c->m_reservedDouble) );
 
   // init default preset
@@ -760,9 +761,11 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c )
   {
     vvenc_confirmParameter( c, c->m_bufferingPeriodSEIEnabled, "Enabling bufferingPeriod SEI requires rate control" );
     vvenc_confirmParameter( c, c->m_pictureTimingSEIEnabled,   "Enabling pictureTiming SEI requires rate control" );
-    vvenc_confirmParameter( c, c->m_RCMaxBitrate > 0,          "Specifying a maximum bitrate requires rate control" );
+    vvenc_confirmParameter( c, c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX && !c->m_usePerceptQPA, "Enabling capped CQF requires PerceptQPA to be enabled" );
+    vvenc_confirmParameter( c, c->m_RCMaxBitrate > 0 && c->m_RCInitialQP > 0, "Specifying an RCInitialQP value requires rate control" );
+    vvenc_confirmParameter( c, c->m_RCMaxBitrate < 0, "Cannot specify a relative max rate when using CQF, please specify an absolute value" );
   }
-  else if ( c->m_RCMaxBitrate == 0 )
+  if( c->m_RCMaxBitrate == 0 )
   {
     c->m_RCMaxBitrate = INT32_MAX;
   }
@@ -804,8 +807,8 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c )
     }
   }
 
-  const double d = (3840.0 * 2160.0) / double (c->m_SourceWidth * c->m_SourceHeight);
-  const int rcQP = (c->m_RCInitialQP > 0 ? std::min (vvenc::MAX_QP, c->m_RCInitialQP) : std::max (0, vvenc::MAX_QP_PERCEPT_QPA - (c->m_FirstPassMode > 2 ? 4 : 2) - int (0.5 + sqrt ((d * std::max (0, c->m_RCTargetBitrate)) / 500000.0))));
+  const double d = (c->m_RCTargetBitrate != VVENC_RC_OFF ? 1.0 : 2.25) * (3840.0 * 2160.0) / double (c->m_SourceWidth * c->m_SourceHeight);
+  const int rcQP = (c->m_RCInitialQP > 0 ? std::min (vvenc::MAX_QP, c->m_RCInitialQP) : std::max (0, vvenc::MAX_QP_PERCEPT_QPA - (c->m_FirstPassMode > 2 ? 4 : 2) - int (0.5 + sqrt ((d * std::max (0, (c->m_RCTargetBitrate != VVENC_RC_OFF ? c->m_RCTargetBitrate : c->m_RCMaxBitrate))) / 500000.0))));
 
   // TODO 2.0: make this an error
   //vvenc_confirmParameter( c, c->m_RCTargetBitrate != VVENC_RC_OFF && c->m_QP != VVENC_AUTO_QP && c->m_QP != VVENC_DEFAULT_QP, "Rate-control and QP based encoding are mutually exclusive!" );
@@ -823,6 +826,7 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c )
 
   vvenc_confirmParameter( c, c->m_RCTargetBitrate != VVENC_RC_OFF && ( c->m_RCTargetBitrate < 0 || c->m_RCTargetBitrate > 800000000 ),  "TargetBitrate must be between 0 and 800000000" );
   vvenc_confirmParameter( c, c->m_RCTargetBitrate != VVENC_RC_OFF && (int64_t) c->m_RCMaxBitrate * 2 < (int64_t) c->m_RCTargetBitrate * 3, "MaxBitrate must be at least 1.5*TargetBitrate" );
+  vvenc_confirmParameter( c, c->m_RCTargetBitrate == VVENC_RC_OFF && c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX && rcQP + sqrt (c->m_FrameRate / (double) c->m_FrameScale) > c->m_QP + 10.125, "Capped CQF is used and MaxBitrate is too low for specified QP and frame rate/scale" );
   vvenc_confirmParameter( c, c->m_RCTargetBitrate != VVENC_RC_OFF && ( c->m_FirstPassMode < 0 || c->m_FirstPassMode > 4 ), "FirstPassMode must be 0, 1, 2, 3, or 4" );
 
   if ( c->m_internChromaFormat < 0 || c->m_internChromaFormat >= VVENC_NUM_CHROMA_FORMAT )
@@ -2619,7 +2623,7 @@ VVENC_DECL int vvenc_init_preset( vvenc_config *c, vvencPresetMode preset )
       c->m_MinQT[ 0 ]                      = 8;
       c->m_MinQT[ 1 ]                      = 8;
       c->m_MinQT[ 2 ]                      = 4;
-      c->m_maxMTTDepth                     = 221111;
+      c->m_maxMTTDepth                     = 1;
       c->m_maxMTTDepthI                    = 2;
 
       // speedups
@@ -2628,7 +2632,7 @@ VVENC_DECL int vvenc_init_preset( vvenc_config *c, vvencPresetMode preset )
       c->m_contentBasedFastQtbt            = true;
       c->m_fastHad                         = false;
       c->m_usePbIntraFast                  = 1;
-      c->m_useFastMrg                      = 3;
+      c->m_useFastMrg                      = 2;
       c->m_fastLocalDualTreeMode           = 1;
       c->m_fastSubPel                      = 1;
       c->m_FastIntraTools                  = 1;
@@ -2639,7 +2643,7 @@ VVENC_DECL int vvenc_init_preset( vvenc_config *c, vvencPresetMode preset )
       c->m_numIntraModesFullRD             = -1;
       c->m_reduceIntraChromaModesFullRD    = true;
       c->m_meReduceTap                     = 2;
-      c->m_numRefPics                      = 222111;
+      c->m_numRefPics                      = 222221;
       c->m_numRefPicsSCC                   = 0;
 
       // tools
@@ -2976,17 +2980,23 @@ VVENC_DECL const char* vvenc_get_config_as_string( vvenc_config *c, vvencMsgLeve
       }
       else
         css << "single-pass";
-      if( c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX )
+    }
+    else
+    {
+      css << "QP " << c->m_QP;
+    }
+    if( c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX )
+    {
+      if ( c->m_RCTargetBitrate <= 0 )
       {
-        if( c->m_RCMaxBitrate < 1000000 )
-          css << "  (max. rate " <<  (double)c->m_RCMaxBitrate/1000.0 << " kbps)";
-        else
-          css << "  (max. rate " <<  (double)c->m_RCMaxBitrate/1000000.0 << " Mbps)";
+        css << "  capped CQF";
       }
-      css << "\n";
+      if( c->m_RCMaxBitrate < 1000000 )
+        css << "  (max. rate " <<  (double)c->m_RCMaxBitrate/1000.0 << " kbps)";
+      else
+        css << "  (max. rate " <<  (double)c->m_RCMaxBitrate/1000000.0 << " Mbps)";
     }
-    else
-      css << "QP " <<  c->m_QP << "\n";
+    css << "\n";
 
     css << loglvl << "Perceptual optimization                : " << (c->m_usePerceptQPA ? "Enabled" : "Disabled") << "\n";
     css << loglvl << "Intra period (keyframe)                : " << c->m_IntraPeriod << "\n";
@@ -3186,16 +3196,23 @@ VVENC_DECL const char* vvenc_get_config_as_string( vvenc_config *c, vvencMsgLeve
       css << "Passes:" << c->m_RCNumPasses << " ";
       css << "Pass:" << c->m_RCPass << " ";
       css << "TargetBitrate:" << c->m_RCTargetBitrate << " ";
-      if ( c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX )
+      if ( c->m_RCInitialQP > 0 )
       {
-        css << "MaxBitrate:" << c->m_RCMaxBitrate << " ";
+        css << "RCInitialQP:" << c->m_RCInitialQP << " ";
       }
-      css << "RCInitialQP:" << c->m_RCInitialQP << " ";
     }
     else
     {
       css << "QP:" << c->m_QP << " ";
     }
+    if ( c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX )
+    {
+      if ( c->m_RCTargetBitrate <= 0 )
+      {
+        css << "(capped CQF) ";
+      }
+      css << "MaxBitrate:" << c->m_RCMaxBitrate << " ";
+    }
 
     css << "LookAhead:" << c->m_LookAhead << " ";
     css << "FirstPassMode:" << c->m_FirstPassMode << " ";
diff --git a/thirdparty/simde/x86/avx2.h b/thirdparty/simde/x86/avx2.h
index 1247b5193..9fd0d9490 100644
--- a/thirdparty/simde/x86/avx2.h
+++ b/thirdparty/simde/x86/avx2.h
@@ -4080,7 +4080,7 @@ simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) {
 
     SIMDE_VECTORIZE
     for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
-      r_.i8[i] = (b_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i];
+      r_.i8[i] = (b_.i8[i] == INT8_C(0)) ? INT8_C(0) : (b_.i8[i] < INT8_C(0)) ? -a_.i8[i] : a_.i8[i];
     }
 
     return simde__m256i_from_private(r_);
@@ -4104,7 +4104,7 @@ simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) {
 
     SIMDE_VECTORIZE
     for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
-      r_.i16[i] = (b_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i];
+      r_.i16[i] = (b_.i16[i] == INT16_C(0)) ? INT16_C(0) : (b_.i16[i] < INT16_C(0)) ? -a_.i16[i] : a_.i16[i];
     }
 
     return simde__m256i_from_private(r_);
@@ -4128,7 +4128,7 @@ simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) {
 
     SIMDE_VECTORIZE
     for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
-      r_.i32[i] = (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
+      r_.i32[i] = (b_.i32[i] == INT32_C(0)) ? INT32_C(0) : (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
     }
 
     return simde__m256i_from_private(r_);