diff --git a/cfg/experimental/lowdelay_medium.cfg b/cfg/experimental/lowdelay_medium.cfg index 5534da9e4..2a30481ed 100644 --- a/cfg/experimental/lowdelay_medium.cfg +++ b/cfg/experimental/lowdelay_medium.cfg @@ -70,7 +70,7 @@ DualITree : 1 # separate partitioning of luma and chrom MinQTLumaISlice : 8 MinQTChromaISliceInChromaSamples : 4 # minimum QT size in chroma samples for chroma separate tree MinQTNonISlice : 8 -MaxMTTDepth : 221111 +MaxMTTDepth : 1 MaxMTTDepthI : 2 MaxNumMergeCand : 6 # Maximum number of merge candidates @@ -130,7 +130,7 @@ ContentBasedFastQtbt : 1 # Signal based QTBT speed-up: 0: disabled PBIntraFast : 1 # Intra mode pre-check dependent on best Inter mode, skip intra if it is not probable (0:off ... 2:fastest) FastQtBtEnc : 1 # Fast encoding setting for QTBT FastHAD : 0 # Use fast sub-sampled hadamard for square blocks >=32x32 -FastMrg : 3 # Fast methods for inter merge: 0: disabled, 1: vtm, 2: fast, 3: faster +FastMrg : 2 # Fast methods for inter merge: 0: disabled, 1: vtm, 2: fast, 3: faster FastLocalDualTreeMode : 1 # Fast intra pass coding for local dual-tree in intra coding region: 0: disable, 1: use threshold, 2: one intra mode only FastSubPel : 1 # Fast sub-pel ME: 0: disabled, 1: enabled FastIntraTools : 1 # Speedup intra tools: LFNST, ISP, MTS diff --git a/cfg/randomaccess_medium.cfg b/cfg/randomaccess_medium.cfg index 9377b27f4..abd3ff023 100644 --- a/cfg/randomaccess_medium.cfg +++ b/cfg/randomaccess_medium.cfg @@ -22,7 +22,7 @@ BipredSearchRange : 4 # Search range for bi-prediction ref HadamardME : 1 # Use of hadamard measure for fractional ME FEN : 3 # Fast encoder decision FDM : 1 # Fast Decision for Merge RD cost -NumRefPics : 222111 # Number of reference pictures in RPL (0: default for RPL, <10: apply for all temporal layers, >=10: each decimal digit specifies the number for a temporal layer, last digit applying to the highest TL) +NumRefPics : 222221 # Number of reference pictures in RPL (0: default for RPL, <10: apply for 
all temporal layers, >=10: each decimal digit specifies the number for a temporal layer, last digit applying to the highest TL) NumRefPicsSCC : 0 # Number of reference pictures in RPL for SCC pictures (semantic analogue to NumRefPics, -1: equal to NumRefPics) #======== Quantization ============= @@ -58,7 +58,7 @@ DualITree : 1 # separate partitioning of luma and chrom MinQTLumaISlice : 8 MinQTChromaISliceInChromaSamples : 4 # minimum QT size in chroma samples for chroma separate tree MinQTNonISlice : 8 -MaxMTTDepth : 221111 +MaxMTTDepth : 1 MaxMTTDepthI : 2 MaxNumMergeCand : 6 # Maximum number of merge candidates @@ -118,7 +118,7 @@ ContentBasedFastQtbt : 1 # Signal based QTBT speed-up: 0: disabled PBIntraFast : 1 # Intra mode pre-check dependent on best Inter mode, skip intra if it is not probable (0:off ... 2:fastest) FastQtBtEnc : 1 # Fast encoding setting for QTBT FastHAD : 0 # Use fast sub-sampled hadamard for square blocks >=32x32 -FastMrg : 3 # Fast methods for inter merge: 0: disabled, 1: vtm, 2: fast, 3: faster +FastMrg : 2 # Fast methods for inter merge: 0: disabled, 1: vtm, 2: fast, 3: faster FastLocalDualTreeMode : 1 # Fast intra pass coding for local dual-tree in intra coding region: 0: disable, 1: use threshold, 2: one intra mode only FastSubPel : 1 # Fast sub-pel ME: 0: disabled, 1: enabled FastIntraTools : 1 # Speedup intra tools: LFNST, ISP, MTS diff --git a/include/vvenc/vvencCfg.h b/include/vvenc/vvencCfg.h index f8d63fd8f..c1f46ced9 100644 --- a/include/vvenc/vvencCfg.h +++ b/include/vvenc/vvencCfg.h @@ -774,7 +774,7 @@ typedef struct vvenc_config // if negative, the absolute value is interpreted as a 4-bit fixed point multiplier of the target bitrate). // -24, i.e. -1.1000 binary, means the maxrate would be set to be the 1.5x of the target bitrate. // for convenience use VVENC_SET_MAXRATE_FACTOR, e.g. 
VVENC_SET_MAXRATE_FACTOR(1.5), to set the multiplier - int m_reservedInt; + int m_forceScc; double m_reservedDouble[9]; // internal state variables diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 48532587e..b804a2e49 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -582,9 +582,11 @@ inline std::string prnt( const char* fmt, ...) #if ( _WIN32 && ( _MSC_VER > 1300 ) ) || defined (__MINGW64_VERSION_MAJOR) #define xMalloc( type, len ) _aligned_malloc( sizeof(type)*(len), MEMORY_ALIGN_DEF_SIZE ) +#define xMalloc2( type, len, alg ) _aligned_malloc( sizeof(type)*(len), alg ) #define xFree( ptr ) _aligned_free ( ptr ) #elif defined (__MINGW32__) #define xMalloc( type, len ) __mingw_aligned_malloc( sizeof(type)*(len), MEMORY_ALIGN_DEF_SIZE ) +#define xMalloc2( type, len, alg ) __mingw_aligned_malloc( sizeof(type)*(len), alg ) #define xFree( ptr ) __mingw_aligned_free( ptr ) #else namespace detail { @@ -599,11 +601,13 @@ static inline T* aligned_malloc(size_t len, size_t alignement) { } } #define xMalloc( type, len ) detail::aligned_malloc( len, MEMORY_ALIGN_DEF_SIZE ) +#define xMalloc2( type, len, alg ) detail::aligned_malloc( len, alg ) #define xFree( ptr ) free( ptr ) #endif #else #define xMalloc( type, len ) malloc ( sizeof(type)*(len) ) +#define xMalloc2( type, len, alg ) malloc ( sizeof(type)*(len) ) #define xFree( ptr ) free ( ptr ) #endif //#if ALIGNED_MALLOC diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp index 961e03133..a77964072 100644 --- a/source/Lib/CommonLib/DepQuant.cpp +++ b/source/Lib/CommonLib/DepQuant.cpp @@ -44,10 +44,6 @@ POSSIBILITY OF SUCH DAMAGE. 
#include "TrQuant.h" #include "CodingStructure.h" #include "UnitTools.h" -#ifdef TARGET_SIMD_X86 -# include "x86/CommonDefX86.h" -# include -#endif #include @@ -59,105 +55,6 @@ namespace vvenc { namespace DQIntern { - /*================================================================================*/ - /*===== =====*/ - /*===== R A T E E S T I M A T O R =====*/ - /*===== =====*/ - /*================================================================================*/ - - struct NbInfoSbb - { - //uint8_t num; - uint8_t numInv; - //uint8_t inPos[5]; - uint8_t invInPos[5]; - }; - struct NbInfoOut - { - uint16_t maxDist; - uint16_t num; - uint16_t outPos[5]; - }; - struct CoeffFracBits - { - int32_t bits[6]; - }; - - - enum ScanPosType : int8_t { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; - - struct ScanInfo - { - ScanInfo() {} - short numSbb; - short scanIdx; - short rasterPos; - short sbbPos; // byte - short nextSbbRight; - short nextSbbBelow; - int8_t sbbSize; - int8_t insidePos; - int8_t nextInsidePos; - ScanPosType spt; - int8_t posX; - int8_t posY; - int8_t sigCtxOffsetNext; - int8_t gtxCtxOffsetNext; - NbInfoSbb currNbInfoSbb; - }; - - class Rom; - struct TUParameters - { - TUParameters ( const Rom& rom, const unsigned width, const unsigned height, const ChannelType chType ); - ~TUParameters() - { - delete [] m_scanInfo; - } - - ChannelType m_chType; - unsigned m_width; - unsigned m_height; - unsigned m_numCoeff; - unsigned m_numSbb; - unsigned m_log2SbbWidth; - unsigned m_log2SbbHeight; - unsigned m_log2SbbSize; - unsigned m_sbbSize; - unsigned m_sbbMask; - unsigned m_widthInSbb; - unsigned m_heightInSbb; - const ScanElement *m_scanSbbId2SbbPos; - const ScanElement *m_scanId2BlkPos; - const NbInfoSbb* m_scanId2NbInfoSbb; - const NbInfoOut* m_scanId2NbInfoOut; - ScanInfo* m_scanInfo; - private: - void xSetScanInfo( ScanInfo& scanInfo, int scanIdx ); - }; - - class Rom - { - public: - Rom() : m_scansInitialized(false) {} - ~Rom() { xUninitScanArrays(); } 
- void init () { xInitScanArrays(); } - const NbInfoSbb* getNbInfoSbb( int hd, int vd ) const { return m_scanId2NbInfoSbbArray[hd][vd]; } - const NbInfoOut* getNbInfoOut( int hd, int vd ) const { return m_scanId2NbInfoOutArray[hd][vd]; } - const TUParameters* getTUPars ( const CompArea& area, const ComponentID compID ) const - { - return m_tuParameters[Log2(area.width)][Log2(area.height)][toChannelType(compID)]; - } - private: - void xInitScanArrays (); - void xUninitScanArrays (); - private: - bool m_scansInitialized; - NbInfoSbb* m_scanId2NbInfoSbbArray[ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ]; - NbInfoOut* m_scanId2NbInfoOutArray[ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ]; - TUParameters* m_tuParameters [ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ][ MAX_NUM_CH ]; - }; - void Rom::xInitScanArrays() { if( m_scansInitialized ) @@ -427,48 +324,6 @@ namespace DQIntern } } - - - class RateEstimator - { - public: - RateEstimator () {} - ~RateEstimator() {} - void initCtx ( const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID, const FracBitsAccess& fracBitsAccess ); - - inline const BinFracBits *sigSbbFracBits() const { return m_sigSbbFracBits; } - inline const BinFracBits *sigFlagBits(unsigned stateId) const - { - return m_sigFracBits[std::max(((int) stateId) - 1, 0)]; - } - inline const CoeffFracBits *gtxFracBits(unsigned stateId) const { return m_gtxFracBits; } - inline int32_t lastOffset(unsigned scanIdx) const - { - return m_lastBitsX[m_scanId2Pos[scanIdx].x] + m_lastBitsY[m_scanId2Pos[scanIdx].y]; - } - - private: - void xSetLastCoeffOffset ( const FracBitsAccess& fracBitsAccess, const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID ); - void xSetSigSbbFracBits ( const FracBitsAccess& fracBitsAccess, ChannelType chType ); - void xSetSigFlagBits ( const FracBitsAccess& fracBitsAccess, ChannelType chType ); - void xSetGtxFlagBits ( const FracBitsAccess& fracBitsAccess, ChannelType chType ); - - private: - static const unsigned 
sm_numCtxSetsSig = 3; - static const unsigned sm_numCtxSetsGtx = 2; - static const unsigned sm_maxNumSigSbbCtx = 2; - static const unsigned sm_maxNumSigCtx = 12; - static const unsigned sm_maxNumGtxCtx = 21; - - private: - const ScanElement * m_scanId2Pos; - int32_t m_lastBitsX [ MAX_TB_SIZEY ]; - int32_t m_lastBitsY [ MAX_TB_SIZEY ]; - BinFracBits m_sigSbbFracBits [ sm_maxNumSigSbbCtx ]; - BinFracBits m_sigFracBits [ sm_numCtxSetsSig ][ sm_maxNumSigCtx ]; - CoeffFracBits m_gtxFracBits [ sm_maxNumGtxCtx ]; - }; - void RateEstimator::initCtx( const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID, const FracBitsAccess& fracBitsAccess ) { m_scanId2Pos = tuPars.m_scanId2BlkPos; @@ -598,69 +453,7 @@ namespace DQIntern } } - - - - - /*================================================================================*/ - /*===== =====*/ - /*===== D A T A S T R U C T U R E S =====*/ - /*===== =====*/ - /*================================================================================*/ - - - struct PQData - { - TCoeff absLevel; - int64_t deltaDist; - }; - - - struct Decision - { - int64_t rdCost; - TCoeff absLevel; - int prevId; - }; - - - - - /*================================================================================*/ - /*===== =====*/ - /*===== P R E - Q U A N T I Z E R =====*/ - /*===== =====*/ - /*================================================================================*/ - - class Quantizer - { - public: - Quantizer() {} - void init ( int dqThrVal ) { m_DqThrVal = dqThrVal; } - void dequantBlock ( const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, CoeffBuf& recCoeff, bool enableScalingLists, int* piDequantCoef ) const; - void initQuantBlock ( const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, const double lambda, int gValue ); - inline void preQuantCoeff( const TCoeff absCoeff, PQData *pqData, int quanCoeff ) const; - inline TCoeff getLastThreshold() const { return m_thresLast; } - inline 
TCoeff getSSbbThreshold() const { return m_thresSSbb; } - - inline int64_t getQScale() const { return m_QScale; } - private: - // quantization - int m_DqThrVal; - int m_QShift; - int64_t m_QAdd; - int64_t m_QScale; - TCoeff m_maxQIdx; - TCoeff m_thresLast; - TCoeff m_thresSSbb; - // distortion normalization - int m_DistShift; - int64_t m_DistAdd; - int64_t m_DistStepAdd; - int64_t m_DistOrgFact; - }; - - void Quantizer::initQuantBlock(const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, const double lambda, int gValue = -1) + void Quantizer::initQuantBlock(const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, const double lambda, int gValue) { CHECKD( lambda <= 0.0, "Lambda must be greater than 0" ); @@ -758,16 +551,33 @@ namespace DQIntern } } - inline void Quantizer::preQuantCoeff( const TCoeff absCoeff, PQData* pqData, int quanCoeff ) const + bool Quantizer::preQuantCoeff( const TCoeff absCoeff, PQData* pqData, int quanCoeff ) const { int64_t scaledOrg = int64_t( absCoeff ) * quanCoeff; - TCoeff qIdx = std::max( 1, std::min( m_maxQIdx, TCoeff( ( scaledOrg + m_QAdd ) >> m_QShift ) ) ); + TCoeff qIdx = TCoeff( ( scaledOrg + m_QAdd ) >> m_QShift ); + + if( qIdx < 0 ) + { + int64_t scaledAdd = m_DistStepAdd - scaledOrg * m_DistOrgFact; + PQData& pq_a = pqData[1]; + PQData& pq_b = pqData[2]; + + pq_a.deltaDist = ( ( scaledAdd + 0 * m_DistStepAdd ) * 1 + m_DistAdd ) >> m_DistShift; + pq_a.absLevel = 1; + + pq_b.deltaDist = ( ( scaledAdd + 1 * m_DistStepAdd ) * 2 + m_DistAdd ) >> m_DistShift; + pq_b.absLevel = 1; + + return true; + } + + qIdx = std::max( 1, std::min( m_maxQIdx, qIdx ) ); int64_t scaledAdd = qIdx * m_DistStepAdd - scaledOrg * m_DistOrgFact; - PQData& pq_a = pqData[ ( qIdx + 0 ) & 3 ]; - PQData& pq_b = pqData[ ( qIdx + 1 ) & 3 ]; - PQData& pq_c = pqData[ ( qIdx + 2 ) & 3 ]; - PQData& pq_d = pqData[ ( qIdx + 3 ) & 3 ]; + PQData& pq_a = pqData[( qIdx + 0 ) & 3]; + PQData& pq_b = pqData[( qIdx + 1 ) & 3]; + PQData& pq_c = 
pqData[( qIdx + 2 ) & 3]; + PQData& pq_d = pqData[( qIdx + 3 ) & 3]; pq_a.deltaDist = ( ( scaledAdd + 0 * m_DistStepAdd ) * ( qIdx + 0 ) + m_DistAdd ) >> m_DistShift; pq_a.absLevel = ( qIdx + 1 ) >> 1; @@ -780,13 +590,9 @@ namespace DQIntern pq_d.deltaDist = ( ( scaledAdd + 3 * m_DistStepAdd ) * ( qIdx + 3 ) + m_DistAdd ) >> m_DistShift; pq_d.absLevel = ( qIdx + 4 ) >> 1; - } - - - - - + return false; + } /*================================================================================*/ /*===== =====*/ @@ -796,6 +602,13 @@ namespace DQIntern class State; + struct Decision + { + int64_t rdCost; + TCoeff absLevel; + int prevId; + }; + struct SbbCtx { uint8_t* sbbFlags; @@ -834,7 +647,6 @@ namespace DQIntern uint8_t m_memory[ 8 * ( MAX_TB_SIZEY * MAX_TB_SIZEY + MLS_GRP_NUM ) ]; }; -#define RICEMAX 32 const int32_t g_goRiceBits[4][RICEMAX] = { { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, @@ -983,7 +795,7 @@ namespace DQIntern if( rdCostA < decisionA.rdCost ) { decisionA.rdCost = rdCostA; - decisionA.absLevel = pqDataA.absLevel; + decisionA.absLevel = 1; decisionA.prevId = m_stateId; } @@ -1085,7 +897,7 @@ namespace DQIntern : m_sbbFracBits { { 0, 0 } } , m_stateId ( stateId ) , m_sigFracBitsArray( rateEst.sigFlagBits(stateId) ) - , m_gtxFracBitsArray( rateEst.gtxFracBits(stateId) ) + , m_gtxFracBitsArray( rateEst.gtxFracBits() ) , m_commonCtx ( commonCtx ) { } @@ -1119,7 +931,7 @@ namespace DQIntern if( decision.absLevel ) { - m_sbb.absLevels[scanInfo.insidePos] = ( uint8_t ) std::min( 255, decision.absLevel ); + m_sbb.absLevels[scanInfo.insidePos] = ( uint8_t ) std::min( 254 + ( decision.absLevel & 1 ), decision.absLevel ); if( scanInfo.currNbInfoSbb.numInv ) { @@ -1212,7 +1024,7 @@ namespace DQIntern ::memset( m_sbb.absLevels, 0, sizeof( m_sbb.absLevels ) ); 
} - m_sbb.absLevels[ scanInfo.insidePos ] = (uint8_t)std::min( 255, decision.absLevel ); + m_sbb.absLevels[ scanInfo.insidePos ] = (uint8_t)std::min( 254 + ( decision.absLevel & 1 ), decision.absLevel ); m_commonCtx.update( scanInfo, prvState, *this ); @@ -1295,21 +1107,17 @@ namespace DQIntern } } - - /*================================================================================*/ /*===== =====*/ /*===== T C Q =====*/ /*===== =====*/ /*================================================================================*/ - class DepQuant : private RateEstimator + class DepQuant : private RateEstimator, public DepQuantImpl { public: DepQuant( bool enc ); - void quant ( TransformUnit& tu, const CCoeffBuf& srcCoeff, const ComponentID compID, const QpParam& cQP, const double lambda, const Ctx& ctx, TCoeff& absSum, bool enableScalingLists, int* quantCoeff ); - void dequant ( const TransformUnit& tu, CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP, bool enableScalingLists, int* quantCoeff ); - void init ( int dqTrVal ); + void quant( TransformUnit& tu, const CCoeffBuf& srcCoeff, const ComponentID compID, const QpParam& cQP, const double lambda, const Ctx& ctx, TCoeff& absSum, bool enableScalingLists, int* quantCoeff ); private: void xDecideAndUpdate ( const TCoeff absCoeff, const ScanInfo& scanInfo, bool zeroOut, int quantCoeff); @@ -1322,7 +1130,6 @@ namespace DQIntern State* m_prevStates; State* m_skipStates; State m_startState; - Quantizer m_quant; Decision m_trellis[ MAX_TB_SIZEY * MAX_TB_SIZEY ][ 8 ]; Rom m_scansRom; }; @@ -1354,17 +1161,6 @@ namespace DQIntern } #undef TINIT - - void DepQuant::dequant( const TransformUnit& tu, CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP, bool enableScalingLists, int* piDequantCoef ) - { - m_quant.dequantBlock( tu, compID, cQP, recCoeff, enableScalingLists, piDequantCoef ); - } - - void DepQuant::init( int dqTrVal ) - { - m_quant.init( dqTrVal ); - } - void DepQuant::xDecide( const ScanInfo 
&scanInfo, const TCoeff absCoeff, const int lastOffset, Decision* decisions, bool zeroOut, int quanCoeff ) { ::memcpy( decisions, startDec, 4*sizeof(Decision) ); @@ -1382,9 +1178,7 @@ namespace DQIntern } PQData pqData[4]; - m_quant.preQuantCoeff( absCoeff, pqData, quanCoeff ); - - bool near0 = pqData[1].deltaDist < pqData[2].deltaDist && pqData[1].absLevel == 1 && pqData[2].absLevel == 1; + bool near0 = m_quant.preQuantCoeff( absCoeff, pqData, quanCoeff ); if( near0 ) { @@ -1392,6 +1186,8 @@ namespace DQIntern m_prevStates[1].checkRdCostsOdd1( scanInfo.spt, pqData[2], decisions[0], decisions[2] ); m_prevStates[2].checkRdCostsOdd1( scanInfo.spt, pqData[1], decisions[3], decisions[1] ); m_prevStates[3].checkRdCostsOdd1( scanInfo.spt, pqData[1], decisions[1], decisions[3] ); + + m_startState.checkRdCostStart( lastOffset, pqData[2], decisions[2] ); } else { @@ -1410,6 +1206,9 @@ namespace DQIntern m_prevStates[1].checkRdCosts( scanInfo.spt, pqData[0], pqData[2], decisions[2], decisions[0] ); m_prevStates[2].checkRdCosts( scanInfo.spt, pqData[3], pqData[1], decisions[1], decisions[3] ); m_prevStates[3].checkRdCosts( scanInfo.spt, pqData[3], pqData[1], decisions[3], decisions[1] ); + + m_startState.checkRdCostStart( lastOffset, pqData[0], decisions[0] ); + m_startState.checkRdCostStart( lastOffset, pqData[2], decisions[2] ); } if( scanInfo.spt==SCAN_EOCSBB ) @@ -1419,9 +1218,6 @@ namespace DQIntern m_skipStates[2].checkRdCostSkipSbb( decisions[2] ); m_skipStates[3].checkRdCostSkipSbb( decisions[3] ); } - - m_startState.checkRdCostStart( lastOffset, pqData[0], decisions[0] ); - m_startState.checkRdCostStart( lastOffset, pqData[2], decisions[2] ); } void DepQuant::xDecideAndUpdate( const TCoeff absCoeff, const ScanInfo& scanInfo, bool zeroOut, int quantCoeff ) @@ -1516,60 +1312,6 @@ namespace DQIntern { const TCoeff defaultTh = TCoeff( thres / ( defaultQuantisationCoefficient << 2 ) ); -#if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 ) - // if more than one 4x4 
coding subblock is available, use SIMD to find first subblock with coefficient larger than threshold - if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > x86_simd::SCALAR ) - { - const int sbbSize = tuPars.m_sbbSize; - // move the pointer to the beginning of the current subblock - firstTestPos -= ( sbbSize - 1 ); - - const __m128i xdfTh = _mm_set1_epi32( defaultTh ); - - // for each subblock - for( ; firstTestPos >= 0; firstTestPos -= sbbSize ) - { - // skip zeroed out blocks - // for 64-point transformation the coding order takes care of that - if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) ) - { - continue; - } - - // read first line of the subblock and check for coefficients larger than the threshold - // assumming the subblocks are dense 4x4 blocks in raster scan order with the stride of tuPars.m_width - int pos = tuPars.m_scanId2BlkPos[firstTestPos].idx; - __m128i xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) ); - __m128i xdf = _mm_cmpgt_epi32( xl0, xdfTh ); - - // same for the next line in the subblock - pos += tuPars.m_width; - xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) ); - xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) ); - - // and the third line - pos += tuPars.m_width; - xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) ); - xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) ); - - // and the last line - pos += tuPars.m_width; - xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) ); - xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) ); - - // if any of the 16 comparisons were true, break, because this subblock contains a coefficient larger than threshold - if( !_mm_testz_si128( xdf, xdf ) ) break; - } - - if( firstTestPos >= 0 ) - { - // if a coefficient was found, advance the pointer 
to the end of the current subblock - // for the subsequent coefficient-wise refinement (C-impl after endif) - firstTestPos += sbbSize - 1; - } - } - -#endif for( ; firstTestPos >= 0; firstTestPos-- ) { if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) ) continue; @@ -1640,23 +1382,36 @@ namespace DQIntern tu.lastPos[compID] = scanIdx - 1; } - }; // namespace DQIntern +void DepQuantImpl::dequant( const TransformUnit& tu, CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP, bool enableScalingLists, int* piDequantCoef ) +{ + m_quant.dequantBlock( tu, compID, cQP, recCoeff, enableScalingLists, piDequantCoef ); +} - +void DepQuantImpl::init( int dqTrVal ) +{ + m_quant.init( dqTrVal ); +} //===== interface class ===== DepQuant::DepQuant( const Quant* other, bool enc, bool useScalingLists ) : QuantRDOQ2( other, useScalingLists ) { +#if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_QUANT + initDepQuantX86(); +#endif + const DepQuant* dq = dynamic_cast( other ); CHECK( other && !dq, "The DepQuant cast must be successfull!" ); - p = new DQIntern::DepQuant( enc ); + if( !p ) + { + p = new DQIntern::DepQuant( enc ); + } } DepQuant::~DepQuant() { - delete static_cast(p); + delete p; } void DepQuant::quant( TransformUnit& tu, const ComponentID compID, const CCoeffBuf& pSrc, TCoeff& uiAbsSum, const QpParam& cQP, const Ctx& ctx ) diff --git a/source/Lib/CommonLib/DepQuant.h b/source/Lib/CommonLib/DepQuant.h index 3e27b8128..ba8de5339 100644 --- a/source/Lib/CommonLib/DepQuant.h +++ b/source/Lib/CommonLib/DepQuant.h @@ -53,6 +53,200 @@ POSSIBILITY OF SUCH DAMAGE. 
namespace vvenc { +namespace DQIntern +{ + /*================================================================================*/ + /*===== =====*/ + /*===== R A T E E S T I M A T O R =====*/ + /*===== =====*/ + /*================================================================================*/ + + struct NbInfoSbb + { + uint8_t numInv; + uint8_t invInPos[5]; + }; + struct NbInfoOut + { + uint16_t maxDist; + uint16_t num; + uint16_t outPos[5]; + }; + struct CoeffFracBits + { + int32_t bits[6]; + }; + + + enum ScanPosType : int8_t { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; + + struct ScanInfo + { + ScanInfo() {} + short numSbb; + short scanIdx; + short rasterPos; + short sbbPos; // byte + short nextSbbRight; + short nextSbbBelow; + int8_t sbbSize; + int8_t insidePos; + int8_t nextInsidePos; + ScanPosType spt; + int8_t posX; + int8_t posY; + int8_t sigCtxOffsetNext; + int8_t gtxCtxOffsetNext; + NbInfoSbb currNbInfoSbb; + }; + + class Rom; + struct TUParameters + { + TUParameters ( const Rom& rom, const unsigned width, const unsigned height, const ChannelType chType ); + ~TUParameters() + { + delete [] m_scanInfo; + } + + ChannelType m_chType; + unsigned m_width; + unsigned m_height; + unsigned m_numCoeff; + unsigned m_numSbb; + unsigned m_log2SbbWidth; + unsigned m_log2SbbHeight; + unsigned m_log2SbbSize; + unsigned m_sbbSize; + unsigned m_sbbMask; + unsigned m_widthInSbb; + unsigned m_heightInSbb; + const ScanElement *m_scanSbbId2SbbPos; + const ScanElement *m_scanId2BlkPos; + const NbInfoSbb* m_scanId2NbInfoSbb; + const NbInfoOut* m_scanId2NbInfoOut; + ScanInfo* m_scanInfo; + private: + void xSetScanInfo( ScanInfo& scanInfo, int scanIdx ); + }; + + class Rom + { + public: + Rom() : m_scansInitialized(false) {} + ~Rom() { xUninitScanArrays(); } + void init () { xInitScanArrays(); } + const NbInfoSbb* getNbInfoSbb( int hd, int vd ) const { return m_scanId2NbInfoSbbArray[hd][vd]; } + const NbInfoOut* getNbInfoOut( int hd, int vd ) const { return 
m_scanId2NbInfoOutArray[hd][vd]; } + const TUParameters* getTUPars ( const CompArea& area, const ComponentID compID ) const + { + return m_tuParameters[Log2(area.width)][Log2(area.height)][toChannelType(compID)]; + } + private: + void xInitScanArrays (); + void xUninitScanArrays (); + private: + bool m_scansInitialized; + NbInfoSbb* m_scanId2NbInfoSbbArray[ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ]; + NbInfoOut* m_scanId2NbInfoOutArray[ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ]; + TUParameters* m_tuParameters [ MAX_TU_SIZE_IDX ][ MAX_TU_SIZE_IDX ][ MAX_NUM_CH ]; + }; + + class RateEstimator + { + public: + RateEstimator () {} + ~RateEstimator() {} + void initCtx ( const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID, const FracBitsAccess& fracBitsAccess ); + + inline const BinFracBits *sigSbbFracBits() const { return m_sigSbbFracBits; } + inline const BinFracBits *sigFlagBits(unsigned stateId) const + { + return m_sigFracBits[std::max(((int) stateId) - 1, 0)]; + } + inline const CoeffFracBits *gtxFracBits() const { return m_gtxFracBits; } + inline int32_t lastOffset(unsigned scanIdx) const + { + return m_lastBitsX[m_scanId2Pos[scanIdx].x] + m_lastBitsY[m_scanId2Pos[scanIdx].y]; + } + + private: + void xSetLastCoeffOffset ( const FracBitsAccess& fracBitsAccess, const TUParameters& tuPars, const TransformUnit& tu, const ComponentID compID ); + void xSetSigSbbFracBits ( const FracBitsAccess& fracBitsAccess, ChannelType chType ); + void xSetSigFlagBits ( const FracBitsAccess& fracBitsAccess, ChannelType chType ); + void xSetGtxFlagBits ( const FracBitsAccess& fracBitsAccess, ChannelType chType ); + + public: + static const unsigned sm_numCtxSetsSig = 3; + static const unsigned sm_numCtxSetsGtx = 2; + static const unsigned sm_maxNumSigSbbCtx = 2; + static const unsigned sm_maxNumSigCtx = 12; + static const unsigned sm_maxNumGtxCtx = 21; + + private: + const ScanElement * m_scanId2Pos; + int32_t m_lastBitsX [ MAX_TB_SIZEY ]; + int32_t m_lastBitsY [ 
MAX_TB_SIZEY ]; + BinFracBits m_sigSbbFracBits [ sm_maxNumSigSbbCtx ]; + BinFracBits m_sigFracBits [ sm_numCtxSetsSig ][ sm_maxNumSigCtx ]; + CoeffFracBits m_gtxFracBits [ sm_maxNumGtxCtx ]; + }; + + /*================================================================================*/ + /*===== =====*/ + /*===== P R E - Q U A N T I Z E R =====*/ + /*===== =====*/ + /*================================================================================*/ + + struct PQData + { + TCoeff absLevel; + int64_t deltaDist; + }; + + class Quantizer + { + public: + Quantizer() {} + void init ( int dqThrVal ) { m_DqThrVal = dqThrVal; } + void dequantBlock ( const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, CoeffBuf& recCoeff, bool enableScalingLists, int* piDequantCoef ) const; + void initQuantBlock ( const TransformUnit& tu, const ComponentID compID, const QpParam& cQP, const double lambda, int gValue = -1 ); + bool preQuantCoeff ( const TCoeff absCoeff, PQData *pqData, int quanCoeff ) const; + TCoeff getLastThreshold() const { return m_thresLast; } + TCoeff getSSbbThreshold() const { return m_thresSSbb; } + + int64_t getQScale () const { return m_QScale; } + + // quantization + int m_DqThrVal; + int m_QShift; + int64_t m_QAdd; + int64_t m_QScale; + TCoeff m_maxQIdx; + TCoeff m_thresLast; + TCoeff m_thresSSbb; + // distortion normalization + int m_DistShift; + int64_t m_DistAdd; + int64_t m_DistStepAdd; + int64_t m_DistOrgFact; + }; + +#define RICEMAX 32 + extern const int32_t g_goRiceBits[4][RICEMAX]; +} + +class DepQuantImpl +{ +public: + virtual ~DepQuantImpl() {} + virtual void quant ( TransformUnit& tu, const CCoeffBuf& srcCoeff, const ComponentID compID, const QpParam& cQP, const double lambda, const Ctx& ctx, TCoeff& absSum, bool enableScalingLists, int* quantCoeff ) = 0; + virtual void dequant ( const TransformUnit& tu, CoeffBuf& recCoeff, const ComponentID compID, const QpParam& cQP, bool enableScalingLists, int* quantCoeff ); + virtual void init ( int 
dqTrVal ); + +protected: + DQIntern::Quantizer m_quant; +}; class DepQuant : public QuantRDOQ2 { @@ -60,13 +254,19 @@ class DepQuant : public QuantRDOQ2 DepQuant( const Quant* other, bool enc, bool useScalingLists ); virtual ~DepQuant(); - virtual void quant ( TransformUnit& tu, const ComponentID compID, const CCoeffBuf& pSrc, TCoeff &uiAbsSum, const QpParam& cQP, const Ctx& ctx ); + virtual void quant ( TransformUnit& tu, const ComponentID compID, const CCoeffBuf& pSrc, TCoeff &uiAbsSum, const QpParam& cQP, const Ctx& ctx ); virtual void dequant( const TransformUnit& tu, CoeffBuf& dstCoeff, const ComponentID compID, const QpParam& cQP ); virtual void init ( int rdoq = 0, bool useRDOQTS = false, int dqThrVal = 8 ); private: - void* p; +#ifdef TARGET_SIMD_X86 + void initDepQuantX86(); + template + void _initDepQuantX86(); +#endif + + DepQuantImpl* p = nullptr; }; } // namespace vvenc diff --git a/source/Lib/CommonLib/MCTF.cpp b/source/Lib/CommonLib/MCTF.cpp index c5619c836..004390adf 100644 --- a/source/Lib/CommonLib/MCTF.cpp +++ b/source/Lib/CommonLib/MCTF.cpp @@ -937,6 +937,34 @@ void MCTF::filter( const std::deque& picFifo, int filterIdx ) bilateralFilter( origBuf, srcFrameInfo, fltrBuf, overallStrength ); } } + if (m_encCfg->m_forceScc <= 0) + { + bool forceSCC = false; + if (pic->gopEntry->m_isStartOfGop) + { + forceSCC = true; + for (int j = 0; j < QPA_MAX_NOISE_LEVELS; j++) + { + if (pic->m_picShared->m_minNoiseLevels[j] < 255 && pic->m_picShared->m_minNoiseLevels[j]) + { + forceSCC = false; + break; + } + } + if (forceSCC) + { + for (int s = 0; s < mvErr.size(); s++) + { + if (int(mvErr[s]) == 0) + { + forceSCC = false; + break; + } + } + } + } + pic->m_picShared->m_forceSCC = forceSCC; + } if( !m_encCfg->m_blockImportanceMapping || !pic->useMCTF ) { diff --git a/source/Lib/CommonLib/Picture.cpp b/source/Lib/CommonLib/Picture.cpp index 25bdbc319..730863722 100644 --- a/source/Lib/CommonLib/Picture.cpp +++ b/source/Lib/CommonLib/Picture.cpp @@ -173,6 +173,7 
@@ Picture::Picture() , ctsValid ( false ) , isPreAnalysis ( false ) , m_picShared ( nullptr ) + , gopAdaptedQP ( 0 ) , isMeanQPLimited ( false ) , picInitialQP ( -1 ) , picInitialLambda ( -1.0 ) @@ -232,7 +233,7 @@ void Picture::reset() refCounter = 0; poc = -1; TLayer = std::numeric_limits::max(); - + gopAdaptedQP = 0; actualHeadBits = 0; actualTotalBits = 0; diff --git a/source/Lib/CommonLib/Picture.h b/source/Lib/CommonLib/Picture.h index 213082380..a6c052bdb 100644 --- a/source/Lib/CommonLib/Picture.h +++ b/source/Lib/CommonLib/Picture.h @@ -238,6 +238,7 @@ struct Picture : public UnitArea std::vector ctuQpaLambda; std::vector ctuAdaptedQP; + int gopAdaptedQP; // QP offset of GOP (delta relative to base QP) bool isMeanQPLimited; std::mutex wppMutex; int picInitialQP; diff --git a/source/Lib/CommonLib/StatCounter.cpp b/source/Lib/CommonLib/StatCounter.cpp index 1491a453a..3ef9cfbf1 100644 --- a/source/Lib/CommonLib/StatCounter.cpp +++ b/source/Lib/CommonLib/StatCounter.cpp @@ -238,7 +238,7 @@ std::ostream& StatCounters::report2D( std::ostream& os, const StatCounter2DSetgetCountersSet().size(); i++ ) { - std::cout << "Run-time of selected encoder stages across CTUs of all pictures " << "(" << ( i == 0 ? "Intra": "Inter" << ")" ) << std::endl; + std::cout << "Run-time of selected encoder stages across CTUs of all pictures " << "(" << ( i == 0 ? 
"Intra": "Inter" ) << ")" << std::endl; StatCounters::report2D( std::cout, tp->getCountersSet()[i], false, true, false, true, true, -1 ); if( i > 0 ) tp->getCountersSet()[0] += tp->getCountersSet()[i]; @@ -151,8 +149,6 @@ void timeProfilerResults( TProfiler* tp ) StatCounters::report2D( std::cout, tp->getCountersSet()[0], true, true, false, true, true, -1 ); } #endif - delete tp; - tp = nullptr; } #endif } diff --git a/source/Lib/CommonLib/TimeProfiler.h b/source/Lib/CommonLib/TimeProfiler.h index 25ec59168..c0a50e39d 100644 --- a/source/Lib/CommonLib/TimeProfiler.h +++ b/source/Lib/CommonLib/TimeProfiler.h @@ -132,12 +132,6 @@ class TimeProfiler time_point previous = clock::now(); STAGE m_eStage; const unsigned m_numStages = sizeof( stageNames ) / sizeof( stageNames[0] )/*P_STAGES + 1*/; - int m_iLevel; - int m_iExtData; - unsigned m_numBlkHor; - unsigned m_numBlkVer; - unsigned m_curWId; - unsigned m_curHId; public: const time_point start_time = previous; @@ -293,8 +287,6 @@ class TimeProfiler2D m_curX = x; m_curY = y; m_curZ = z; - //if( s == P_ALF ) - // printf( "prof=%d\n", m_id ); } TimeProfiler2D& operator+=( const TimeProfiler2D& other ) { @@ -392,7 +384,7 @@ class StageTimeProfiler2D #define PROFILER_EXT_ACCUM_AND_START_NEW_SET_(cond,p,s,t,l,x,y,w,h) PROF_EXT_ACCUM_AND_START_NEW_SET_COND(cond,p,s,w,h,t) #endif -#define PROFILER_EXT_UPDATE(p,s,t) PROF_EXT_UPDATE(p,s,t) +#define PROFILER_SCOPE_TOP_LEVEL_EXT2D(cond,p,s,cs) PROFILER_SCOPE_AND_STAGE_EXT2D_(cond,p,s,!(cs)->slice->isIntra(), (cs)->slice->TLayer, 0, 0, 0, 0) #define PROFILER_SCOPE_AND_STAGE_EXT2D(cond,p,s,cs,ch) PROFILER_SCOPE_AND_STAGE_EXT2D_(cond,p,s,!(cs)->slice->isIntra(), (cs)->slice->TLayer, BX_(cs,ch), BY_(cs,ch), BW_(cs,ch), BH_(cs,ch) ) #define PROFILER_EXT_ACCUM_AND_START_NEW_SET(cond,p,s,cs,ch ) PROFILER_EXT_ACCUM_AND_START_NEW_SET_(cond,p,s,!(cs)->slice->isIntra(), (cs)->slice->TLayer, BX_(cs,ch), BY_(cs,ch), BW_(cs,ch), BH_(cs,ch) ) #endif @@ -401,12 +393,15 @@ class 
StageTimeProfiler2D #define PROFILER_ACCUM_AND_START_NEW_SET(cond,p,s) (*(p))(s) #define PROFILER_EXT_ACCUM_AND_START_NEW_SET(cond,p,s,cs,ch) (*(p))(s) #define PROFILER_SCOPE_AND_STAGE(cond,p,s) PROFILER_SCOPE_AND_STAGE_(cond,p,s) +#define PROFILER_SCOPE_TOP_LEVEL_EXT(cond,p,s,cs) PROFILER_SCOPE_AND_STAGE_(cond,p,s) #define PROFILER_SCOPE_AND_STAGE_EXT(cond,p,s,cs,ch) PROFILER_SCOPE_AND_STAGE_(cond,p,s) #define PROFILER_EXT_UPDATE(p,s,t) typedef TimeProfiler TProfiler; #else //ENABLE_TIME_PROFILING_EXTENDED +#define PROFILER_EXT_UPDATE(p,s,t) PROF_EXT_UPDATE(p,s,t) #define PROFILER_ACCUM_AND_START_NEW_SET(cond,p,s) PROF_EXT_ACCUM_AND_START_NEW_SET_COND(cond,p,s,0,0,0) #define PROFILER_SCOPE_AND_STAGE(cond,p,s) +#define PROFILER_SCOPE_TOP_LEVEL_EXT(cond,p,s,cs) PROFILER_SCOPE_TOP_LEVEL_EXT2D(cond,p,s,cs) #define PROFILER_SCOPE_AND_STAGE_EXT(cond,p,s,cs,ch) PROFILER_SCOPE_AND_STAGE_EXT2D(cond,p,s,cs,ch) typedef TimeProfiler2D TProfiler; #endif @@ -427,6 +422,7 @@ void timeProfilerResults( TProfiler* tp ); #define PROFILER_EXT_ACCUM_AND_START_NEW_SET(cond,p,s,cs,ch) #define PROFILER_SCOPE_AND_STAGE(cond,p,s) #define PROFILER_SCOPE_AND_STAGE_EXT(cond,p,s,cs,ch) +#define PROFILER_SCOPE_TOP_LEVEL_EXT(cond,p,s,cs) #define PROFILER_EXT_UPDATE(p,s,t) #endif diff --git a/source/Lib/CommonLib/TypeDef.h b/source/Lib/CommonLib/TypeDef.h index e2b19969e..5ee3fd5f9 100644 --- a/source/Lib/CommonLib/TypeDef.h +++ b/source/Lib/CommonLib/TypeDef.h @@ -147,7 +147,7 @@ namespace vvenc { #if defined( TARGET_SIMD_X86 ) && !defined( REAL_TARGET_X86 ) -# define SIMD_EVERYWHERE_EXTENSION_LEVEL SSE42 +# define SIMD_EVERYWHERE_EXTENSION_LEVEL AVX2 #endif // End of SIMD optimizations diff --git a/source/Lib/CommonLib/x86/DepQuantX86.h b/source/Lib/CommonLib/x86/DepQuantX86.h new file mode 100644 index 000000000..2a8a494db --- /dev/null +++ b/source/Lib/CommonLib/x86/DepQuantX86.h @@ -0,0 +1,1630 @@ +/* ----------------------------------------------------------------------------- +The 
copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +#include "DepQuant.h" +#include "TrQuant.h" +#include "CodingStructure.h" +#include "UnitTools.h" +#ifdef TARGET_SIMD_X86 +# include "x86/CommonDefX86.h" +# include +#if defined( USE_SSE41 ) || !defined( REAL_TARGET_X86 ) +# include +#endif +#endif + +#include + +//! \ingroup CommonLib +//! \{ + +namespace vvenc { + +#if USE_SSE41 && defined( REAL_TARGET_X86 ) +#define _my_cmpgt_epi64( a, b ) simde_mm_cmpgt_epi64( a, b ) +#else +#define _my_cmpgt_epi64( a, b ) _mm_cmpgt_epi64( a, b ) +#endif + + +namespace DQIntern +{ + /*================================================================================*/ + /*===== =====*/ + /*===== T C Q S T A T E =====*/ + /*===== =====*/ + /*================================================================================*/ + + static constexpr int64_t rdCostInit = std::numeric_limits::max() >> 1; + + struct Decisions + { + int64_t rdCost[4]; + TCoeffSig absLevel[4]; + int8_t prevId[4]; + }; + + template + class State; + + struct StateMem + { + uint8_t tpl[64]; + uint8_t sum[64]; + uint8_t val[64]; + + struct + { + uint8_t sig[4]; + uint8_t cff[4]; + } ctx; + + int64_t rdCost[4]; + + int32_t sbbBits0[4]; + int32_t sbbBits1[4]; + + uint8_t numSig[4]; + int8_t refSbbCtxId[4]; + + int32_t cffBits1[RateEstimator::sm_maxNumGtxCtx + 3]; + int remRegBins[4]; + + int cffBitsCtxOffset; + bool anyRemRegBinsLt4; + unsigned 
effWidth; + unsigned effHeight; + int initRemRegBins; + }; + + struct SbbCtx + { + uint8_t* sbbFlags; + uint8_t* levels; + }; + + template + class CommonCtx + { + public: + CommonCtx() : m_currSbbCtx( m_allSbbCtx ), m_prevSbbCtx( m_currSbbCtx + 4 ) {} + + inline void swap() { std::swap(m_currSbbCtx, m_prevSbbCtx); } + + inline void reset( const TUParameters& tuPars, const RateEstimator &rateEst) + { + m_nbInfo = tuPars.m_scanId2NbInfoOut; + ::memcpy( m_sbbFlagBits, rateEst.sigSbbFracBits(), 2*sizeof(BinFracBits) ); + const int numSbb = tuPars.m_numSbb; + const int chunkSize = numSbb + tuPars.m_numCoeff; + uint8_t* nextMem = m_memory; + for( int k = 0; k < 8; k++, nextMem += chunkSize ) + { + m_allSbbCtx[k].sbbFlags = nextMem; + m_allSbbCtx[k].levels = nextMem + numSbb; + } + } + + inline void update( const ScanInfo &scanInfo, const int prevId, int stateId, StateMem &curr ) + { + uint8_t* sbbFlags = m_currSbbCtx[stateId].sbbFlags; + uint8_t* levels = m_currSbbCtx[stateId].levels; + uint16_t maxDist = m_nbInfo[ scanInfo.scanIdx - 1 ].maxDist; + uint16_t sbbSize = scanInfo.sbbSize; + std::size_t setCpSize = ( maxDist > sbbSize ? maxDist - sbbSize : 0 ) * sizeof(uint8_t); + if( prevId >= 0 ) + { + ::memcpy( sbbFlags, m_prevSbbCtx[prevId].sbbFlags, scanInfo.numSbb * sizeof( uint8_t ) ); + ::memcpy( levels + scanInfo.scanIdx + sbbSize, m_prevSbbCtx[prevId].levels + scanInfo.scanIdx + sbbSize, setCpSize ); + } + else + { + ::memset( sbbFlags, 0, scanInfo.numSbb * sizeof( uint8_t ) ); + ::memset( levels + scanInfo.scanIdx + sbbSize, 0, setCpSize ); + } + sbbFlags[scanInfo.sbbPos] = !!curr.numSig[stateId]; + + const int sigNSbb = ( ( scanInfo.nextSbbRight ? sbbFlags[scanInfo.nextSbbRight] : false ) || ( scanInfo.nextSbbBelow ? sbbFlags[scanInfo.nextSbbBelow] : false ) ? 
1 : 0 ); + curr.refSbbCtxId[stateId] = stateId; + const BinFracBits sbbBits = m_sbbFlagBits[sigNSbb]; + + curr.sbbBits0[stateId] = sbbBits.intBits[0]; + curr.sbbBits1[stateId] = sbbBits.intBits[1]; + + if( sigNSbb || ( ( scanInfo.nextSbbRight && scanInfo.nextSbbBelow ) ? sbbFlags[scanInfo.nextSbbBelow + 1] : false ) ) + { + const int scanBeg = scanInfo.scanIdx - scanInfo.sbbSize; + const NbInfoOut* nbOut = m_nbInfo + scanBeg; + const uint8_t* absLevels = levels + scanBeg; + + for( int id = 0; id < scanInfo.sbbSize; id++, nbOut++ ) + { + const int idAddr = ( id << 2 ) + stateId; + + if( nbOut->num ) + { + TCoeff sumAbs = 0, sumAbs1 = 0, sumNum = 0; +#define UPDATE(k) {TCoeff t=absLevels[nbOut->outPos[k]]; sumAbs+=t; sumAbs1+=std::min(4+(t&1),t); sumNum+=!!t; } + switch( nbOut->num ) + { + default: + case 5: + UPDATE(4); + case 4: + UPDATE(3); + case 3: + UPDATE(2); + case 2: + UPDATE(1); + case 1: + UPDATE(0); + } +#undef UPDATE + curr.tpl[idAddr] = ( sumNum << 5 ) | sumAbs1; + curr.sum[idAddr] = ( uint8_t ) std::min( 255, sumAbs ); + } + } + } + } + + inline void updateAllLvls( const ScanInfo &scanInfo, const StateMem &curr ) + { + uint8_t *levels0 = m_currSbbCtx[0].levels + scanInfo.scanIdx; + uint8_t *levels1 = m_currSbbCtx[1].levels + scanInfo.scanIdx; + uint8_t *levels2 = m_currSbbCtx[2].levels + scanInfo.scanIdx; + uint8_t *levels3 = m_currSbbCtx[3].levels + scanInfo.scanIdx; + + const int regSize = 16; + const int ctxSize = scanInfo.sbbSize << 2; + + const __m128i vshuf0 = _mm_setr_epi8( 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 ); + const __m128i vshuf1 = _mm_setr_epi8( 1, 5, 9, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 ); + const __m128i vshuf2 = _mm_setr_epi8( 2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 ); + const __m128i vshuf3 = _mm_setr_epi8( 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 ); + + for( int i = 0, j = 0; i < ctxSize; i += regSize, j += 4 ) + { + __m128i in = _mm_loadu_si128( ( const 
__m128i* ) &curr.val[i] ); + + _mm_storeu_si32( &levels0[j], _mm_shuffle_epi8( in, vshuf0 ) ); + _mm_storeu_si32( &levels1[j], _mm_shuffle_epi8( in, vshuf1 ) ); + _mm_storeu_si32( &levels2[j], _mm_shuffle_epi8( in, vshuf2 ) ); + _mm_storeu_si32( &levels3[j], _mm_shuffle_epi8( in, vshuf3 ) ); + } + } + + private: + const NbInfoOut* m_nbInfo; + BinFracBits m_sbbFlagBits[2]; + SbbCtx m_allSbbCtx [8]; + SbbCtx* m_currSbbCtx; + SbbCtx* m_prevSbbCtx; + uint8_t m_memory[ 8 * ( MAX_TB_SIZEY * MAX_TB_SIZEY + MLS_GRP_NUM ) ]; + }; + + template + class State + { + friend class CommonCtx; + public: + State( const RateEstimator& rateEst, CommonCtx& commonCtx, const int stateId ) + : m_stateId ( stateId ) + , m_sigFracBitsArray( rateEst.sigFlagBits(stateId) ) + , m_gtxFracBitsArray( rateEst.gtxFracBits() ) + , m_commonCtx ( commonCtx ) + { + } + + static inline void updateStates( const ScanInfo &scanInfo, const Decisions &decisions, StateMem &prev, StateMem &curr ) + { + int8_t s[4] = { 0 }, t[4] = { 0 }, l[4] = { 0 }; + +#if 1 + __m128i v254_4 = _mm_setr_epi16( 254, 254, 254, 254, 4, 4, 4, 4 ); + __m128i v01 = _mm_setr_epi16( 1, 1, 1, 1, 1, 1, 1, 1 ); + __m128i v032 = _mm_setr_epi8 ( 0, 0, 0, 0, 32, 32, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0 ); + __m128i vn1 = _mm_set1_epi8 ( -1 ); + + static_assert( sizeof( curr.rdCost ) == sizeof( decisions.rdCost ), "Non-matching array size" ); + memcpy( curr.rdCost, decisions.rdCost, sizeof( decisions.rdCost ) ); + + // in signalling, the coeffs are always max 16 bit! 
+ __m128i v = _mm_loadu_si64( decisions.absLevel ); + v = _mm_unpacklo_epi64( v, v ); + __m128i p = _mm_loadu_si32( decisions.prevId ); + _mm_storeu_si32( s, p ); // store previous state indexes + p = _mm_shuffle_epi32( p, 0 ); + __m128i n2 = _mm_cmplt_epi8( p, vn1 ); + __m128i a_1 = _mm_and_si128( v, v01 ); + __m128i a_m = _mm_min_epi16( v, _mm_add_epi16( v254_4, a_1 ) ); + a_m = _mm_packs_epi16( a_m, vn1 ); + a_m = _mm_or_si128 ( a_m, _mm_sign_epi8( v032, a_m ) ); + a_m = _mm_andnot_si128( n2, a_m ); + _mm_storeu_si32( l, a_m ); // store abs value + a_m = _mm_shuffle_epi32( a_m, 1 ); + _mm_storeu_si32( t, a_m ); // store store capped abs value +#else + for( int i = 0; i < 4; ++i ) + { + s[ i ] = decisions[ i ].prevId; + int min4_or_5 = std::min( 4 + ( decisions[ i ].absLevel & 1 ), decisions[ i ].absLevel ); + t[ i ] = decisions[ i ].prevId > -2 ? min4_or_5 : 0; + t[ i ] |= t[i] ? 32 : 0; + l[ i ] = decisions[ i ].prevId > -2 ? std::min( decisions[i].absLevel, 255 ) : 0; + //all_above_minus_two &= decision[ i ].prevId > -2; + } +#endif + + { + const int ctxSize = 16 * 4; + const int regSize = 16; + + __m128i vshuf = _mm_loadu_si32 ( s ); + vshuf = _mm_shuffle_epi32( vshuf, 0 ); + __m128i vshufmask = _mm_cmplt_epi8 ( vshuf, _mm_setzero_si128() ); + vshuf = _mm_add_epi8 ( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) ); + vshuf = _mm_blendv_epi8( vshuf, _mm_set1_epi8( -1 ), vshufmask ); + + for( int i = 0; i < ctxSize; i += regSize ) + { + __m128i vtpl = _mm_loadu_si128( ( const __m128i* ) &prev.tpl[i] ); + vtpl = _mm_shuffle_epi8( vtpl, vshuf ); + _mm_storeu_si128( ( __m128i* ) &curr.tpl[i], vtpl ); + + __m128i vval = _mm_loadu_si128( ( const __m128i* ) &prev.val[i] ); + vval = _mm_shuffle_epi8( vval, vshuf ); + _mm_storeu_si128( ( __m128i* ) &curr.val[i], vval ); + + __m128i vsum = _mm_loadu_si128( ( const __m128i* ) &prev.sum[i] ); + vsum = _mm_shuffle_epi8( vsum, vshuf ); + _mm_storeu_si128( ( __m128i* ) &curr.sum[i], vsum ); + } + 
+ __m128i numSig = _mm_loadu_si32( prev.numSig ); + numSig = _mm_shuffle_epi8( numSig, vshuf ); + __m128i lvls = _mm_loadu_si32( l ); + lvls = _mm_cmpgt_epi8( lvls, _mm_setzero_si128() ); + numSig = _mm_subs_epi8( numSig, lvls ); + _mm_storeu_si32( curr.numSig, numSig ); + + __m128i rsc = _mm_loadu_si32( prev.refSbbCtxId ); + rsc = _mm_shuffle_epi8( rsc, vshuf ); + rsc = _mm_blendv_epi8( rsc, vshuf, vshuf ); + _mm_storeu_si32( curr.refSbbCtxId, rsc ); + + vshuf = _mm_cvtepi8_epi32( vshuf ); + vshuf = _mm_shuffle_epi8( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) ); + vshuf = _mm_slli_epi32( vshuf, 2 ); + vshuf = _mm_add_epi8( vshuf, + _mm_blendv_epi8( _mm_setr_epi8( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ), + _mm_setzero_si128(), + vshuf ) ); + + __m128i rrb = _mm_loadu_si128( ( const __m128i* ) prev.remRegBins ); + rrb = _mm_shuffle_epi8( rrb, vshuf ); + rrb = _mm_sub_epi32( rrb, _mm_blendv_epi8( _mm_set1_epi32( 1 ), _mm_setzero_si128(), vshuf ) ); + __m128i mlvl = _mm_loadu_si32( l ); + rrb = _mm_blendv_epi8( rrb, _mm_set1_epi32( curr.initRemRegBins ), vshuf ); + + __m128i mbins = _mm_cvtepi8_epi32( mlvl ); + __m128i madd = _mm_cmpeq_epi32( mbins, _mm_set1_epi32( 1 ) ); + __m128i mmore = _mm_and_si128( _mm_cmpgt_epi32( mbins, _mm_set1_epi32( 1 ) ), _mm_set1_epi32( 3 ) ); + madd = _mm_sub_epi32( madd, mmore ); + madd = _mm_blendv_epi8( madd, _mm_setzero_si128(), _mm_cmplt_epi32(rrb, _mm_set1_epi32(4))); + rrb = _mm_add_epi32( rrb, madd ); + _mm_storeu_si128( ( __m128i* ) curr.remRegBins, rrb ); + rrb = _mm_cmplt_epi32( rrb, _mm_set1_epi32( 4 ) ); + + curr.anyRemRegBinsLt4 = !_mm_test_all_zeros( rrb, rrb ); + + __m128i lvl1 = _mm_loadu_si32( l ); + + if( scanInfo.currNbInfoSbb.numInv ) + { + //auto adds8 = []( uint8_t a, uint8_t b ) + //{ + // uint8_t c = a + b; + // if( c < a ) c = -1; + // return c; + //}; + // + //auto update_deps_scalar = [&]( int k ) + //{ + // for( int i = 0; i < 4; i++ ) + // { + // int addr = ( 
scanInfo.currNbInfoSbb.invInPos[k] << 2 ) + i; + // curr.sum[addr] = adds8( curr.sum[addr], decisions[i].absLevel ); + // } + //}; + + auto update_deps_vec = [&]( int k ) + { + int addr = scanInfo.currNbInfoSbb.invInPos[k] << 2; + __m128i msum = _mm_loadu_si32( &curr.sum[addr] ); + msum = _mm_adds_epu8( msum, mlvl ); + _mm_storeu_si32( &curr.sum[addr], msum ); + }; + + switch( scanInfo.currNbInfoSbb.numInv ) + { + default: + case 5: + update_deps_vec( 4 ); + case 4: + update_deps_vec( 3 ); + case 3: + update_deps_vec( 2 ); + case 2: + update_deps_vec( 1 ); + case 1: + update_deps_vec( 0 ); + } + } + + int addr = ( scanInfo.insidePos << 2 ); + _mm_storeu_si32( &curr.val[addr], lvl1 ); + } + + { + __m128i tpl1 = _mm_loadu_si32( t ); + + auto update_deps = [&]( int k ) + { + int addr = scanInfo.currNbInfoSbb.invInPos[k] << 2; + __m128i tpl = _mm_loadu_si32( &curr.tpl[addr] ); + tpl = _mm_add_epi8( tpl, tpl1 ); + _mm_storeu_si32( &curr.tpl[addr], tpl ); + }; + + switch( scanInfo.currNbInfoSbb.numInv ) + { + default: + case 5: + update_deps( 4 ); + case 4: + update_deps( 3 ); + case 3: + update_deps( 2 ); + case 2: + update_deps( 1 ); + case 1: + update_deps( 0 ); + } + } + + { + __m128i ones = _mm_set1_epi32( 1 ); + __m128i tplAcc = _mm_loadu_si128( ( __m128i * ) &curr.tpl[ ( scanInfo.nextInsidePos << 2 ) ] ); + tplAcc = _mm_cvtepu8_epi32( tplAcc ); + + __m128i sumAbs1 = _mm_and_si128 ( tplAcc, _mm_set1_epi32( 31 ) ); + __m128i sumNum = _mm_srli_epi32( tplAcc, 5 ); + __m128i sumGt1 = _mm_sub_epi32 ( sumAbs1, sumNum ); + sumGt1 = _mm_min_epi32( sumGt1, _mm_set1_epi32( 4 ) ); + sumGt1 = _mm_add_epi32( _mm_set1_epi32( scanInfo.gtxCtxOffsetNext ), sumGt1 ); + + sumAbs1 = _mm_add_epi32( sumAbs1, ones ); + sumAbs1 = _mm_srai_epi32( sumAbs1, 1 ); + sumAbs1 = _mm_min_epi32( sumAbs1, _mm_set1_epi32( 3 ) ); + + sumAbs1 = _mm_add_epi32( _mm_set1_epi32( scanInfo.sigCtxOffsetNext ), sumAbs1 ); + sumAbs1 = _mm_packs_epi32( sumAbs1, sumAbs1 ); + sumAbs1 = _mm_packs_epi16( sumAbs1, 
sumAbs1 ); + _mm_storeu_si32( curr.ctx.sig, sumAbs1 ); + + sumGt1 = _mm_packs_epi32( sumGt1, sumGt1 ); + sumGt1 = _mm_packs_epi16( sumGt1, sumGt1 ); + _mm_storeu_si32( curr.ctx.cff, sumGt1 ); + + curr.cffBitsCtxOffset = scanInfo.gtxCtxOffsetNext; + } + } + + static inline void updateStatesEOS(const ScanInfo &scanInfo, const Decisions &decisions, StateMem& prev, const StateMem& skip, StateMem& curr, CommonCtx &commonCtx) + { + bool rem_reg_all_gte_4 = true; + + int8_t s[4] = { 0 }, l[4] = { 0 }; + + for( int i = 0; i < 4; ++i ) + { + s[i] = decisions.prevId[i] >= 4 ? -2 : decisions.prevId[i]; + l[i] = s[i] > -2 ? std::min( decisions.absLevel[i], 254 + ( decisions.absLevel[i] & 1 ) ) : 0; + curr.rdCost[i] = decisions.rdCost[i]; + } + + { + const int ctxSize = 16 * 4; + const int regSize = 16; + + __m128i vshuf = _mm_loadu_si32( s ); + vshuf = _mm_shuffle_epi32( vshuf, 0 ); + __m128i vshufmask = _mm_cmplt_epi8 ( vshuf, _mm_setzero_si128() ); + vshuf = _mm_add_epi8 ( vshuf, _mm_setr_epi8( 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 ) ); + vshuf = _mm_blendv_epi8( vshuf, _mm_set1_epi8( -1 ), vshufmask ); + + for( int i = 0; i < ctxSize; i += regSize ) + { + __m128i vval = _mm_loadu_si128( ( const __m128i* ) &prev.val[i] ); + vval = _mm_shuffle_epi8( vval, vshuf ); + _mm_storeu_si128( ( __m128i* ) &curr.val[i], vval ); + } + + __m128i numSig = _mm_loadu_si32( prev.numSig ); + numSig = _mm_shuffle_epi8( numSig, vshuf ); + __m128i lvls = _mm_loadu_si32( l ); + lvls = _mm_cmpgt_epi8( lvls, _mm_setzero_si128() ); + numSig = _mm_subs_epi8( numSig, lvls ); + _mm_storeu_si32( curr.numSig, numSig ); + } + + { + __m128i lvl1 = _mm_loadu_si32( l ); + int addr = ( scanInfo.insidePos << 2 ); + _mm_storeu_si32( &curr.val[addr], lvl1 ); + } + + commonCtx.updateAllLvls( scanInfo, curr ); + + memset( curr.val, 0, sizeof( curr.val ) ); + memset( curr.tpl, 0, sizeof( curr.tpl ) ); + memset( curr.sum, 0, sizeof( curr.sum ) ); + + for( int i = 0; i < 4; i++ ) + { + int prevId = 
decisions.prevId[i]; + int level = decisions.absLevel[i]; + + if( prevId > -2 ) + { + int remRegBins = 0; + + if( prevId >= 4 ) + { + CHECKD( level != 0, "cannot happen" ); + remRegBins = skip.remRegBins[prevId - 4]; + } + else if( prevId >= 0 ) + { + remRegBins = prev.remRegBins[prevId] - 1; + if( remRegBins >= 4 ) + { + remRegBins -= ( level < 2 ? level : 3 ); + } + } + else + { + remRegBins = curr.initRemRegBins; + if( remRegBins >= 4 ) + { + remRegBins -= ( level < 2 ? level : 3 ); + } + } + + curr.remRegBins[i] = remRegBins; + + const int refId = prevId < 0 ? -1 : ( prevId < 4 ? prev.refSbbCtxId[prevId] : prevId - 4 ); + commonCtx.update( scanInfo, refId, i, curr ); + + rem_reg_all_gte_4 &= remRegBins >= 4; + } + } + + curr.anyRemRegBinsLt4 = !rem_reg_all_gte_4; + memset( curr.numSig, 0, sizeof( curr.numSig ) ); + + { + __m128i ones = _mm_set1_epi32( 1 ); + __m128i tplAcc = _mm_loadu_si128( ( __m128i * ) &curr.tpl[ ( scanInfo.nextInsidePos << 2 ) ] ); + tplAcc = _mm_cvtepu8_epi32( tplAcc ); + + __m128i sumAbs1 = _mm_and_si128 ( tplAcc, _mm_set1_epi32( 31 ) ); + __m128i sumNum = _mm_srli_epi32( tplAcc, 5 ); + __m128i sumGt1 = _mm_sub_epi32 ( sumAbs1, sumNum ); + sumGt1 = _mm_min_epi32( sumGt1, _mm_set1_epi32( 4 ) ); + sumGt1 = _mm_add_epi32( _mm_set1_epi32( scanInfo.gtxCtxOffsetNext ), sumGt1 ); + + sumAbs1 = _mm_add_epi32( sumAbs1, ones ); + sumAbs1 = _mm_srai_epi32( sumAbs1, 1 ); + sumAbs1 = _mm_min_epi32( sumAbs1, _mm_set1_epi32( 3 ) ); + + sumAbs1 = _mm_add_epi32( _mm_set1_epi32( scanInfo.sigCtxOffsetNext ), sumAbs1 ); + sumAbs1 = _mm_packs_epi32( sumAbs1, sumAbs1 ); + sumAbs1 = _mm_packs_epi16( sumAbs1, sumAbs1 ); + _mm_storeu_si32( curr.ctx.sig, sumAbs1 ); + + sumGt1 = _mm_packs_epi32( sumGt1, sumGt1 ); + sumGt1 = _mm_packs_epi16( sumGt1, sumGt1 ); + _mm_storeu_si32( curr.ctx.cff, sumGt1 ); + + curr.cffBitsCtxOffset = scanInfo.gtxCtxOffsetNext; + } + } + + inline void init( StateMem &state ) + { + state.rdCost [m_stateId] = rdCostInit; + 
state.ctx.cff[m_stateId] = 0; + state.ctx.sig[m_stateId] = 0; + state.numSig [m_stateId] = 0; + state.refSbbCtxId[m_stateId] + = -1; + state.remRegBins[m_stateId] + = 4; + state.cffBitsCtxOffset = 0; + m_goRicePar = 0; + m_goRiceZero = 0; + } + + void checkRdCosts( const ScanPosType spt, const PQData& pqDataA, const PQData& pqDataB, Decisions& decisions, int idxAZ, int idxB, const StateMem& state ) const + { + const int32_t* goRiceTab = g_goRiceBits[m_goRicePar]; + int64_t rdCostA = state.rdCost[m_stateId] + pqDataA.deltaDist; + int64_t rdCostB = state.rdCost[m_stateId] + pqDataB.deltaDist; + int64_t rdCostZ = state.rdCost[m_stateId]; + + if( state.remRegBins[m_stateId] >= 4 ) + { + const CoeffFracBits &cffBits = m_gtxFracBitsArray[state.ctx.cff[m_stateId]]; + const BinFracBits sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]]; + + if( pqDataA.absLevel < 4 ) + rdCostA += cffBits.bits[ pqDataA.absLevel ]; + else + { + const unsigned value = ( pqDataA.absLevel - 4 ) >> 1; + rdCostA += cffBits.bits[ pqDataA.absLevel - ( value << 1 ) ] + goRiceTab[ std::min( value, RICEMAX - 1 ) ]; + } + + if( pqDataB.absLevel < 4 ) + rdCostB += cffBits.bits[ pqDataB.absLevel ]; + else + { + const unsigned value = ( pqDataB.absLevel - 4 ) >> 1; + rdCostB += cffBits.bits[ pqDataB.absLevel - ( value << 1 ) ] + goRiceTab[std::min( value, RICEMAX - 1 )]; + } + + if( spt == SCAN_ISCSBB ) + { + rdCostA += sigBits.intBits[ 1 ]; + rdCostB += sigBits.intBits[ 1 ]; + rdCostZ += sigBits.intBits[ 0 ]; + } + else if( spt == SCAN_SOCSBB ) + { + rdCostA += state.sbbBits1[m_stateId] + sigBits.intBits[ 1 ]; + rdCostB += state.sbbBits1[m_stateId] + sigBits.intBits[ 1 ]; + rdCostZ += state.sbbBits1[m_stateId] + sigBits.intBits[ 0 ]; + } + else if( state.numSig[m_stateId] ) + { + rdCostA += sigBits.intBits[ 1 ]; + rdCostB += sigBits.intBits[ 1 ]; + rdCostZ += sigBits.intBits[ 0 ]; + } + else + { + rdCostZ = rdCostInit; + } + } + else + { + rdCostA += ( 1 << SCALE_BITS ) + goRiceTab[ pqDataA.absLevel 
<= m_goRiceZero ? pqDataA.absLevel - 1 : std::min( pqDataA.absLevel, RICEMAX - 1 ) ]; + rdCostB += ( 1 << SCALE_BITS ) + goRiceTab[ pqDataB.absLevel <= m_goRiceZero ? pqDataB.absLevel - 1 : std::min( pqDataB.absLevel, RICEMAX - 1 ) ]; + rdCostZ += goRiceTab[ m_goRiceZero ]; + } + + if( rdCostA < rdCostZ && rdCostA < decisions.rdCost[idxAZ] ) + { + decisions.rdCost [idxAZ] = rdCostA; + decisions.absLevel[idxAZ] = pqDataA.absLevel; + decisions.prevId [idxAZ] = m_stateId; + } + else if( rdCostZ < decisions.rdCost[idxAZ] ) + { + decisions.rdCost [idxAZ] = rdCostZ; + decisions.absLevel[idxAZ] = 0; + decisions.prevId [idxAZ] = m_stateId; + } + + if( rdCostB < decisions.rdCost[idxB] ) + { + decisions.rdCost [idxB] = rdCostB; + decisions.absLevel[idxB] = pqDataB.absLevel; + decisions.prevId [idxB] = m_stateId; + } + } + + // has to be called as a first check, assumes no decision has been made yet + static void checkAllRdCosts( const ScanPosType spt, State* states, const PQData* pqData, Decisions& decisions, const StateMem& state ) + { + // State mapping + // decision 0: either A from 0 (pq0), or B from 1 (pq2), or 0 from 0 + // decision 1: either A from 2 (pq3), or B from 3 (pq1), or 0 from 2 + // decision 2: either A from 1 (pq0), or B from 0 (pq2), or 0 from 1 + // decision 3: either A from 3 (pq3), or B from 2 (pq1), or 0 from 3 + + __m128i mrd01 = _mm_loadu_si128( ( const __m128i* ) &state.rdCost[0] ); + __m128i mrd23 = _mm_loadu_si128( ( const __m128i* ) &state.rdCost[2] ); + + //int64_t rdCostA = state.rdCost[m_stateId] + pqDataA.deltaDist; + //int64_t rdCostB = state.rdCost[m_stateId] + pqDataB.deltaDist; + //int64_t rdCostZ = state.rdCost[m_stateId]; + __m128i rdCostZ01 = _mm_unpacklo_epi64( mrd01, mrd23 ); + __m128i rdCostZ23 = _mm_unpackhi_epi64( mrd01, mrd23 ); + __m128i deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[2].deltaDist ), _mm_loadu_si64( &pqData[1].deltaDist ) ); + __m128i rdCostB01 = _mm_add_epi64( rdCostZ23, deltaDist ); + __m128i rdCostB23 
= _mm_add_epi64( rdCostZ01, deltaDist ); + deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[0].deltaDist ), _mm_loadu_si64( &pqData[3].deltaDist ) ); + __m128i rdCostA01 = _mm_add_epi64( rdCostZ01, deltaDist ); + __m128i rdCostA23 = _mm_add_epi64( rdCostZ23, deltaDist ); + + //const CoeffFracBits &cffBits = m_gtxFracBitsArray[state.ctx.cff[m_stateId]]; + //const BinFracBits sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]]; + // + //rdCostA += cffBits.bits[ pqDataA.absLevel ]; + //rdCostB += cffBits.bits[ pqDataB.absLevel ]; + __m128i sgbts02 = _mm_unpacklo_epi64( _mm_loadu_si64( &states[0].m_sigFracBitsArray[state.ctx.sig[0]] ), + _mm_loadu_si64( &states[2].m_sigFracBitsArray[state.ctx.sig[2]] ) ); + __m128i sgbts13 = _mm_unpacklo_epi64( _mm_loadu_si64( &states[1].m_sigFracBitsArray[state.ctx.sig[1]] ), + _mm_loadu_si64( &states[3].m_sigFracBitsArray[state.ctx.sig[3]] ) ); + + { + __m128i sgbts02_0 = _mm_shuffle_epi32( sgbts02, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) ); + __m128i sgbts02_1 = _mm_shuffle_epi32( sgbts02, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); + __m128i sgbts13_0 = _mm_shuffle_epi32( sgbts13, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) ); + __m128i sgbts13_1 = _mm_shuffle_epi32( sgbts13, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); + + sgbts02 = _mm_unpacklo_epi64( sgbts02_0, sgbts02_1 ); + sgbts13 = _mm_unpacklo_epi64( sgbts13_0, sgbts13_1 ); + } + + { + // coeff context is indepndent of state + auto &base = states->m_gtxFracBitsArray; + + int32_t cffBitsArr[4] = + { + base[state.ctx.cff[1]].bits[pqData[2].absLevel], + base[state.ctx.cff[3]].bits[pqData[1].absLevel], + base[state.ctx.cff[0]].bits[pqData[2].absLevel], + base[state.ctx.cff[2]].bits[pqData[1].absLevel], + }; + + __m128i cffBits = _mm_loadu_si128( ( const __m128i* ) cffBitsArr ); + __m128i add = _mm_cvtepi32_epi64( cffBits ); + rdCostB01 = _mm_add_epi64( rdCostB01, add ); + add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) ); + rdCostB23 = _mm_add_epi64( 
rdCostB23, add ); + } + + { + // coeff context is indepndent of state + auto &base = states->m_gtxFracBitsArray; + + int32_t cffBitsArr[4] = + { + base[state.ctx.cff[0]].bits[pqData[0].absLevel], + base[state.ctx.cff[2]].bits[pqData[3].absLevel], + base[state.ctx.cff[1]].bits[pqData[0].absLevel], + base[state.ctx.cff[3]].bits[pqData[3].absLevel], + }; + + __m128i cffBits = _mm_loadu_si128( ( const __m128i* ) cffBitsArr ); + __m128i add = _mm_cvtepi32_epi64( cffBits ); + rdCostA01 = _mm_add_epi64( rdCostA01, add ); + add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) ); + rdCostA23 = _mm_add_epi64( rdCostA23, add ); + } + + if( spt == SCAN_ISCSBB ) + { + // rdCostZ += sigBits.intBits[ 0 ]; + rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) ); + rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) ); + + sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 ); + sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 ); + + // rdCostB += sigBits.intBits[ 1 ]; + rdCostB01 = _mm_add_epi64( rdCostB01, _mm_cvtepi32_epi64( sgbts13 ) ); + rdCostB23 = _mm_add_epi64( rdCostB23, _mm_cvtepi32_epi64( sgbts02 ) ); + + // rdCostA += sigBits.intBits[ 1 ]; + rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts02 ) ); + rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts13 ) ); + } + else if( spt == SCAN_SOCSBB ) + { + // rdCostA += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 1 ]; + // rdCostB += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 1 ]; + // rdCostZ += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 0 ]; + __m128i sbbBits = _mm_loadu_si128( ( const __m128i* ) state.sbbBits1 ); + sbbBits = _mm_shuffle_epi32( sbbBits, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); + + rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) ); + rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) ); + + __m128i add = _mm_cvtepi32_epi64( sbbBits ); + rdCostB23 = _mm_add_epi64( rdCostB23, add ); + rdCostA01 
= _mm_add_epi64( rdCostA01, add ); + rdCostZ01 = _mm_add_epi64( rdCostZ01, add ); + add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( sbbBits, sbbBits ) ); + rdCostB01 = _mm_add_epi64( rdCostB01, add ); + rdCostA23 = _mm_add_epi64( rdCostA23, add ); + rdCostZ23 = _mm_add_epi64( rdCostZ23, add ); + + sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 ); + sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 ); + rdCostB01 = _mm_add_epi64( rdCostB01, _mm_cvtepi32_epi64( sgbts13 ) ); + rdCostB23 = _mm_add_epi64( rdCostB23, _mm_cvtepi32_epi64( sgbts02 ) ); + + rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts02 ) ); + rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts13 ) ); + } + else + { + //else if( state.numSig[m_stateId] ) + //{ + // rdCostA += sigBits.intBits[ 1 ]; + // rdCostB += sigBits.intBits[ 1 ]; + // rdCostZ += sigBits.intBits[ 0 ]; + //} + //else + //{ + // rdCostZ = decisionA.rdCost; + //} + + __m128i numSig = _mm_loadu_si32( state.numSig ); + + rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) ); + rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) ); + + __m128i mask13 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3 ) ); + mask13 = _mm_cmpgt_epi8( mask13, _mm_setzero_si128() ); + __m128i mask02 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2 ) ); + mask02 = _mm_cmpgt_epi8( mask02, _mm_setzero_si128() ); + + sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 ); + sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 ); + + rdCostB01 = _mm_add_epi64( rdCostB01, _mm_and_si128( mask13, _mm_cvtepi32_epi64( sgbts13 ) ) ); + rdCostB23 = _mm_add_epi64( rdCostB23, _mm_and_si128( mask02, _mm_cvtepi32_epi64( sgbts02 ) ) ); + + rdCostA01 = _mm_add_epi64( rdCostA01, _mm_and_si128( mask02, _mm_cvtepi32_epi64( sgbts02 ) ) ); + rdCostA23 = _mm_add_epi64( rdCostA23, _mm_and_si128( mask13, _mm_cvtepi32_epi64( sgbts13 ) ) ); + + __m128i rdMax = 
_mm_loadu_si64( &rdCostInit ); + rdMax = _mm_unpacklo_epi64( rdMax, rdMax ); + + rdCostZ01 = _mm_blendv_epi8( rdMax, rdCostZ01, mask02 ); + rdCostZ23 = _mm_blendv_epi8( rdMax, rdCostZ23, mask13 ); + } + + // decision 0: either A from 0 (pq0), or B from 1 (pq2), or 0 from 0 + // decision 1: either A from 2 (pq3), or B from 3 (pq1), or 0 from 2 + // decision 2: either A from 1 (pq0), or B from 0 (pq2), or 0 from 1 + // decision 3: either A from 3 (pq3), or B from 2 (pq1), or 0 from 3 + // Z0, or A0, or B0 + // Z1, or A1, or B1 + // B2, or Z2, or A2 + // B3, or Z3, or A3 + + __m128i rdBest01 = rdCostZ01; + __m128i rdBest23 = rdCostB23; + + __m128i valBest = _mm_setr_epi32( 0, 0, pqData[2].absLevel, pqData[1].absLevel ); + __m128i valCand = _mm_setr_epi32( pqData[0].absLevel, pqData[3].absLevel, 0, 0 ); + + __m128i idxBest = _mm_setr_epi32( 0, 2, 0, 2 ); + __m128i idxCand = _mm_setr_epi32( 0, 2, 1, 3 ); + + __m128i chng01 = _my_cmpgt_epi64( rdBest01, rdCostA01 ); + __m128i chng23 = _my_cmpgt_epi64( rdBest23, rdCostZ23 ); + __m128i chng = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // 00110011 + chng = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); + + rdBest01 = _mm_blendv_epi8( rdBest01, rdCostA01, chng01 ); + rdBest23 = _mm_blendv_epi8( rdBest23, rdCostZ23, chng23 ); + + valBest = _mm_blendv_epi8( valBest, valCand, chng ); + idxBest = _mm_blendv_epi8( idxBest, idxCand, chng ); + + + valCand = _mm_setr_epi32( pqData[2].absLevel, pqData[1].absLevel, pqData[0].absLevel, pqData[3].absLevel ); + idxCand = _mm_setr_epi32( 1, 3, 1, 3 ); + + chng01 = _my_cmpgt_epi64( rdBest01, rdCostB01 ); + chng23 = _my_cmpgt_epi64( rdBest23, rdCostA23 ); + chng = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // 00110011 + chng = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); + + rdBest01 = _mm_blendv_epi8( rdBest01, rdCostB01, chng01 ); + rdBest23 = _mm_blendv_epi8( rdBest23, rdCostA23, chng23 ); + 
+ valBest = _mm_blendv_epi8( valBest, valCand, chng ); + idxBest = _mm_blendv_epi8( idxBest, idxCand, chng ); + + + valBest = _mm_packs_epi32( valBest, _mm_setzero_si128() ); + idxBest = _mm_packs_epi32( idxBest, _mm_setzero_si128() ); + idxBest = _mm_packs_epi16( idxBest, _mm_setzero_si128() ); + + + _mm_storeu_si128( ( __m128i* ) &decisions.rdCost[0], rdBest01 ); + _mm_storeu_si128( ( __m128i* ) &decisions.rdCost[2], rdBest23 ); + + _mm_storeu_si64( decisions.absLevel, valBest ); + _mm_storeu_si32( decisions.prevId, idxBest ); + } + /* Scalar RD check for one source state when the only non-zero candidate is level 1 (CHECKD enforces pqDataA.absLevel == 1). rdCostA (level 1, includes pqDataA.deltaDist) competes for decisions[idxA] and rdCostZ (level 0) for decisions[idxZ]; both start from state.rdCost[m_stateId]. With remRegBins >= 4 the costs add m_gtxFracBitsArray[...].bits[1] and the significance bits (plus state.sbbBits1 at SCAN_SOCSBB); if neither scan-position case applies and numSig is 0, rdCostA gets no significance bits and rdCostZ is replaced by rdCostInit. With remRegBins < 4 the costs come from g_goRiceBits[m_goRicePar]. A decision is overwritten only by a strictly smaller cost and then records m_stateId as prevId. */ + void checkRdCostsOdd1( const ScanPosType spt, const PQData& pqDataA, Decisions& decisions, int idxA, int idxZ, const StateMem& state ) const + { + CHECKD( pqDataA.absLevel != 1, "" ); + + int64_t rdCostA = state.rdCost[m_stateId] + pqDataA.deltaDist; + int64_t rdCostZ = state.rdCost[m_stateId]; + + if( state.remRegBins[m_stateId] >= 4 ) + { + const BinFracBits sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]]; + + rdCostA += m_gtxFracBitsArray[state.ctx.cff[m_stateId]].bits[1]; + + if( spt == SCAN_ISCSBB ) + { + rdCostA += sigBits.intBits[ 1 ]; + rdCostZ += sigBits.intBits[ 0 ]; + } + else if( spt == SCAN_SOCSBB ) + { + rdCostA += state.sbbBits1[m_stateId] + sigBits.intBits[ 1 ]; + rdCostZ += state.sbbBits1[m_stateId] + sigBits.intBits[ 0 ]; + } + else if( state.numSig[m_stateId] ) + { + rdCostA += sigBits.intBits[ 1 ]; + rdCostZ += sigBits.intBits[ 0 ]; + } + else + { + rdCostZ = rdCostInit; + } + } + else + { + const int32_t* goRiceTab = g_goRiceBits[m_goRicePar]; + + rdCostA += ( 1 << SCALE_BITS ) + goRiceTab[0]; + rdCostZ += goRiceTab[m_goRiceZero]; + } + + if( rdCostA < decisions.rdCost[idxA] ) + { + decisions.rdCost [idxA] = rdCostA; + decisions.absLevel[idxA] = pqDataA.absLevel; + decisions.prevId [idxA] = m_stateId; + } + + if( rdCostZ < decisions.rdCost[idxZ] ) + { + decisions.rdCost [idxZ] = rdCostZ; + decisions.absLevel[idxZ] = 0; + decisions.prevId [idxZ] = m_stateId; + } + } + + // has to be called as a first
check, assumes no decision has been made yet!!! + static void checkAllRdCostsOdd1( const ScanPosType spt, State* states, const PQData* pqData, Decisions& decisions, const StateMem& state ) + { + // State mapping + // decision 0: either 1 from 1 (pqData[2]), or 0 from 0 + // decision 1: either 1 from 3 (pqData[1]), or 0 from 2 + // decision 2: either 1 from 0 (pqData[2]), or 0 from 1 + // decision 3: either 1 from 2 (pqData[1]), or 0 from 3 + // Four-state SIMD form of checkRdCostsOdd1. Each __m128i holds two 64-bit costs: in rdCostZ*/rdCostA* the 01/23 suffix names the pair of decisions the register feeds, in mrd* and sgbts* the suffix names the source states. The winners are stored straight into decisions.rdCost/absLevel/prevId without reading their previous contents. + __m128i mrd01 = _mm_loadu_si128( ( const __m128i* ) &state.rdCost[0] ); + __m128i mrd23 = _mm_loadu_si128( ( const __m128i* ) &state.rdCost[2] ); + + //int64_t rdCostA = state.rdCost[m_stateId] + pqDataA.deltaDist; // done + //int64_t rdCostZ = state.rdCost[m_stateId]; // done + __m128i rdCostZ01 = _mm_unpacklo_epi64( mrd01, mrd23 ); + __m128i rdCostZ23 = _mm_unpackhi_epi64( mrd01, mrd23 ); + __m128i deltaDist = _mm_unpacklo_epi64( _mm_loadu_si64( &pqData[2].deltaDist ), _mm_loadu_si64( &pqData[1].deltaDist ) ); + __m128i rdCostA01 = _mm_add_epi64( rdCostZ23, deltaDist ); + __m128i rdCostA23 = _mm_add_epi64( rdCostZ01, deltaDist ); + + //const BinFracBits sigBits = m_sigFracBitsArray[state.ctx.sig[m_stateId]]; + // + //rdCostA += m_gtxFracBitsArray[state.ctx.cff[m_stateId]].bits[1]; // done + // + __m128i sgbts02 = _mm_unpacklo_epi64( _mm_loadu_si64( &states[0].m_sigFracBitsArray[state.ctx.sig[0]] ), + _mm_loadu_si64( &states[2].m_sigFracBitsArray[state.ctx.sig[2]] ) ); + __m128i sgbts13 = _mm_unpacklo_epi64( _mm_loadu_si64( &states[1].m_sigFracBitsArray[state.ctx.sig[1]] ), + _mm_loadu_si64( &states[3].m_sigFracBitsArray[state.ctx.sig[3]] ) ); + + { + __m128i sgbts02_0 = _mm_shuffle_epi32( sgbts02, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) ); + __m128i sgbts02_1 = _mm_shuffle_epi32( sgbts02, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); + __m128i sgbts13_0 = _mm_shuffle_epi32( sgbts13, 0 + ( 2 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) ); + __m128i sgbts13_1 = _mm_shuffle_epi32( sgbts13, 1 + ( 3 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); + +
sgbts02 = _mm_unpacklo_epi64( sgbts02_0, sgbts02_1 ); + sgbts13 = _mm_unpacklo_epi64( sgbts13_0, sgbts13_1 ); + } + + { +#if USE_AVX2 + __m128i cffidx = _mm_cvtepi8_epi32( _mm_loadu_si32( &state.ctx.cff ) ); + cffidx = _mm_shuffle_epi32( cffidx, ( 1 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 2 << 6 ) ); + cffidx = _mm_sub_epi8( cffidx, _mm_set1_epi32( state.cffBitsCtxOffset ) ); + __m256i cffBits256 = _mm256_loadu_si256( ( const __m256i* ) &state.cffBits1[state.cffBitsCtxOffset] ); + cffBits256 = _mm256_permutevar8x32_epi32( cffBits256, _mm256_castsi128_si256( cffidx ) ); + __m128i cffBits = _mm256_castsi256_si128( cffBits256 ); +#else + __m128i cffBits; + __m128i bits0123 = _mm_loadu_si128( ( const __m128i* ) &state.cffBits1[state.cffBitsCtxOffset + 0] ); + __m128i bits4 = _mm_loadu_si32 ( &state.cffBits1[state.cffBitsCtxOffset + 4] ); + __m128i cfCtxIdx = _mm_loadu_si32 ( &state.ctx.cff ); + cfCtxIdx = _mm_cvtepi8_epi32( cfCtxIdx ); + cfCtxIdx = _mm_sub_epi8( cfCtxIdx, _mm_set1_epi32( state.cffBitsCtxOffset ) ); + cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_slli_si128( cfCtxIdx, 1 ) ); + cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_slli_si128( cfCtxIdx, 2 ) ); + cfCtxIdx = _mm_slli_epi32( cfCtxIdx, 2 ); + cfCtxIdx = _mm_add_epi8( cfCtxIdx, _mm_setr_epi8( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ) ); + cffBits = _mm_shuffle_epi8( bits4, _mm_sub_epi8( cfCtxIdx, _mm_set1_epi8( 16 ) ) ); + cfCtxIdx = _mm_or_si128( cfCtxIdx, _mm_cmpgt_epi8( cfCtxIdx, _mm_set1_epi8( 15 ) ) ); + cffBits = _mm_or_si128( cffBits, _mm_shuffle_epi8( bits0123, cfCtxIdx ) ); + cffBits = _mm_shuffle_epi32( cffBits, ( 1 << 0 ) + ( 3 << 2 ) +( 0 << 4 ) + ( 2 << 6 ) ); +#endif + rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( cffBits ) ); + rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( _mm_unpackhi_epi64( cffBits, cffBits ) ) ); + } + + if( spt == SCAN_ISCSBB ) + { + // rdCostZ += sigBits.intBits[ 0 ]; // done + rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) ); + 
rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) ); + + sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 ); + sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 ); + + // rdCostA += sigBits.intBits[ 1 ]; // done + rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts13 ) ); + rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts02 ) ); + } + else if( spt == SCAN_SOCSBB ) + { + // rdCostZ += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 0 ]; // done + // rdCostA += m_sbbFracBits.intBits[ 1 ] + sigBits.intBits[ 1 ]; // done + __m128i sbbBits = _mm_loadu_si128( ( const __m128i* ) state.sbbBits1 ); + sbbBits = _mm_shuffle_epi32( sbbBits, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); + + rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) ); + rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) ); + + __m128i add = _mm_cvtepi32_epi64( sbbBits ); + rdCostA23 = _mm_add_epi64( rdCostA23, add ); + rdCostZ01 = _mm_add_epi64( rdCostZ01, add ); + add = _mm_cvtepi32_epi64( _mm_unpackhi_epi64( sbbBits, sbbBits ) ); + rdCostA01 = _mm_add_epi64( rdCostA01, add ); + rdCostZ23 = _mm_add_epi64( rdCostZ23, add ); + + sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 ); + sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 ); + rdCostA01 = _mm_add_epi64( rdCostA01, _mm_cvtepi32_epi64( sgbts13 ) ); + rdCostA23 = _mm_add_epi64( rdCostA23, _mm_cvtepi32_epi64( sgbts02 ) ); + } + else + { + //else if( state.numSig[m_stateId] ) + //{ + // rdCostA += sigBits.intBits[ 1 ]; // done + // rdCostZ += sigBits.intBits[ 0 ]; // done + //} + //else + //{ + // rdCostZ = rdCostInit; // done + //} + + __m128i numSig = _mm_loadu_si32( state.numSig ); + + rdCostZ01 = _mm_add_epi64( rdCostZ01, _mm_cvtepi32_epi64( sgbts02 ) ); + rdCostZ23 = _mm_add_epi64( rdCostZ23, _mm_cvtepi32_epi64( sgbts13 ) ); + + __m128i mask01 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3 ) ); + mask01 = _mm_cmpgt_epi8( mask01,
_mm_setzero_si128() ); + __m128i mask23 = _mm_shuffle_epi8( numSig, _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2 ) ); + mask23 = _mm_cmpgt_epi8( mask23, _mm_setzero_si128() ); + sgbts02 = _mm_unpackhi_epi64( sgbts02, sgbts02 ); + sgbts13 = _mm_unpackhi_epi64( sgbts13, sgbts13 ); + rdCostA01 = _mm_add_epi64( rdCostA01, _mm_and_si128( mask01, _mm_cvtepi32_epi64( sgbts13 ) ) ); + rdCostA23 = _mm_add_epi64( rdCostA23, _mm_and_si128( mask23, _mm_cvtepi32_epi64( sgbts02 ) ) ); + + __m128i rdMax = _mm_loadu_si64( &rdCostInit ); + rdMax = _mm_unpacklo_epi64( rdMax, rdMax ); + + rdCostZ01 = _mm_blendv_epi8( rdMax, rdCostZ01, mask23 ); + rdCostZ23 = _mm_blendv_epi8( rdMax, rdCostZ23, mask01 ); + } + + //// decision 0: either 1 from 1 (pqData[2]), or 0 from 0 + //// decision 1: either 1 from 3 (pqData[1]), or 0 from 2 + //// decision 2: either 1 from 0 (pqData[2]), or 0 from 1 + //// decision 3: either 1 from 2 (pqData[1]), or 0 from 3 + + // d0: Z0, or A0 + // d1: Z1, or A1 + // d2: A2, or Z2 + // d3: A3, or Z3 + + __m128i rdBest01 = rdCostZ01; + __m128i rdBest23 = rdCostA23; + + __m128i valBest = _mm_setr_epi32( 0, 0, 1, 1 ); + __m128i valCand = _mm_setr_epi32( 1, 1, 0, 0 ); + + __m128i idxBest = _mm_setr_epi32( 0, 2, 0, 2 ); + __m128i idxCand = _mm_setr_epi32( 1, 3, 1, 3 ); + + __m128i chng01 = _my_cmpgt_epi64( rdBest01, rdCostA01 ); + __m128i chng23 = _my_cmpgt_epi64( rdBest23, rdCostZ23 ); + __m128i chng = _mm_blend_epi16( chng01, chng23, ( 3 << 2 ) + ( 3 << 6 ) ); // 00110011 + chng = _mm_shuffle_epi32( chng, ( 0 << 0 ) + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); + + rdBest01 = _mm_blendv_epi8( rdBest01, rdCostA01, chng01 ); + rdBest23 = _mm_blendv_epi8( rdBest23, rdCostZ23, chng23 ); + + _mm_storeu_si128( ( __m128i* ) &decisions.rdCost[0], rdBest01 ); + _mm_storeu_si128( ( __m128i* ) &decisions.rdCost[2], rdBest23 ); + + valBest = _mm_packs_epi32( _mm_blendv_epi8( valBest, valCand, chng ), _mm_setzero_si128() ); + idxBest = _mm_packs_epi32( 
_mm_blendv_epi8( idxBest, idxCand, chng ), _mm_setzero_si128() ); + idxBest = _mm_packs_epi16( idxBest, _mm_setzero_si128() ); + + _mm_storeu_si64( decisions.absLevel, valBest ); + _mm_storeu_si32( decisions.prevId, idxBest ); + } + /* Candidate with no predecessor state: cost is pqData.deltaDist + lastOffset plus level bits from m_gtxFracBitsArray[0]; levels >= 4 add a g_goRiceBits[0] term whose index is clamped to RICEMAX-1. decisions[idx] is replaced only by a strictly smaller cost, with prevId = -1 (a negative prevId ends the backward walk in quant()). */ + inline void checkRdCostStart(int32_t lastOffset, const PQData &pqData, Decisions &decisions, int idx ) const + { + const CoeffFracBits &cffBits = m_gtxFracBitsArray[0]; + + int64_t rdCost = pqData.deltaDist + lastOffset; + if (pqData.absLevel < 4) + { + rdCost += cffBits.bits[pqData.absLevel]; + } + else + { + const unsigned value = (pqData.absLevel - 4) >> 1; + rdCost += cffBits.bits[pqData.absLevel - (value << 1)] + g_goRiceBits[0][value < RICEMAX ? value : RICEMAX-1]; + } + + if( rdCost < decisions.rdCost[idx] ) + { + decisions.rdCost [idx] = rdCost; + decisions.absLevel[idx] = pqData.absLevel; + decisions.prevId [idx] = -1; + } + } + /* Level-0 candidate costing state.rdCost[m_stateId] + state.sbbBits0[m_stateId]; replaces decisions[idx] only if strictly cheaper. prevId is stored as 4 | m_stateId, so prevId >> 2 is 1 and prevId & 3 is the state id. */ + inline void checkRdCostSkipSbb(Decisions &decisions, int idx, const StateMem& state) const + { + int64_t rdCost = state.rdCost[m_stateId] + state.sbbBits0[m_stateId]; + if( rdCost < decisions.rdCost[idx] ) + { + decisions.rdCost [idx] = rdCost; + decisions.absLevel[idx] = 0; + decisions.prevId [idx] = 4 | m_stateId; + } + } + /* Same cost and outputs as checkRdCostSkipSbb, but decisions[idx] is overwritten without comparing against its current cost. */ + inline void checkRdCostSkipSbbZeroOut(Decisions &decisions, int idx, const StateMem& state) const + { + int64_t rdCost = state.rdCost[m_stateId] + state.sbbBits0[m_stateId]; + decisions.rdCost [idx] = rdCost; + decisions.absLevel[idx] = 0; + decisions.prevId [idx] = 4 | m_stateId; + } + /* Acts only when remRegBins[m_stateId] < 4 or ge4 is set: derives m_goRicePar from state.sum at ( insidePos << 2 ) + m_stateId (reduced by 4 * 5 unless remRegBins < 4, then clamped to 0..31) and, when remRegBins < 4, also refreshes m_goRiceZero. */ + inline void setRiceParam( const ScanInfo& scanInfo, const StateMem& state, bool ge4 ) + { + if( state.remRegBins[m_stateId] < 4 || ge4 ) + { + const int addr = ( scanInfo.insidePos << 2 ) + m_stateId; + TCoeff sumAbs = state.sum[addr]; + int sumSub = state.remRegBins[m_stateId] < 4 ?
0 : 4 * 5; + int sumAll = std::max( std::min( 31, ( int ) sumAbs - sumSub ), 0 ); + m_goRicePar = g_auiGoRiceParsCoeff[sumAll]; + + if( state.remRegBins[m_stateId] < 4 ) + { + m_goRiceZero = g_auiGoRicePosCoeff0( m_stateId, m_goRicePar ); + } + } + } + + private: + + int8_t m_goRicePar; + int8_t m_goRiceZero; + const int8_t m_stateId; + const BinFracBits*const m_sigFracBitsArray; + const CoeffFracBits*const m_gtxFracBitsArray; + CommonCtx& m_commonCtx; + }; + + /*================================================================================*/ + /*===== =====*/ + /*===== T C Q =====*/ + /*===== =====*/ + /*================================================================================*/ + template + class DepQuantSimd : private RateEstimator, public DepQuantImpl + { + public: + const Decisions startDec[2] = + { + Decisions + { + { rdCostInit >> 2, rdCostInit >> 2, rdCostInit >> 2, rdCostInit >> 2 }, + { -1, -1, -1, -1 }, + { -2, -2, -2, -2 }, + }, + Decisions + { + { rdCostInit >> 2, rdCostInit >> 2, rdCostInit >> 2, rdCostInit >> 2 }, + { 0, 0, 0, 0 }, + { 4, 5, 6, 7 }, + } + }; + /* Constructor: builds three groups of four State objects (state ids 0..3 in each group) that start out as the current, previous and skip groups, initialises the scan ROM and fills every trellis entry with startDec. */ +#define TINIT(x) {*this,m_commonCtx,x} + DepQuantSimd() + : RateEstimator () + , m_commonCtx () + , m_allStates {TINIT(0),TINIT(1),TINIT(2),TINIT(3),TINIT(0),TINIT(1),TINIT(2),TINIT(3),TINIT(0),TINIT(1),TINIT(2),TINIT(3)} + , m_currStates ( m_allStates ) + , m_prevStates ( m_currStates + 4 ) + , m_skipStates ( m_prevStates + 4 ) + { + m_scansRom.init(); + + for( int t = 0; t < ( MAX_TB_SIZEY * MAX_TB_SIZEY ); t++ ) + { + memcpy( m_trellis[t], startDec, sizeof( startDec ) ); + } + } +#undef TINIT + + ~DepQuantSimd() + { + } + /* Forwards dqTrVal to the embedded quantizer (m_quant.init). */ + void init( int dqTrVal ) + { + m_quant.init( dqTrVal ); + } + /* Quantizes one transform block of component compID. Clears the destination levels and absSum, then searches backwards from the last candidate scan position for the first coefficient above the last-position threshold; if none is found tu.lastPos[compID] is set to -1 and the function returns. Otherwise the trellis is filled from that position down to 0, the final state with the lowest path cost below 0 is selected, and the prevId links are followed to write signed levels into the TU and accumulate absSum. quantCoeff is read only when enableScalingLists is set. */ + void quant( TransformUnit &tu, const CCoeffBuf &srcCoeff, const ComponentID compID, const QpParam &cQP, const double lambda, const Ctx &ctx, TCoeff &absSum, bool enableScalingLists, int *quantCoeff ) + { + //===== reset / pre-init ===== + const TUParameters& tuPars = *m_scansRom.getTUPars(
tu.blocks[compID], compID ); + m_quant.initQuantBlock ( tu, compID, cQP, lambda ); + TCoeffSig* qCoeff = tu.getCoeffs( compID ).buf; + const TCoeff* tCoeff = srcCoeff.buf; + const int numCoeff = tu.blocks[compID].area(); + ::memset( qCoeff, 0x00, numCoeff * sizeof( TCoeffSig ) ); + absSum = 0; + + const CompArea& area = tu.blocks[ compID ]; + const uint32_t width = area.width; + const uint32_t height = area.height; + const uint32_t lfnstIdx = tu.cu->lfnstIdx; + //===== scaling matrix ==== + //const int qpDQ = cQP.Qp + 1; + //const int qpPer = qpDQ / 6; + //const int qpRem = qpDQ - 6 * qpPer; + + //TCoeff thresTmp = thres; + bool zeroOut = false; + bool zeroOutforThres = false; + int effWidth = tuPars.m_width, effHeight = tuPars.m_height; + if( ( tu.mtsIdx[compID] > MTS_SKIP || ( tu.cs->sps->MTS && tu.cu->sbtInfo != 0 && tuPars.m_height <= 32 && tuPars.m_width <= 32 ) ) && compID == COMP_Y ) + { + effHeight = ( tuPars.m_height == 32 ) ? 16 : tuPars.m_height; + effWidth = ( tuPars.m_width == 32 ) ? 16 : tuPars.m_width; + zeroOut = ( effHeight < tuPars.m_height || effWidth < tuPars.m_width ); + } + zeroOutforThres = zeroOut || ( 32 < tuPars.m_height || 32 < tuPars.m_width ); + //===== find first test position ===== + int firstTestPos = std::min( tuPars.m_width, JVET_C0024_ZERO_OUT_TH ) * std::min( tuPars.m_height, JVET_C0024_ZERO_OUT_TH ) - 1; + if( lfnstIdx > 0 && tu.mtsIdx[compID] != MTS_SKIP && width >= 4 && height >= 4 ) + { + firstTestPos = ( ( width == 4 && height == 4 ) || ( width == 8 && height == 8 ) ) ? 7 : 15 ; + } + + const TCoeff defaultQuantisationCoefficient = (TCoeff)m_quant.getQScale(); + const TCoeff thres = m_quant.getLastThreshold(); + const int zeroOutWidth = ( tuPars.m_width == 32 && zeroOut ) ? 16 : 32; + const int zeroOutHeight = ( tuPars.m_height == 32 && zeroOut ) ? 
16 : 32; + + if( enableScalingLists ) + { + for( ; firstTestPos >= 0; firstTestPos-- ) + { + if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) ) continue; + + const TCoeff thresTmp = TCoeff( thres / ( 4 * quantCoeff[tuPars.m_scanId2BlkPos[firstTestPos].idx] ) ); + + if( abs( tCoeff[tuPars.m_scanId2BlkPos[firstTestPos].idx] ) > thresTmp ) break; + } + } + else + { + const TCoeff defaultTh = TCoeff( thres / ( defaultQuantisationCoefficient << 2 ) ); + +#if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 ) + // if more than one 4x4 coding subblock is available, use SIMD to find first subblock with coefficient larger than threshold + if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > x86_simd::SCALAR ) + { + const int sbbSize = tuPars.m_sbbSize; + // move the pointer to the beginning of the current subblock + firstTestPos -= ( sbbSize - 1 ); + + const __m128i xdfTh = _mm_set1_epi32( defaultTh ); + + // for each subblock + for( ; firstTestPos >= 0; firstTestPos -= sbbSize ) + { + // skip zeroed out blocks + // for 64-point transformation the coding order takes care of that + if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) ) + { + continue; + } + + // read first line of the subblock and check for coefficients larger than the threshold + // assuming the subblocks are dense 4x4 blocks in raster scan order with the stride of tuPars.m_width + int pos = tuPars.m_scanId2BlkPos[firstTestPos].idx; + __m128i xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) ); + __m128i xdf = _mm_cmpgt_epi32( xl0, xdfTh ); + + // same for the next line in the subblock + pos += tuPars.m_width; + xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) ); + xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) );
+ + // and the third line + pos += tuPars.m_width; + xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) ); + xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) ); + + // and the last line + pos += tuPars.m_width; + xl0 = _mm_abs_epi32( _mm_loadu_si128( ( const __m128i* ) &tCoeff[pos] ) ); + xdf = _mm_or_si128( xdf, _mm_cmpgt_epi32( xl0, xdfTh ) ); + + // if any of the 16 comparisons were true, break, because this subblock contains a coefficient larger than threshold + if( !_mm_testz_si128( xdf, xdf ) ) break; + } + + if( firstTestPos >= 0 ) + { + // if a coefficient was found, advance the pointer to the end of the current subblock + // for the subsequent coefficient-wise refinement (C-impl after endif) + firstTestPos += sbbSize - 1; + } + } + +#endif + for( ; firstTestPos >= 0; firstTestPos-- ) + { + if( zeroOutforThres && ( tuPars.m_scanId2BlkPos[firstTestPos].x >= zeroOutWidth || tuPars.m_scanId2BlkPos[firstTestPos].y >= zeroOutHeight ) ) continue; + if( abs( tCoeff[tuPars.m_scanId2BlkPos[firstTestPos].idx] ) > defaultTh ) break; + } + } + + if( firstTestPos < 0 ) + { + tu.lastPos[compID] = -1; + return; + } + + //===== real init ===== + RateEstimator::initCtx( tuPars, tu, compID, ctx.getFracBitsAcess() ); + m_commonCtx.reset( tuPars, *this ); + for( int k = 0; k < 12; k++ ) + { + m_allStates[k].init( m_state_mem[k>>2] ); + } + + const int numCtx = isLuma( compID ) ? 
21 : 11; + const CoeffFracBits* const cffBits = gtxFracBits(); + for( int i = 0; i < numCtx; i++ ) + { + m_state_mem[0].cffBits1[i] = cffBits[i].bits[1]; + m_state_mem[1].cffBits1[i] = cffBits[i].bits[1]; + m_state_mem[2].cffBits1[i] = cffBits[i].bits[1]; + } + + int effectWidth = std::min( 32, effWidth ); + int effectHeight = std::min( 32, effHeight ); + for (int k = 0; k < 3; k++) + { + m_state_mem[k].effWidth = effectWidth; + m_state_mem[k].effHeight = effectHeight; + m_state_mem[k].initRemRegBins = ( effectWidth * effectHeight * MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT ) / 16; + m_state_mem[k].anyRemRegBinsLt4 = true; // for the first coeff use scalar impl., because it checks against the init state, which + // prohibits some paths + } + + //===== populate trellis ===== + for( int scanIdx = firstTestPos; scanIdx >= 0; scanIdx-- ) + { + const ScanInfo& scanInfo = tuPars.m_scanInfo[ scanIdx ]; + if( enableScalingLists ) + { + m_quant.initQuantBlock( tu, compID, cQP, lambda, quantCoeff[scanInfo.rasterPos] ); + xDecideAndUpdate( abs( tCoeff[scanInfo.rasterPos] ), scanInfo, zeroOut && ( scanInfo.posX >= effWidth || scanInfo.posY >= effHeight ), quantCoeff[scanInfo.rasterPos] ); + } + else + xDecideAndUpdate( abs( tCoeff[scanInfo.rasterPos] ), scanInfo, zeroOut && ( scanInfo.posX >= effWidth || scanInfo.posY >= effHeight ), defaultQuantisationCoefficient ); + } + + //===== find best path (minPathCost starts at 0, so only a final state with a cost below 0 is accepted; otherwise prevId stays -1 and no level is written) ===== + int prevId = -1; + int64_t minPathCost = 0; + for( int8_t stateId = 0; stateId < 4; stateId++ ) + { + int64_t pathCost = m_trellis[0][0].rdCost[stateId]; + if( pathCost < minPathCost ) + { + prevId = stateId; + minPathCost = pathCost; + } + } + + //===== backward scanning (prevId >> 2 selects the Decisions entry of the trellis position, prevId & 3 the state; a negative prevId ends the walk) ===== + int scanIdx = 0; + for( ; prevId >= 0; scanIdx++ ) + { + TCoeffSig absLevel = m_trellis[scanIdx][prevId >> 2].absLevel[prevId & 3]; + int32_t blkpos = tuPars.m_scanId2BlkPos[scanIdx].idx; + qCoeff[ blkpos ] = TCoeffSig( tCoeff[blkpos] < 0 ?
-absLevel : absLevel ); + absSum += absLevel; + prevId = m_trellis[scanIdx][prevId >> 2].prevId[prevId & 3]; + } + + tu.lastPos[compID] = scanIdx - 1; + } + + private: + /* Processes one scan position: swaps the current and previous state groups, lets xDecide fill the first Decisions entry of this position, then (for scanIdx != 0) updates the state memory - through updateStatesEOS when insidePos is 0 (which also copies the decisions into the second trellis entry), through updateStates otherwise unless zeroOut is set - and finally swaps the previous and skip groups at SCAN_SOCSBB. */ + void xDecideAndUpdate( const TCoeff absCoeff, const ScanInfo &scanInfo, bool zeroOut, int quantCoeff ) + { + Decisions *decisions = &m_trellis[scanInfo.scanIdx][0]; + + std::swap( m_prevStates, m_currStates ); + std::swap( m_prevStateI, m_currStateI ); + + xDecide( scanInfo, absCoeff, lastOffset(scanInfo.scanIdx), *decisions, zeroOut, quantCoeff ); + + if( scanInfo.scanIdx ) + { + if( scanInfo.insidePos == 0 ) + { + m_commonCtx.swap(); + State::updateStatesEOS( scanInfo, *decisions, m_state_mem[m_prevStateI], m_state_mem[m_skipStateI], m_state_mem[m_currStateI], m_commonCtx ); + ::memcpy( decisions + 1, decisions, sizeof( Decisions ) ); + } + else if( !zeroOut ) + { + State::updateStates( scanInfo, *decisions, m_state_mem[m_prevStateI], m_state_mem[m_currStateI] ); + } + + if( scanInfo.spt == SCAN_SOCSBB ) + { + std::swap( m_prevStates, m_skipStates ); + std::swap( m_prevStateI, m_skipStateI ); + } + } + } + /* Resets decisions from startDec[0] and evaluates the candidate transitions for absCoeff. For a zeroed-out position only the unconditional skip decisions at SCAN_EOCSBB are written. Otherwise the pre-quantization is done inline and either the per-state scalar checks or the four-state SIMD check run, followed by the path-start candidates and, at SCAN_EOCSBB, the skip candidates. */ + void xDecide( const ScanInfo &scanInfo, const TCoeff absCoeff, const int lastOffset, Decisions &decisions, bool zeroOut, int quantCoeff ) + { + ::memcpy( &decisions, startDec, sizeof( Decisions ) ); + + StateMem& prev = m_state_mem[m_prevStateI]; + StateMem& skip = m_state_mem[m_skipStateI]; + + if( zeroOut ) + { + if( scanInfo.spt==SCAN_EOCSBB ) + { + m_skipStates[0].checkRdCostSkipSbbZeroOut( decisions, 0, skip ); + m_skipStates[1].checkRdCostSkipSbbZeroOut( decisions, 1, skip ); + m_skipStates[2].checkRdCostSkipSbbZeroOut( decisions, 2, skip ); + m_skipStates[3].checkRdCostSkipSbbZeroOut( decisions, 3, skip ); + } + return; + } + + PQData pqData[4]; + //bool near0 = m_quant.preQuantCoeff( absCoeff, pqData, quantCoeff ); + + /// start inline prequant + int64_t scaledOrg = int64_t( absCoeff ) * quantCoeff; + TCoeff qIdx = TCoeff( ( scaledOrg + m_quant.m_QAdd ) >> m_quant.m_QShift ); + +
if( qIdx < 0 ) + { + int64_t scaledAdd = m_quant.m_DistStepAdd - scaledOrg * m_quant.m_DistOrgFact; + PQData& pq_a = pqData[1]; + PQData& pq_b = pqData[2]; + + pq_a.deltaDist = ( ( scaledAdd + 0 * m_quant.m_DistStepAdd ) * 1 + m_quant.m_DistAdd ) >> m_quant.m_DistShift; + pq_a.absLevel = 1; + + pq_b.deltaDist = ( ( scaledAdd + 1 * m_quant.m_DistStepAdd ) * 2 + m_quant.m_DistAdd ) >> m_quant.m_DistShift; + pq_b.absLevel = 1; + /// stop inline prequant + + if( prev.anyRemRegBinsLt4 ) + { + m_prevStates[0].setRiceParam( scanInfo, prev, false ); + m_prevStates[0].checkRdCostsOdd1( scanInfo.spt, pqData[2], decisions, 2, 0, prev ); + + m_prevStates[1].setRiceParam( scanInfo, prev, false ); + m_prevStates[1].checkRdCostsOdd1( scanInfo.spt, pqData[2], decisions, 0, 2, prev ); + + m_prevStates[2].setRiceParam( scanInfo, prev, false ); + m_prevStates[2].checkRdCostsOdd1( scanInfo.spt, pqData[1], decisions, 3, 1, prev ); + + m_prevStates[3].setRiceParam( scanInfo, prev, false ); + m_prevStates[3].checkRdCostsOdd1( scanInfo.spt, pqData[1], decisions, 1, 3, prev ); + } + else + { + // has to be called as a first check, assumes no decision has been made yet + State::checkAllRdCostsOdd1( scanInfo.spt, m_prevStates, pqData, decisions, prev ); + } + + m_prevStates->checkRdCostStart( lastOffset, pqData[2], decisions, 2 ); + } + else + { + /// start inline prequant + qIdx = std::max( 1, std::min( m_quant.m_maxQIdx, qIdx ) ); + int64_t scaledAdd = qIdx * m_quant.m_DistStepAdd - scaledOrg * m_quant.m_DistOrgFact; + + PQData& pq_a = pqData[( qIdx + 0 ) & 3]; + PQData& pq_b = pqData[( qIdx + 1 ) & 3]; + PQData& pq_c = pqData[( qIdx + 2 ) & 3]; + PQData& pq_d = pqData[( qIdx + 3 ) & 3]; + + pq_a.deltaDist = ( ( scaledAdd + 0 * m_quant.m_DistStepAdd ) * ( qIdx + 0 ) + m_quant.m_DistAdd ) >> m_quant.m_DistShift; + pq_a.absLevel = ( qIdx + 1 ) >> 1; + + pq_b.deltaDist = ( ( scaledAdd + 1 * m_quant.m_DistStepAdd ) * ( qIdx + 1 ) + m_quant.m_DistAdd ) >> m_quant.m_DistShift; + pq_b.absLevel = 
( qIdx + 2 ) >> 1; + + pq_c.deltaDist = ( ( scaledAdd + 2 * m_quant.m_DistStepAdd ) * ( qIdx + 2 ) + m_quant.m_DistAdd ) >> m_quant.m_DistShift; + pq_c.absLevel = ( qIdx + 3 ) >> 1; + + pq_d.deltaDist = ( ( scaledAdd + 3 * m_quant.m_DistStepAdd ) * ( qIdx + 3 ) + m_quant.m_DistAdd ) >> m_quant.m_DistShift; + pq_d.absLevel = ( qIdx + 4 ) >> 1; + /// stop inline prequant + + bool cff02ge4 = pqData[0].absLevel >= 4/* || pqData[2].absLevel >= 4 */; + bool cff13ge4 = /* pqData[1].absLevel >= 4 || */ pqData[3].absLevel >= 4; + + if( cff02ge4 || cff13ge4 || prev.anyRemRegBinsLt4 ) + { + if( prev.anyRemRegBinsLt4 || cff02ge4 ) + { + m_prevStates[0].setRiceParam( scanInfo, prev, cff02ge4 ); + m_prevStates[1].setRiceParam( scanInfo, prev, cff02ge4 ); + } + + if( prev.anyRemRegBinsLt4 || cff13ge4 ) + { + m_prevStates[2].setRiceParam( scanInfo, prev, cff13ge4 ); + m_prevStates[3].setRiceParam( scanInfo, prev, cff13ge4 ); + } + + m_prevStates[0].checkRdCosts( scanInfo.spt, pqData[0], pqData[2], decisions, 0, 2, prev ); + m_prevStates[1].checkRdCosts( scanInfo.spt, pqData[0], pqData[2], decisions, 2, 0, prev ); + m_prevStates[2].checkRdCosts( scanInfo.spt, pqData[3], pqData[1], decisions, 1, 3, prev ); + m_prevStates[3].checkRdCosts( scanInfo.spt, pqData[3], pqData[1], decisions, 3, 1, prev ); + } + else + { + // has to be called as a first check, assumes no decision has been made yet + State::checkAllRdCosts( scanInfo.spt, m_prevStates, pqData, decisions, prev ); + } + + m_prevStates->checkRdCostStart( lastOffset, pqData[0], decisions, 0 ); + m_prevStates->checkRdCostStart( lastOffset, pqData[2], decisions, 2 ); + } + + if( scanInfo.spt==SCAN_EOCSBB ) + { + m_skipStates[0].checkRdCostSkipSbb( decisions, 0, skip ); + m_skipStates[1].checkRdCostSkipSbb( decisions, 1, skip ); + m_skipStates[2].checkRdCostSkipSbb( decisions, 2, skip ); + m_skipStates[3].checkRdCostSkipSbb( decisions, 3, skip ); + } + } + + private: + CommonCtx m_commonCtx; + State m_allStates[ 12 ]; + State* 
m_currStates; + State* m_prevStates; + State* m_skipStates; + Quantizer m_quant; + Decisions m_trellis[MAX_TB_SIZEY * MAX_TB_SIZEY][2]; + Rom m_scansRom; + + StateMem m_state_mem[3]; + + int m_currStateI = 0; + int m_prevStateI = 1; + int m_skipStateI = 2; + }; +}; // namespace DQIntern + /* Assigns a newly allocated DQIntern::DepQuantSimd to p. NOTE(review): the template parameter list and arguments are missing in this text, and the previous value of p is not released here - confirm p is null on entry. */ +template +void DepQuant::_initDepQuantX86() +{ + p = new DQIntern::DepQuantSimd(); +} +template void DepQuant::_initDepQuantX86(); + +} // namespace vvenc + +//! \} + +; \ No newline at end of file diff --git a/source/Lib/CommonLib/x86/InitX86.cpp b/source/Lib/CommonLib/x86/InitX86.cpp index e0c598598..f3eb6ec5c 100644 --- a/source/Lib/CommonLib/x86/InitX86.cpp +++ b/source/Lib/CommonLib/x86/InitX86.cpp @@ -374,6 +374,25 @@ void Quant::initQuantX86() } } +void DepQuant::initDepQuantX86() +{ + auto vext = read_x86_extension_flags(); + switch (vext){ /* AVX512 shares the AVX2 case and AVX shares the SSE42 case; any other level leaves p untouched. NOTE(review): the template arguments of the three _initDepQuantX86 calls are missing in this text - presumably one instantiation per case group; confirm against the repository. */ + case AVX512: + case AVX2: + _initDepQuantX86(); + break; + case AVX: + case SSE42: + _initDepQuantX86(); + break; + case SSE41: + _initDepQuantX86(); + break; + default: + break; + } +} #endif diff --git a/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h b/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h index ab174d349..d3bbb0678 100644 --- a/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h +++ b/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h @@ -105,7 +105,7 @@ void offsetBlock_SIMD( const int channelBitDepth, #ifdef USE_AVX2 // AVX2 - if ((width>8) && (vext >= AVX2)) + if( ( width & 15 ) == 0 && vext >= AVX2 ) { __m256i vsrca,vsrcal,vsrcar; __m256i vbaseoffset = _mm256_set1_epi16(2) ; @@ -224,7 +224,7 @@ void offsetBlock_SIMD( const int channelBitDepth, } #ifdef USE_AVX2 // AVX2 - if ((width>8) && (vext >= AVX2)) + if( ( width & 15 ) == 0 && ( vext >= AVX2 ) ) { __m256i vsrca,vsrcat,vsrcab; @@ -329,7 +329,7 @@ void offsetBlock_SIMD( const int channelBitDepth, } #ifdef USE_AVX2 // AVX2 - if ((width>8) && (vext >= AVX2)) + if( ( width & 15 ) == 0 && vext >= AVX2 ) { __m256i vsrca,vsrcat,vsrcab; @@ -504,7 +504,7 @@ void
offsetBlock_SIMD( const int channelBitDepth, } #ifdef USE_AVX2 // AVX2 - if ((width>8) && (vext >= AVX2)) + if( ( width & 15 ) == 0 && vext >= AVX2 ) { __m256i vsrca,vsrcat,vsrcab; __m256i vbaseoffset = _mm256_set1_epi16(2) ; @@ -644,7 +644,7 @@ void offsetBlock_SIMD( const int channelBitDepth, } #ifdef USE_AVX2 // AVX2 - if ((width>8) && (vext >= AVX2)) + if( ( width & 15 ) == 0 && vext >= AVX2 ) { __m256i vsrc; __m256i vbaseoffset = _mm256_set1_epi16(startIdx) ; diff --git a/source/Lib/CommonLib/x86/avx/AdaptiveLoopFilter_avx.cpp b/source/Lib/CommonLib/x86/avx/AdaptiveLoopFilter_avx.cpp deleted file mode 100644 index 1a1bf7d77..000000000 --- a/source/Lib/CommonLib/x86/avx/AdaptiveLoopFilter_avx.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. 
- -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../AdaptiveLoopFilterX86.h" diff --git a/source/Lib/CommonLib/x86/avx/AffineGradientSearch_avx.cpp b/source/Lib/CommonLib/x86/avx/AffineGradientSearch_avx.cpp deleted file mode 100644 index 6932a17a3..000000000 --- a/source/Lib/CommonLib/x86/avx/AffineGradientSearch_avx.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- - -------------------------------------------------------------------------------------------- */ - -#include "../AffineGradientSearchX86.h" diff --git a/source/Lib/CommonLib/x86/avx/InterPred_avx.cpp b/source/Lib/CommonLib/x86/avx/InterPred_avx.cpp deleted file mode 100644 index efc1edbfc..000000000 --- a/source/Lib/CommonLib/x86/avx/InterPred_avx.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../InterPredX86.h" diff --git a/source/Lib/CommonLib/x86/avx/InterpolationFilter_avx.cpp b/source/Lib/CommonLib/x86/avx/InterpolationFilter_avx.cpp deleted file mode 100644 index b8430ec4a..000000000 --- a/source/Lib/CommonLib/x86/avx/InterpolationFilter_avx.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../InterpolationFilterX86.h" diff --git a/source/Lib/CommonLib/x86/avx/IntraPred_avx.cpp b/source/Lib/CommonLib/x86/avx/IntraPred_avx.cpp deleted file mode 100644 index de7ea6063..000000000 --- a/source/Lib/CommonLib/x86/avx/IntraPred_avx.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- - -------------------------------------------------------------------------------------------- */ - -#include "../IntraPredX86.h" diff --git a/source/Lib/CommonLib/x86/avx/LoopFilter_avx.cpp b/source/Lib/CommonLib/x86/avx/LoopFilter_avx.cpp deleted file mode 100644 index d4218924b..000000000 --- a/source/Lib/CommonLib/x86/avx/LoopFilter_avx.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../LoopFilterX86.h" diff --git a/source/Lib/CommonLib/x86/avx/RdCost_avx.cpp b/source/Lib/CommonLib/x86/avx/RdCost_avx.cpp deleted file mode 100644 index 53ca22885..000000000 --- a/source/Lib/CommonLib/x86/avx/RdCost_avx.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../RdCostX86.h" diff --git a/source/Lib/CommonLib/x86/avx/SampleAdaptiveOffset_avx.cpp b/source/Lib/CommonLib/x86/avx/SampleAdaptiveOffset_avx.cpp deleted file mode 100644 index eafe24887..000000000 --- a/source/Lib/CommonLib/x86/avx/SampleAdaptiveOffset_avx.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- - -------------------------------------------------------------------------------------------- */ - -#include "../SampleAdaptiveOffsetX86.h" diff --git a/source/Lib/CommonLib/x86/avx/Trafo_avx.cpp b/source/Lib/CommonLib/x86/avx/Trafo_avx.cpp deleted file mode 100644 index 67fa5dac8..000000000 --- a/source/Lib/CommonLib/x86/avx/Trafo_avx.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../TrafoX86.h" diff --git a/source/Lib/CommonLib/x86/avx/MCTF_avx.cpp b/source/Lib/CommonLib/x86/avx2/DepQuant_avx2.cpp similarity index 98% rename from source/Lib/CommonLib/x86/avx/MCTF_avx.cpp rename to source/Lib/CommonLib/x86/avx2/DepQuant_avx2.cpp index 83a8f4442..5a2498cb8 100644 --- a/source/Lib/CommonLib/x86/avx/MCTF_avx.cpp +++ b/source/Lib/CommonLib/x86/avx2/DepQuant_avx2.cpp @@ -40,4 +40,4 @@ POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------------------- */ -#include "../MCTFX86.h" +#include "../DepQuantX86.h" diff --git a/source/Lib/CommonLib/x86/avx/Quant_avx.cpp b/source/Lib/CommonLib/x86/sse41/DepQuant_sse41.cpp similarity index 98% rename from source/Lib/CommonLib/x86/avx/Quant_avx.cpp rename to source/Lib/CommonLib/x86/sse41/DepQuant_sse41.cpp index b3d681789..5a2498cb8 100644 --- a/source/Lib/CommonLib/x86/avx/Quant_avx.cpp +++ b/source/Lib/CommonLib/x86/sse41/DepQuant_sse41.cpp @@ -40,4 +40,4 @@ POSSIBILITY OF SUCH DAMAGE. 
------------------------------------------------------------------------------------------- */ -#include "../QuantX86.h" +#include "../DepQuantX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/Buffer_sse42.cpp b/source/Lib/CommonLib/x86/sse42/Buffer_sse42.cpp deleted file mode 100644 index 07563b572..000000000 --- a/source/Lib/CommonLib/x86/sse42/Buffer_sse42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../BufferX86.h" diff --git a/source/Lib/CommonLib/x86/avx/Buffer_avx.cpp b/source/Lib/CommonLib/x86/sse42/DepQuant_sse42.cpp similarity index 98% rename from source/Lib/CommonLib/x86/avx/Buffer_avx.cpp rename to source/Lib/CommonLib/x86/sse42/DepQuant_sse42.cpp index 07563b572..5a2498cb8 100644 --- a/source/Lib/CommonLib/x86/avx/Buffer_avx.cpp +++ b/source/Lib/CommonLib/x86/sse42/DepQuant_sse42.cpp @@ -40,4 +40,4 @@ POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------------------- */ -#include "../BufferX86.h" +#include "../DepQuantX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/InterPred_sse42.cpp b/source/Lib/CommonLib/x86/sse42/InterPred_sse42.cpp deleted file mode 100644 index efc1edbfc..000000000 --- a/source/Lib/CommonLib/x86/sse42/InterPred_sse42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. 
-All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- - -------------------------------------------------------------------------------------------- */ - -#include "../InterPredX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/InterpolationFilter_sse42.cpp b/source/Lib/CommonLib/x86/sse42/InterpolationFilter_sse42.cpp deleted file mode 100644 index b8430ec4a..000000000 --- a/source/Lib/CommonLib/x86/sse42/InterpolationFilter_sse42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../InterpolationFilterX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/IntraPred_sse42.cpp b/source/Lib/CommonLib/x86/sse42/IntraPred_sse42.cpp deleted file mode 100644 index de7ea6063..000000000 --- a/source/Lib/CommonLib/x86/sse42/IntraPred_sse42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- - -------------------------------------------------------------------------------------------- */ - -#include "../IntraPredX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/LoopFilter_sse42.cpp b/source/Lib/CommonLib/x86/sse42/LoopFilter_sse42.cpp deleted file mode 100644 index d4218924b..000000000 --- a/source/Lib/CommonLib/x86/sse42/LoopFilter_sse42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../LoopFilterX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/MCTF_avx42.cpp b/source/Lib/CommonLib/x86/sse42/MCTF_avx42.cpp deleted file mode 100644 index 83a8f4442..000000000 --- a/source/Lib/CommonLib/x86/sse42/MCTF_avx42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../MCTFX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/Quant_sse42.cpp b/source/Lib/CommonLib/x86/sse42/Quant_sse42.cpp deleted file mode 100644 index b3d681789..000000000 --- a/source/Lib/CommonLib/x86/sse42/Quant_sse42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- - -------------------------------------------------------------------------------------------- */ - -#include "../QuantX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/RdCost_sse42.cpp b/source/Lib/CommonLib/x86/sse42/RdCost_sse42.cpp deleted file mode 100644 index 53ca22885..000000000 --- a/source/Lib/CommonLib/x86/sse42/RdCost_sse42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../RdCostX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/SampleAdaptiveOffset_sse42.cpp b/source/Lib/CommonLib/x86/sse42/SampleAdaptiveOffset_sse42.cpp deleted file mode 100644 index eafe24887..000000000 --- a/source/Lib/CommonLib/x86/sse42/SampleAdaptiveOffset_sse42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -------------------------------------------------------------------------------------------- */ - -#include "../SampleAdaptiveOffsetX86.h" diff --git a/source/Lib/CommonLib/x86/sse42/Trafo_sse42.cpp b/source/Lib/CommonLib/x86/sse42/Trafo_sse42.cpp deleted file mode 100644 index 67fa5dac8..000000000 --- a/source/Lib/CommonLib/x86/sse42/Trafo_sse42.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* ----------------------------------------------------------------------------- -The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning -the Software are granted under this license. - -The Clear BSD License - -Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without modification, -are permitted (subject to the limitations in the disclaimer below) provided that -the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY -THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- - -------------------------------------------------------------------------------------------- */ - -#include "../TrafoX86.h" diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp index fd2b6b4bf..8e1e10425 100644 --- a/source/Lib/EncoderLib/EncCu.cpp +++ b/source/Lib/EncoderLib/EncCu.cpp @@ -295,7 +295,6 @@ EncCu::~EncCu() void EncCu::encodeCtu( Picture* pic, int (&prevQP)[MAX_NUM_CH], uint32_t ctuXPosInCtus, uint32_t ctuYPosInCtus ) { - PROFILER_SCOPE_AND_STAGE_EXT( 1, _TPROF, P_COMPRESS_CU, pic->cs, CH_L ); CodingStructure& cs = *pic->cs; Slice* slice = cs.slice; const PreCalcValues& pcv = *cs.pcv; @@ -390,6 +389,7 @@ void EncCu::xCompressCtu( CodingStructure& cs, const UnitArea& area, const unsig cs.initSubStructure( *tempCS, partitioner->chType, partitioner->currArea(), false, orgBuffer, rspBuffer ); cs.initSubStructure( *bestCS, partitioner->chType, partitioner->currArea(), false, orgBuffer, rspBuffer ); m_CABACEstimator->determineNeighborCus( *tempCS, partitioner->currArea(), partitioner->chType, partitioner->treeType ); + PROFILER_SCOPE_AND_STAGE_EXT( 1, _TPROF, P_COMPRESS_CU, tempCS, CH_L ); // copy the relevant area UnitArea clippedArea = clipArea( partitioner->currArea(), cs.area ); @@ -2252,7 +2252,7 @@ void EncCu::xCheckRDCostMerge( CodingStructure *&tempCS, CodingStructure *&bestC void EncCu::xCheckRDCostMergeGeo(CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &pm, const EncTestMode &encTestMode) { - PROFILER_SCOPE_AND_STAGE_EXT( 1, _TPROF, P_INTER_GPM, tempCS, partitioner.chType ); + PROFILER_SCOPE_AND_STAGE_EXT( 1, _TPROF, P_INTER_GPM, tempCS, pm.chType ); const Slice &slice = *tempCS->slice; if ((m_pcEncCfg->m_Geo > 1) && (slice.TLayer <= 1)) diff --git a/source/Lib/EncoderLib/EncGOP.cpp b/source/Lib/EncoderLib/EncGOP.cpp index 73af5cee0..78b498021 100644 --- a/source/Lib/EncoderLib/EncGOP.cpp +++ b/source/Lib/EncoderLib/EncGOP.cpp @@ -211,6 +211,7 @@ void EncGOP::init( const VVEncCfg& encCfg, const 
GOPCfg* gopCfg, RateCtrl& rateC { m_ticksPerFrameMul4 = (int)((int64_t)4 *(int64_t)m_pcEncCfg->m_TicksPerSecond * (int64_t)m_pcEncCfg->m_FrameScale/(int64_t)m_pcEncCfg->m_FrameRate); } + m_forceSCC = false; } @@ -1376,8 +1377,14 @@ void EncGOP::xInitPicsInCodingOrder( const PicList& picList ) CHECK( m_lastCodingNum == -1 && ! pic->gopEntry->m_isStartOfIntra, "encoding should start with an I-Slice" ); + xForceScc( *pic ); + // initialize slice header pic->encTime.startTimer(); + if( pic->gopEntry->m_isStartOfGop ) + { + xInitGopQpCascade( *pic, picList ); + } xInitFirstSlice( *pic, picList, false ); pic->encTime.stopTimer(); @@ -1444,6 +1451,115 @@ void EncGOP::xGetProcessingLists( std::list& procList, std::listm_SourceWidth * m_pcEncCfg->m_SourceHeight) / (3840.0 * 2160.0); + const bool isHighRes = (std::min (m_pcEncCfg->m_SourceWidth, m_pcEncCfg->m_SourceHeight) > 1280); + const int poc0Offset = (m_pcEncCfg->m_poc0idr ? -1 : 0); // place leading poc 0 idr in GOP -1 + const int gopNum = keyPic.gopEntry->m_gopNum + (keyPic.poc == 0 ? poc0Offset : 0); + int dQP = 0; + double qpStart = 24.0; + unsigned num = 0, sum = 0; + uint8_t gopMinNoiseLevels[QPA_MAX_NOISE_LEVELS]; + + // if max bit-rate not set or rate control enabled, skip QP adaptation + if( m_pcEncCfg->m_RCMaxBitrate <= 0 + || m_pcEncCfg->m_RCMaxBitrate == INT32_MAX + || m_pcEncCfg->m_RCNumPasses == 2 + || m_pcEncCfg->m_LookAhead > 0 + || m_pcEncCfg->m_RCTargetBitrate != 0 ) + { + return; + } + + std::fill_n (gopMinNoiseLevels, QPA_MAX_NOISE_LEVELS, 255u); + + for (auto pic : picList) + { + const int picGopNum = pic->gopEntry->m_gopNum + (pic->poc == 0 ? 
poc0Offset : 0); + + if (picGopNum == gopNum && pic->m_picShared->m_picMotEstError > 0) + { + CHECK( pic->isInitDone, "try to modify GOP qp of picture, which has already been initialized" ); + // summarize motion errors of all MCTF filtered pictures in GOP + gopMotEstCount++; + gopMotEstError += pic->m_picShared->m_picMotEstError; + // go through ranges, search per-range minimum in GOP + for (int i = 0; i < QPA_MAX_NOISE_LEVELS; i++) + { + gopMinNoiseLevels[i] = std::min (gopMinNoiseLevels[i], pic->m_picShared->m_minNoiseLevels[i]); + } + } + else if (picGopNum + 1 == gopNum && pic->gopEntry->m_isStartOfGop /*&& !keyPic.gopEntry->m_isStartOfIntra*/) // disabled for start of Intra segments, for segment parallel encoding + { + // store activities of previous start-of-GOP picture + gopSpVisCount = 1; + gopSpVisActLum = pic->m_picShared->m_picSpVisAct[CH_L]; + gopSpVisActChr = pic->m_picShared->m_picSpVisAct[CH_C]; + } + } + + gopSpVisCount++; // add current TL-0 spatial activities + gopSpVisActLum += keyPic.m_picShared->m_picSpVisAct[CH_L]; + gopSpVisActChr += keyPic.m_picShared->m_picSpVisAct[CH_C]; + + gopMotEstError = (gopMotEstError + (gopMotEstCount >> 1)) / std::max (1u, gopMotEstCount); + gopSpVisActLum = (gopSpVisActLum + (gopSpVisCount >> 1)) / gopSpVisCount; + gopSpVisActChr = (gopSpVisActChr + (gopSpVisCount >> 1)) / gopSpVisCount; + + for (int i = 0; i < QPA_MAX_NOISE_LEVELS; i++) // go through ranges again, find overall min-average in GOP + { + if (gopMinNoiseLevels[i] < 255) + { + num++; + sum += gopMinNoiseLevels[i]; + } + } + + if (num > 0 && sum > 0) + { + qpStart += 0.5 * (6.0 * log ((double) sum / (double) num) / log (2.0) - 1.0 - 24.0); // see RateCtrl.cpp + } + qpStart += log (resRatio4K) / log (2.0); // ICIP23 paper + + // TODO hlm, henkel: adapt GOP's QP offset (capped CQF, adaptive QP cascade) + const int bDepth = m_pcEncCfg->m_internalBitDepth[CH_L]; + const int intraP = Clip3 (m_pcEncCfg->m_GOPSize, 4 * VVENC_MAX_GOP, 
m_pcEncCfg->m_IntraPeriod); + const int visAct = std::max (uint16_t (gopSpVisActLum >> (12 - bDepth)), keyPic.picVisActY); // when vaY=0 + const double apa = sqrt ((m_pcEncCfg->m_usePerceptQPATempFiltISlice || !keyPic.gopEntry->m_isStartOfIntra ? 32.0 : 16.0) * double (1 << (2 * bDepth - 10)) / sqrt (resRatio4K)); + const int auxOff = (m_pcEncCfg->m_blockImportanceMapping && !keyPic.m_picShared->m_ctuBimQpOffset.empty() ? keyPic.m_picShared->m_picAuxQpOffset : 0); + const int iFrmQP = m_pcEncCfg->m_QP + (keyPic.gopEntry->m_isStartOfIntra ? m_pcEncCfg->m_intraQPOffset : 0) + auxOff + int (floor (3.0 * log (visAct / apa) / log (2.0) + 0.5)); + const int qp32BC = int (16384.0 + 7.21875 * pow ((double) gopSpVisActLum, 4.0/3.0) + 1.46875 * pow ((double) gopSpVisActChr, 4.0/3.0)) * (isHighRes ? 96 : 24); // TODO hlm + const int iFrmBC = int (0.5 + qp32BC * pow (2.0, (33 - iFrmQP) / 5.0) * sqrt (resRatio4K)); // * HD tuning + const int shift = (gopMotEstError < 32 ? 5 - (gopMotEstError >> 4) : 3); + if (keyPic.m_picShared->m_picMotEstError >= 256) gopMotEstError >>= 2; else // avoid 2 much capping at cuts + if (gopMotEstError >= 120) /*TODO tune this*/ gopMotEstError >>= 1; + const int bFrmBC = int ((4.0 * iFrmBC * intraP) / sqrt((double)gopSpVisActLum) * std::max (int (gopMotEstError * gopMotEstError) >> (bDepth / 2), (keyPic.picVisActTL0 - visAct) >> shift) * pow(2.0, -1.0 * bDepth)); + + const double fac = double (m_pcEncCfg->m_FrameScale * intraP) / m_pcEncCfg->m_FrameRate; + const double mBC = (m_pcEncCfg->m_RCMaxBitrate > 0 && m_pcEncCfg->m_RCMaxBitrate != INT32_MAX ? m_pcEncCfg->m_RCMaxBitrate * fac : 0.0); + + if (mBC > 0.0 && iFrmBC + bFrmBC > mBC) // max. 
I-period bit-count exceeded + { + const double d = std::max (0, iFrmQP) - (105.0 / 128.0) * sqrt ((double) std::max (1, iFrmQP)) * log (mBC / double (iFrmBC + bFrmBC)) / log (2.0); + + dQP = Clip3 (0, MAX_QP, int (0.5 + d + 0.5 * std::max (0.0, qpStart - d))) - std::max (0, iFrmQP); + } + + for (auto pic : picList) // store in all pictures of GOP + { + const int picGopNum = pic->gopEntry->m_gopNum + (pic->poc == 0 ? poc0Offset : 0); + + if (picGopNum == gopNum) + { + pic->gopAdaptedQP = dQP; + } + } + keyPic.gopAdaptedQP = dQP; // TODO: add any additional key-frame offset here +} + void EncGOP::xInitFirstSlice( Picture& pic, const PicList& picList, bool isEncodeLtRef ) { memset( pic.cs->alfAps, 0, sizeof(pic.cs->alfAps)); @@ -2613,6 +2729,20 @@ void EncGOP::xPrintPictureInfo( const Picture& pic, AccessUnitList& accessUnit, } } +void EncGOP::xForceScc( Picture& pic ) +{ + if( pic.gopEntry->m_isStartOfGop ) + { + m_forceSCC = pic.m_picShared->m_forceSCC; + } + if( m_forceSCC && (!pic.isSccStrong || !pic.isSccWeak) ) + { + pic.isSccStrong = true; + pic.isSccWeak = true; + pic.setSccFlags(m_pcEncCfg); + } +} + } // namespace vvenc //! 
\} diff --git a/source/Lib/EncoderLib/EncGOP.h b/source/Lib/EncoderLib/EncGOP.h index 796b21bac..633182318 100644 --- a/source/Lib/EncoderLib/EncGOP.h +++ b/source/Lib/EncoderLib/EncGOP.h @@ -150,6 +150,7 @@ class EncGOP : public EncStage std::deque m_globalApsList; std::vector m_globalCtuQpVector; + bool m_forceSCC; public: EncGOP( MsgLog& msglog ); @@ -190,6 +191,7 @@ class EncGOP : public EncStage void xSetupPicAps ( Picture* pic ); void xInitPicsInCodingOrder ( const PicList& picList ); void xGetProcessingLists ( std::list& procList, std::list& rcUpdateList, const bool lockStepMode ); + void xInitGopQpCascade ( Picture& keyPic, const PicList& picList ); void xInitFirstSlice ( Picture& pic, const PicList& picList, bool isEncodeLtRef ); void xInitSliceTMVPFlag ( PicHeader* picHeader, const Slice* slice ); void xUpdateRPRtmvp ( PicHeader* picHeader, Slice* slice ); @@ -223,6 +225,8 @@ class EncGOP : public EncStage std::lock_guard lock( m_gopEncMutex ); return ( int ) m_freePicEncoderList.size() >= std::max(1, m_pcEncCfg->m_maxParallelFrames); } + void xForceScc ( Picture& pic ); + };// END CLASS DEFINITION EncGOP } // namespace vvenc diff --git a/source/Lib/EncoderLib/EncLib.cpp b/source/Lib/EncoderLib/EncLib.cpp index f189a84f3..c8280ee4c 100644 --- a/source/Lib/EncoderLib/EncLib.cpp +++ b/source/Lib/EncoderLib/EncLib.cpp @@ -131,10 +131,11 @@ void EncLib::initEncoderLib( const vvenc_config& encCfg ) #endif #if ENABLE_TIME_PROFILING - if( g_timeProfiler == nullptr ) + if( g_timeProfiler ) { - g_timeProfiler = timeProfilerCreate( encCfg ); + delete g_timeProfiler; } + g_timeProfiler = timeProfilerCreate( encCfg ); #endif } @@ -166,12 +167,17 @@ void EncLib::uninitEncoderLib() #if ENABLE_TIME_PROFILING #if ENABLE_TIME_PROFILING_MT_MODE - for( auto& p : m_threadPool->getProfilers() ) + if( m_threadPool ) { - *g_timeProfiler += *p; + for(auto& p : m_threadPool->getProfilers()) + { + *g_timeProfiler += *p; + } } #endif timeProfilerResults( g_timeProfiler ); + delete 
g_timeProfiler; + g_timeProfiler = nullptr; #endif xUninitLib(); } @@ -423,7 +429,7 @@ void EncLib::encodePicture( bool flush, const vvencYUVBuffer* yuvInBuf, AccessUn } } - PROFILER_EXT_UPDATE( g_timeProfiler, P_TOP_LEVEL, pic->TLayer ); + PROFILER_EXT_UPDATE( g_timeProfiler, P_TOP_LEVEL, 0 ); // trigger stages isQueueEmpty = m_picsRcvd > 0 || ( m_picsRcvd <= 0 && flush ); diff --git a/source/Lib/EncoderLib/EncPicture.cpp b/source/Lib/EncoderLib/EncPicture.cpp index fa1fd7592..fc0600797 100644 --- a/source/Lib/EncoderLib/EncPicture.cpp +++ b/source/Lib/EncoderLib/EncPicture.cpp @@ -83,7 +83,7 @@ void EncPicture::init( const VVEncCfg& encCfg, void EncPicture::compressPicture( Picture& pic, EncGOP& gopEncoder ) { - PROFILER_SCOPE_AND_STAGE_EXT( 1, g_timeProfiler, P_TOP_LEVEL, pic.cs, CH_L ); + PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_TOP_LEVEL, pic.cs ); ITT_TASKSTART( itt_domain_picEncoder, itt_handle_start ); pic.encTime.startTimer(); @@ -122,7 +122,7 @@ void EncPicture::compressPicture( Picture& pic, EncGOP& gopEncoder ) void EncPicture::finalizePicture( Picture& pic ) { - PROFILER_SCOPE_AND_STAGE_EXT( 1, g_timeProfiler, P_TOP_LEVEL, pic.cs, CH_L ); + PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_TOP_LEVEL, pic.cs ); CodingStructure& cs = *(pic.cs); Slice* slice = pic.slices[0]; // ALF diff --git a/source/Lib/EncoderLib/EncSlice.cpp b/source/Lib/EncoderLib/EncSlice.cpp index d8b4d074c..722c3d19d 100644 --- a/source/Lib/EncoderLib/EncSlice.cpp +++ b/source/Lib/EncoderLib/EncSlice.cpp @@ -390,6 +390,8 @@ void EncSlice::xInitSliceLambdaQP( Slice* slice ) slice->chromaQpAdjEnabled = slice->pps->chromaQpOffsetListLen > 0; } +static const int highTL[6] = { -1, 0, 0, 2, 4, 5 }; + int EncSlice::xGetQPForPicture( const Slice* slice ) { const int lumaQpBDOffset = slice->sps->qpBDOffset[ CH_L ]; @@ -401,11 +403,13 @@ int EncSlice::xGetQPForPicture( const Slice* slice ) } else { - const SliceType sliceType = slice->sliceType; - - qp = m_pcEncCfg->m_QP; + qp = 
m_pcEncCfg->m_QP + slice->pic->gopAdaptedQP; - if( sliceType == VVENC_I_SLICE ) + if (m_pcEncCfg->m_usePerceptQPA) + { + qp = (slice->isIntra() ? std::min (qp, ((qp - std::min (3, floorLog2 (m_pcEncCfg->m_GOPSize) - 4/*TODO 3 with JVET-AC0149?*/)) * 15 + 3) >> 4) : highTL[slice->TLayer] + ((qp * (16 + std::min (2u, slice->TLayer))) >> 4) + 0/*TODO +-1?*/); + } + else if( slice->isIntra() ) { qp += m_pcEncCfg->m_intraQPOffset; } @@ -735,7 +739,7 @@ void EncSlice::finishCompressSlice( Picture* pic, Slice& slice ) void EncSlice::xProcessCtus( Picture* pic, const unsigned startCtuTsAddr, const unsigned boundingCtuTsAddr ) { - PROFILER_SCOPE_AND_STAGE_EXT( 1, g_timeProfiler, P_IGNORE, pic->cs, CH_L ); + PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_IGNORE, pic->cs ); CodingStructure& cs = *pic->cs; Slice& slice = *cs.slice; const PreCalcValues& pcv = *cs.pcv; diff --git a/source/Lib/EncoderLib/EncStage.h b/source/Lib/EncoderLib/EncStage.h index caf267088..a3f396a8c 100644 --- a/source/Lib/EncoderLib/EncStage.h +++ b/source/Lib/EncoderLib/EncStage.h @@ -64,6 +64,7 @@ class PicShared GOPEntry m_gopEntry; bool m_isSccWeak; bool m_isSccStrong; + bool m_forceSCC; uint16_t m_picVisActTL0; uint16_t m_picVisActY; uint16_t m_picSpVisAct[MAX_NUM_CH]; @@ -88,6 +89,7 @@ class PicShared PicShared() : m_isSccWeak ( false ) , m_isSccStrong ( false ) + , m_forceSCC ( false ) , m_picVisActTL0 ( 0 ) , m_picVisActY ( 0 ) , m_picMemorySTA ( 0 ) @@ -137,6 +139,7 @@ class PicShared m_isSccWeak = false; m_isSccStrong = false; + m_forceSCC = false; m_picVisActTL0 = 0; m_picVisActY = 0; m_picMemorySTA = 0; diff --git a/source/Lib/EncoderLib/PreProcess.cpp b/source/Lib/EncoderLib/PreProcess.cpp index a8aca551c..e7b6a69a2 100644 --- a/source/Lib/EncoderLib/PreProcess.cpp +++ b/source/Lib/EncoderLib/PreProcess.cpp @@ -299,7 +299,7 @@ void PreProcess::xGetVisualActivity( Picture* pic, const PicList& picList ) cons uint16_t picVisActTL0 = 0; uint16_t picVisActY = 0; - if( m_doVisAct && ! 
m_doVisActQpa ) // for the time being qpa activity done on ctu basis in applyQPAdaptationSlice(), which for now sums up luma activity + if( ( m_doVisAct && !m_doVisActQpa ) || ( cappedCRF && m_encCfg->m_usePerceptQPA && pic->gopEntry->m_temporalId == 0 ) ) // for the time being qpa activity done on ctu basis in applyQPAdaptationSlice(), which for now sums up luma activity { // find previous pictures const Picture* prevPics[ NUM_QPA_PREV_FRAMES ]; @@ -441,121 +441,153 @@ void PreProcess::xDisableTempDown( Picture* pic, const PicList& picList ) void PreProcess::xDetectScc( Picture* pic ) const { - CPelUnitBuf yuvOrgBuf = pic->getOrigBuf(); - - bool isSccWeak = false; - bool isSccStrong = false; - - const int SIZE_BL = 4; - const int minLevel = 1 << (m_encCfg->m_internalBitDepth[CH_L] - (!m_encCfg->m_videoFullRangeFlag ? 4 : 6)); // 1/16th or 1/64th of range - const int K_SC = 25; - const Pel* piSrc = yuvOrgBuf.Y().buf; - const uint32_t uiStride = yuvOrgBuf.Y().stride; - const uint32_t uiWidth = yuvOrgBuf.Y().width; - const uint32_t uiHeight = yuvOrgBuf.Y().height; - int size = SIZE_BL; - unsigned hh, ww; - int SizeS = SIZE_BL << 1; + if( m_encCfg->m_forceScc > 0 ) + { + pic->isSccStrong = pic->m_picShared->m_isSccStrong = m_encCfg->m_forceScc >= 3; + pic->isSccWeak = pic->m_picShared->m_isSccWeak = m_encCfg->m_forceScc >= 2; + return; + } + + CPelBuf yuvOrgBuf = pic->getOrigBuf().Y(); + + // blocksize and threshold + static constexpr int SIZE_BL = 4; + static constexpr int K_SC = 23; + static constexpr int K_noSC = 8; + + // mean and variance fixed point accuracy + static constexpr int accM = 4; + static constexpr int accV = 2; + + static_assert( accM <= 4 && accV <= 4, "Maximum Mean and Variance accuracy of 4 allowed!" 
); + static constexpr int shfM = 4 - accM; + static constexpr int shfV = 4 + accM - accV; + static constexpr int addM = 1 << shfM >> 1; + static constexpr int addV = 1 << shfV >> 1; + + static constexpr int SizeS = SIZE_BL << 1; + + const int minLevel = 1 << ( m_encCfg->m_internalBitDepth[CH_L] - ( m_encCfg->m_videoFullRangeFlag ? 6 : 4 ) ); // 1/16th or 1/64th of range + + const Pel* piSrc = yuvOrgBuf.buf; + const uint32_t uiStride = yuvOrgBuf.stride; + const uint32_t uiWidth = yuvOrgBuf.width; + const uint32_t uiHeight = yuvOrgBuf.height; + + CHECK( ( uiWidth & 7 ) != 0 || ( uiHeight & 7 ) != 0, "Width and height have to be multiples of 8!" ); + + const int amountBlock = ( uiWidth >> 2 ) * ( uiHeight >> 2 ); + int sR[4] = { 0, 0, 0, 0 }; // strong SCC data int zR[4] = { 0, 0, 0, 0 }; // zero input data - const int amountBlock = (uiWidth >> 2) * (uiHeight >> 2); - for( hh = 0; hh < uiHeight; hh += SizeS ) + + for( int hh = 0; hh < uiHeight; hh += SizeS ) { - for( ww = 0; ww < uiWidth; ww += SizeS ) + for( int ww = 0; ww < uiWidth; ww += SizeS ) { - int Rx = ww >= (uiWidth >> 1) ? 1 : 0; - int Ry = hh >= (uiHeight >> 1) ? 1 : 0; - Ry = Ry << 1 | Rx; + int Rx = ww >= ( uiWidth >> 1 ) ? 1 : 0; + int Ry = hh >= ( uiHeight >> 1 ) ? 
2 : 0; + Ry = Ry | Rx; - int i = ww; - int j = hh; int n = 0; int Var[4]; - for( j = hh; (j < hh + SizeS) && (j < uiHeight); j += size ) + + for( int j = hh; j < hh + SizeS; j += SIZE_BL ) { - for( i = ww; (i < ww + SizeS) && (i < uiWidth); i += size ) + for( int i = ww; i < ww + SizeS; i += SIZE_BL ) { - int sum = 0; + const Pel *p0 = &piSrc[j * uiStride + i]; + int Mit = 0; - int V = 0; - int h = j; - int w = i; - for( h = j; (h < j + size) && (h < uiHeight); h++ ) + int V = 0; + + for( int h = 0; h < SIZE_BL; h++, p0 += uiStride ) { - for( w = i; (w < i + size) && (w < uiWidth); w++ ) + for( int w = 0; w < SIZE_BL; w++ ) { - sum += int(piSrc[h * uiStride + w]); + Mit += p0[w]; } } - int sizeEnd = ((h - j) * (w - i)); - Mit = sum / sizeEnd; - for( h = j; (h < j + size) && (h < uiHeight); h++ ) + + Mit = ( Mit + addM ) >> shfM; + + p0 = &piSrc[j * uiStride + i]; + + for( int h = 0; h < SIZE_BL; h++, p0 += uiStride ) { - for( w = i; (w < i + size) && (w < uiWidth); w++ ) + for( int w = 0; w < SIZE_BL; w++ ) { - V += abs(Mit - int(piSrc[h * uiStride + w])); + V += abs( Mit - ( int( p0[w] ) << accM ) ); } } - // Variance in Block (SIZE_BL*SIZE_BL) - if (V < sizeEnd && Mit <= minLevel) + + // if variance is lower than 1 and mean is lower/equal to minLevel + if( V < ( 1 << ( accM + 4 ) ) && Mit <= ( minLevel << accM ) ) { Var[n] = -1; } else { - Var[n] = V / sizeEnd; + Var[n] = ( V + addV ) >> shfV; } + n++; } } + for( int i = 0; i < 2; i++ ) { - if( Var[i] == Var[i + 2] ) + const int var0 = Var[ i]; + const int var1 = Var[ i + 2]; + const int var2 = Var[ i << 1]; + const int var3 = Var[(i << 1) + 1]; + + if( var0 < 0 && var1 < 0 && zR[Ry] * 20 < amountBlock ) { - if( Var[i] < 0 && zR[Ry] * 20 < amountBlock ) - { - zR[Ry]++; - } - else - { - sR[Ry]++; - } + zR[Ry]++; } - if( Var[i << 1] == Var[(i << 1) + 1] ) + else if( var0 == var1 ) { - if( Var[i << 1] < 0 && zR[Ry] * 20 < amountBlock ) - { - zR[Ry]++; - } - else - { - sR[Ry]++; - } + sR[Ry]++; + } + + if( var2 < 0 
&& var3 < 0 && zR[Ry] * 20 < amountBlock ) + { + zR[Ry]++; + } + else if( var2 == var3 ) + { + sR[Ry]++; } } } } - int s = 0; - isSccStrong = true; - size = 0; + + bool isSccWeak = false; + bool isSccStrong = false; + bool isNoSccStrong = false; + + int numAll = 0; + int numMin = amountBlock, numMax = 0; + int numBelow = 0; + for( int r = 0; r < 4; r++ ) { - s += sR[r]; - if (size < sR[r]) // find peak quarter - { - size = sR[r]; - } - if ((sR[r] * 100 / (amountBlock >> 2)) <= K_SC) - { - isSccStrong = false; - } - } - isSccWeak = ((s * 100 / amountBlock) > K_SC); - if (isSccWeak && (size * 93 / (amountBlock >> 1)) > K_SC) - { - isSccStrong = true; // peak quarter is above 2.15*K_SC threshold + numAll += sR[r]; + numMax = std::max( numMax, sR[r] ); + numMin = std::min( numMin, sR[r] ); + numBelow += sR[r] * 100 <= K_SC * ( amountBlock >> 2 ) ? 1 : 0; } + // lowest quarter is above K_SC threshold + isSccStrong = numMin * 100 > K_SC * ( amountBlock >> 2 ); + // lowest quarter is below K_noSC threshold and there's more than one quarter below K_SC threshold + isNoSccStrong = numMin * 100 <= K_noSC * ( amountBlock >> 2 ) && numBelow > 1; + // overall is above K_SC threshold + isSccWeak = numAll * 100 > K_SC * amountBlock; + // peak quarter is above 2.15*K_SC threshold + isSccStrong |= isSccWeak && !isNoSccStrong && numMax * 186 > K_SC * amountBlock; + PicShared* picShared = pic->m_picShared; pic->isSccWeak = isSccWeak; pic->isSccStrong = isSccStrong; diff --git a/source/Lib/EncoderLib/RateCtrl.cpp b/source/Lib/EncoderLib/RateCtrl.cpp index 43b2b6c3b..8f261371b 100644 --- a/source/Lib/EncoderLib/RateCtrl.cpp +++ b/source/Lib/EncoderLib/RateCtrl.cpp @@ -1285,7 +1285,7 @@ void RateCtrl::initRateControlPic( Picture& pic, Slice* slice, int& qp, double& tmpVal = updateQPstartModelVal() + log (sqrOfResRatio) / log (2.0); // GOP's QPstart d /= (double)it->numBits; d = firstPassSliceQP - ( 105.0 / 128.0 ) * sqrt( (double)std::max( 1, firstPassSliceQP ) ) * log( d ) / log( 2.0 ); 
- sliceQP = int( 0.5 + d + ( it->isIntra && m_pcEncCfg->m_HdrMode != vvencHDRMode::VVENC_HDR_OFF ? 0.375 : 0.5 ) * std::max( 0.0, tmpVal - d ) + encRCSeq->qpCorrection[ frameLevel ] ); + sliceQP = int( 0.5 + d + ( it->isIntra ? 0.375 : 0.5 ) * std::max( 0.0, tmpVal - d ) + encRCSeq->qpCorrection[ frameLevel ] ); encRcPic->clipTargetQP( getPicList(), ( m_pcEncCfg->m_LookAhead ? getBaseQP() : m_pcEncCfg->m_QP ) + ( it->isIntra ? m_pcEncCfg->m_intraQPOffset : 0 ), 5 - budgetRelaxScale, ( it->poc < encRCSeq->gopSize ? 0 : ( m_pcEncCfg->m_maxTLayer + 1 ) >> 1 ), sqrOfResRatio, sliceQP, &encRCSeq->lastAverageQP ); diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h index b6f51f0f9..7f28a2936 100644 --- a/source/Lib/apputils/VVEncAppCfg.h +++ b/source/Lib/apputils/VVEncAppCfg.h @@ -858,6 +858,7 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("AddGOP32refPics", c->m_addGOP32refPics, "Use different QP offsets and reference pictures in GOP structure") ("NumRefPics", c->m_numRefPics, "Number of reference pictures in RPL (0: default for RPL, <10: apply for all temporal layers, >=10: each decimal digit specifies the number for a temporal layer, last digit applying to the highest TL)" ) ("NumRefPicsSCC", c->m_numRefPicsSCC, "Number of reference pictures in RPL for SCC pictures (semantic analogue to NumRefPics, -1: equal to NumRefPics)" ) + ("ForceSCC", c->m_forceScc, "Force SCC treatment, instead of detection (<=0: use detection, 1: treat all frames as not SCC, 2: treat all frames as weak SCC, 3: treat all frames as strong SCC)" ) ; opts.setSubSection("Low-level QT-BTT partitioning options"); diff --git a/source/Lib/vvenc/CMakeLists.txt b/source/Lib/vvenc/CMakeLists.txt index b224d9f76..3cdbca21c 100644 --- a/source/Lib/vvenc/CMakeLists.txt +++ b/source/Lib/vvenc/CMakeLists.txt @@ -29,8 +29,8 @@ if( VVENC_ENABLE_X86_SIMD ) # get x86 include files file( GLOB X86_INC_FILES "../CommonLib/x86/*.h" ) - ## get avx source 
files - #file( GLOB AVX_SRC_FILES "../CommonLib/x86/avx/*.cpp" ) + # get avx source files + file( GLOB AVX_SRC_FILES "../CommonLib/x86/avx/*.cpp" ) # get avx2 source files file( GLOB AVX2_SRC_FILES "../CommonLib/x86/avx2/*.cpp" ) @@ -38,8 +38,8 @@ if( VVENC_ENABLE_X86_SIMD ) # get sse4.1 source files file( GLOB SSE41_SRC_FILES "../CommonLib/x86/sse41/*.cpp" ) - ## get sse4.2 source files - #file( GLOB SSE42_SRC_FILES "../CommonLib/x86/sse42/*.cpp" ) + # get sse4.2 source files + file( GLOB SSE42_SRC_FILES "../CommonLib/x86/sse42/*.cpp" ) endif() if( VVENC_ENABLE_ARM_SIMD ) @@ -93,31 +93,30 @@ set( CMAKE_VISIBILITY_INLINES_HIDDEN TRUE ) if( VVENC_ENABLE_X86_SIMD ) # set needed compile definitions set_property( SOURCE ${SSE41_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_SSE41 ) - #set_property( SOURCE ${SSE42_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_SSE42 ) - #set_property( SOURCE ${AVX_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX ) + set_property( SOURCE ${SSE42_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_SSE42 ) + set_property( SOURCE ${AVX_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX ) set_property( SOURCE ${AVX2_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX2 ) # set needed compile flags if( MSVC ) - #set_property( SOURCE ${AVX_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX" ) + set_property( SOURCE ${AVX_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX" ) set_property( SOURCE ${AVX2_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) elseif( UNIX OR MINGW ) include( vvencCompilerSupport ) set_if_compiler_supports_flag( FLAG_mxsave -mxsave ) set_if_compiler_supports_flag( FLAG_msse41 -msse4.1 ) - #set_if_compiler_supports_flag( FLAG_msse42 -msse4.2 ) - #set_if_compiler_supports_flag( FLAG_mavx -mavx ) + set_if_compiler_supports_flag( FLAG_msse42 -msse4.2 ) + set_if_compiler_supports_flag( FLAG_mavx -mavx ) set_if_compiler_supports_flag( FLAG_mavx2 -mavx2 ) set_property( SOURCE ${X86_SRC_FILES} APPEND 
PROPERTY COMPILE_FLAGS ${FLAG_mxsave} ) set_property( SOURCE ${SSE41_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_msse41}" ) - #set_property( SOURCE ${SSE42_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_msse42}" ) - #set_property( SOURCE ${AVX_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx}" ) + set_property( SOURCE ${SSE42_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_msse42}" ) + set_property( SOURCE ${AVX_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx}" ) set_property( SOURCE ${AVX2_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx2}" ) endif() - #add_library( ${LIB_NAME}_x86_simd OBJECT ${SSE41_SRC_FILES} ${SSE42_SRC_FILES} ${AVX_SRC_FILES} ${AVX2_SRC_FILES} ) - add_library( ${LIB_NAME}_x86_simd OBJECT ${SSE41_SRC_FILES} ${AVX2_SRC_FILES} ) + add_library( ${LIB_NAME}_x86_simd OBJECT ${SSE41_SRC_FILES} ${SSE42_SRC_FILES} ${AVX_SRC_FILES} ${AVX2_SRC_FILES} ) # disble LTO for the files compiled with special architecture flags set_target_properties( ${LIB_NAME}_x86_simd PROPERTIES INTERPROCEDURAL_OPTIMIZATION OFF diff --git a/source/Lib/vvenc/vvencCfg.cpp b/source/Lib/vvenc/vvencCfg.cpp index 982d2b9e2..eb98284af 100644 --- a/source/Lib/vvenc/vvencCfg.cpp +++ b/source/Lib/vvenc/vvencCfg.cpp @@ -687,8 +687,9 @@ VVENC_DECL void vvenc_config_default(vvenc_config *c ) c->m_FirstPassMode = 0; + c->m_forceScc = 0; + c->m_reservedFlag = false; - c->m_reservedInt = 0; memset( c->m_reservedDouble, 0, sizeof(c->m_reservedDouble) ); // init default preset @@ -760,9 +761,11 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) { vvenc_confirmParameter( c, c->m_bufferingPeriodSEIEnabled, "Enabling bufferingPeriod SEI requires rate control" ); vvenc_confirmParameter( c, c->m_pictureTimingSEIEnabled, "Enabling pictureTiming SEI requires rate control" ); - vvenc_confirmParameter( c, c->m_RCMaxBitrate > 0, "Specifying a maximum bitrate requires rate control" ); + vvenc_confirmParameter( c, c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != 
INT32_MAX && !c->m_usePerceptQPA, "Enabling capped CQF requires PerceptQPA to be enabled" ); + vvenc_confirmParameter( c, c->m_RCMaxBitrate > 0 && c->m_RCInitialQP > 0, "Specifying an RCInitialQP value requires rate control" ); + vvenc_confirmParameter( c, c->m_RCMaxBitrate < 0, "Cannot specify a relative max rate when using QCF, please specify an absolute value" ); } - else if ( c->m_RCMaxBitrate == 0 ) + if( c->m_RCMaxBitrate == 0 ) { c->m_RCMaxBitrate = INT32_MAX; } @@ -804,8 +807,8 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) } } - const double d = (3840.0 * 2160.0) / double (c->m_SourceWidth * c->m_SourceHeight); - const int rcQP = (c->m_RCInitialQP > 0 ? std::min (vvenc::MAX_QP, c->m_RCInitialQP) : std::max (0, vvenc::MAX_QP_PERCEPT_QPA - (c->m_FirstPassMode > 2 ? 4 : 2) - int (0.5 + sqrt ((d * std::max (0, c->m_RCTargetBitrate)) / 500000.0)))); + const double d = (c->m_RCTargetBitrate != VVENC_RC_OFF ? 1.0 : 2.25) * (3840.0 * 2160.0) / double (c->m_SourceWidth * c->m_SourceHeight); + const int rcQP = (c->m_RCInitialQP > 0 ? std::min (vvenc::MAX_QP, c->m_RCInitialQP) : std::max (0, vvenc::MAX_QP_PERCEPT_QPA - (c->m_FirstPassMode > 2 ? 4 : 2) - int (0.5 + sqrt ((d * std::max (0, (c->m_RCTargetBitrate != VVENC_RC_OFF ? c->m_RCTargetBitrate : c->m_RCMaxBitrate))) / 500000.0)))); // TODO 2.0: make this an error //vvenc_confirmParameter( c, c->m_RCTargetBitrate != VVENC_RC_OFF && c->m_QP != VVENC_AUTO_QP && c->m_QP != VVENC_DEFAULT_QP, "Rate-control and QP based encoding are mutually exclusive!" 
); @@ -823,6 +826,7 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) vvenc_confirmParameter( c, c->m_RCTargetBitrate != VVENC_RC_OFF && ( c->m_RCTargetBitrate < 0 || c->m_RCTargetBitrate > 800000000 ), "TargetBitrate must be between 0 and 800000000" ); vvenc_confirmParameter( c, c->m_RCTargetBitrate != VVENC_RC_OFF && (int64_t) c->m_RCMaxBitrate * 2 < (int64_t) c->m_RCTargetBitrate * 3, "MaxBitrate must be at least 1.5*TargetBitrate" ); + vvenc_confirmParameter( c, c->m_RCTargetBitrate == VVENC_RC_OFF && c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX && rcQP + sqrt (c->m_FrameRate / (double) c->m_FrameScale) > c->m_QP + 10.125, "Capped CQF is used and MaxBitrate is too low for specified QP and frame rate/scale" ); vvenc_confirmParameter( c, c->m_RCTargetBitrate != VVENC_RC_OFF && ( c->m_FirstPassMode < 0 || c->m_FirstPassMode > 4 ), "FirstPassMode must be 0, 1, 2, 3, or 4" ); if ( c->m_internChromaFormat < 0 || c->m_internChromaFormat >= VVENC_NUM_CHROMA_FORMAT ) @@ -2619,7 +2623,7 @@ VVENC_DECL int vvenc_init_preset( vvenc_config *c, vvencPresetMode preset ) c->m_MinQT[ 0 ] = 8; c->m_MinQT[ 1 ] = 8; c->m_MinQT[ 2 ] = 4; - c->m_maxMTTDepth = 221111; + c->m_maxMTTDepth = 1; c->m_maxMTTDepthI = 2; // speedups @@ -2628,7 +2632,7 @@ VVENC_DECL int vvenc_init_preset( vvenc_config *c, vvencPresetMode preset ) c->m_contentBasedFastQtbt = true; c->m_fastHad = false; c->m_usePbIntraFast = 1; - c->m_useFastMrg = 3; + c->m_useFastMrg = 2; c->m_fastLocalDualTreeMode = 1; c->m_fastSubPel = 1; c->m_FastIntraTools = 1; @@ -2639,7 +2643,7 @@ VVENC_DECL int vvenc_init_preset( vvenc_config *c, vvencPresetMode preset ) c->m_numIntraModesFullRD = -1; c->m_reduceIntraChromaModesFullRD = true; c->m_meReduceTap = 2; - c->m_numRefPics = 222111; + c->m_numRefPics = 222221; c->m_numRefPicsSCC = 0; // tools @@ -2976,17 +2980,23 @@ VVENC_DECL const char* vvenc_get_config_as_string( vvenc_config *c, vvencMsgLeve } else css << "single-pass"; - if( c->m_RCMaxBitrate > 0 
&& c->m_RCMaxBitrate != INT32_MAX ) + } + else + { + css << "QP " << c->m_QP; + } + if( c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX ) + { + if ( c->m_RCTargetBitrate <= 0 ) { - if( c->m_RCMaxBitrate < 1000000 ) - css << " (max. rate " << (double)c->m_RCMaxBitrate/1000.0 << " kbps)"; - else - css << " (max. rate " << (double)c->m_RCMaxBitrate/1000000.0 << " Mbps)"; + css << " capped CQF"; } - css << "\n"; + if( c->m_RCMaxBitrate < 1000000 ) + css << " (max. rate " << (double)c->m_RCMaxBitrate/1000.0 << " kbps)"; + else + css << " (max. rate " << (double)c->m_RCMaxBitrate/1000000.0 << " Mbps)"; } - else - css << "QP " << c->m_QP << "\n"; + css << "\n"; css << loglvl << "Perceptual optimization : " << (c->m_usePerceptQPA ? "Enabled" : "Disabled") << "\n"; css << loglvl << "Intra period (keyframe) : " << c->m_IntraPeriod << "\n"; @@ -3186,16 +3196,23 @@ VVENC_DECL const char* vvenc_get_config_as_string( vvenc_config *c, vvencMsgLeve css << "Passes:" << c->m_RCNumPasses << " "; css << "Pass:" << c->m_RCPass << " "; css << "TargetBitrate:" << c->m_RCTargetBitrate << " "; - if ( c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX ) + if ( c->m_RCInitialQP > 0 ) { - css << "MaxBitrate:" << c->m_RCMaxBitrate << " "; + css << "RCInitialQP:" << c->m_RCInitialQP << " "; } - css << "RCInitialQP:" << c->m_RCInitialQP << " "; } else { css << "QP:" << c->m_QP << " "; } + if ( c->m_RCMaxBitrate > 0 && c->m_RCMaxBitrate != INT32_MAX ) + { + if ( c->m_RCTargetBitrate <= 0 ) + { + css << "(capped CQF) "; + } + css << "MaxBitrate:" << c->m_RCMaxBitrate << " "; + } css << "LookAhead:" << c->m_LookAhead << " "; css << "FirstPassMode:" << c->m_FirstPassMode << " "; diff --git a/thirdparty/simde/x86/avx2.h b/thirdparty/simde/x86/avx2.h index 1247b5193..9fd0d9490 100644 --- a/thirdparty/simde/x86/avx2.h +++ b/thirdparty/simde/x86/avx2.h @@ -4080,7 +4080,7 @@ simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / 
sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; + r_.i8[i] = (b_.i8[i] == INT8_C(0)) ? INT8_C(0) : (b_.i8[i] < INT8_C(0)) ? -a_.i8[i] : a_.i8[i]; } return simde__m256i_from_private(r_); @@ -4104,7 +4104,7 @@ simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i]; + r_.i16[i] = (b_.i16[i] == INT16_C(0)) ? INT16_C(0) : (b_.i16[i] < INT16_C(0)) ? -a_.i16[i] : a_.i16[i]; } return simde__m256i_from_private(r_); @@ -4128,7 +4128,7 @@ simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { - r_.i32[i] = (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i]; + r_.i32[i] = (b_.i32[i] == INT32_C(0)) ? INT32_C(0) : (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i]; } return simde__m256i_from_private(r_);