From 47cbd9e1c7bc78fc9d42ca2febcb8072a15ecded Mon Sep 17 00:00:00 2001 From: Adam Wieckowski Date: Tue, 19 Dec 2023 13:00:38 +0100 Subject: [PATCH] Initial NEON code, improved stats output, allowing maxrate setting as factor --- AUTHORS.md | 1 + CMakeLists.txt | 19 +- include/vvenc/vvencCfg.h | 6 +- source/App/vvencFFapp/EncApp.cpp | 4 +- source/App/vvencapp/vvencapp.cpp | 4 +- source/Lib/CommonLib/AdaptiveLoopFilter.h | 2 + source/Lib/CommonLib/AffineGradientSearch.h | 2 + source/Lib/CommonLib/Buffer.h | 19 + source/Lib/CommonLib/CommonDef.h | 69 +++- source/Lib/CommonLib/DepQuant.cpp | 2 +- source/Lib/CommonLib/InterPrediction.h | 2 + source/Lib/CommonLib/InterpolationFilter.cpp | 8 +- source/Lib/CommonLib/InterpolationFilter.h | 9 + source/Lib/CommonLib/IntraPrediction.h | 2 + source/Lib/CommonLib/LoopFilter.cpp | 9 - source/Lib/CommonLib/LoopFilter.h | 1 + source/Lib/CommonLib/MCTF.h | 2 + source/Lib/CommonLib/Quant.h | 2 + source/Lib/CommonLib/QuantRDOQ2.cpp | 2 +- source/Lib/CommonLib/RdCost.cpp | 3 + source/Lib/CommonLib/RdCost.h | 22 +- source/Lib/CommonLib/SampleAdaptiveOffset.h | 2 + source/Lib/CommonLib/TrQuant.h | 2 + source/Lib/CommonLib/TrQuant_EMT.h | 4 +- source/Lib/CommonLib/arm/BufferARM.h | 385 ++++++++++++++++++ source/Lib/CommonLib/arm/CommonDefARM.cpp | 64 +++ source/Lib/CommonLib/arm/CommonDefARM.h | 66 +++ source/Lib/CommonLib/arm/InitARM.cpp | 115 ++++++ .../CommonLib/arm/InterpolationFilterARM.h | 295 ++++++++++++++ source/Lib/CommonLib/arm/RdCostARM.h | 265 ++++++++++++ source/Lib/CommonLib/arm/neon/Buffer_neon.cpp | 43 ++ .../arm/neon/InterpolationFilter_neon.cpp | 43 ++ source/Lib/CommonLib/arm/neon/RdCost_neon.cpp | 43 ++ .../Lib/CommonLib/x86/AdaptiveLoopFilterX86.h | 2 + source/Lib/CommonLib/x86/CommonDefX86.h | 5 +- .../Lib/EncoderLib/EncAdaptiveLoopFilter.cpp | 8 +- source/Lib/apputils/IStreamIO.h | 21 +- source/Lib/apputils/Stats.h | 60 ++- source/Lib/apputils/VVEncAppCfg.h | 20 +- source/Lib/vvenc/CMakeLists.txt | 35 +- 
source/Lib/vvenc/vvencCfg.cpp | 6 +- source/Lib/vvenc/vvencimpl.cpp | 19 +- 42 files changed, 1615 insertions(+), 78 deletions(-) create mode 100644 source/Lib/CommonLib/arm/BufferARM.h create mode 100644 source/Lib/CommonLib/arm/CommonDefARM.cpp create mode 100644 source/Lib/CommonLib/arm/CommonDefARM.h create mode 100644 source/Lib/CommonLib/arm/InitARM.cpp create mode 100644 source/Lib/CommonLib/arm/InterpolationFilterARM.h create mode 100644 source/Lib/CommonLib/arm/RdCostARM.h create mode 100644 source/Lib/CommonLib/arm/neon/Buffer_neon.cpp create mode 100644 source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp create mode 100644 source/Lib/CommonLib/arm/neon/RdCost_neon.cpp diff --git a/AUTHORS.md b/AUTHORS.md index ea8c981e1..e9b601d1a 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -12,3 +12,4 @@ * Christian Stoffers, , Fraunhofer HHI * Gabriel Hege, , Fraunhofer HHI * Jens Güther, , Fraunhofer HHI +* Florian Eisenreich, , Fraunhofer HHI diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fb95898b..5565dcb4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,8 +21,18 @@ endif() set( CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules" ) message( STATUS "CMAKE_MODULE_PATH: updating module path to: ${CMAKE_MODULE_PATH}" ) +# check for arm architecture support +set( VVENC_ARM_SIMD_DEFAULT FALSE ) +if( ( "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64\|arm" + OR "${CMAKE_CXX_COMPILER}" MATCHES "aarch64\|arm" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64\|armv" ) + AND NOT "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86\|x64" ) + set( VVENC_ARM_SIMD_DEFAULT TRUE ) +endif() + # we enable x86 intrinsics for all target architectures, because they are implemented through simd-everywhere on non-x86 set( VVENC_ENABLE_X86_SIMD TRUE CACHE BOOL "enable x86 intrinsics" ) +set( VVENC_ENABLE_ARM_SIMD ${VVENC_ARM_SIMD_DEFAULT} CACHE BOOL "enable ARM intrinsics" ) include( vvencCompilerSupport ) @@ -39,8 +49,13 @@ if( VVENC_ENABLE_X86_SIMD ) check_missing_intrinsics() 
endif() - set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_SIMD_X86" ) - set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTARGET_SIMD_X86" ) + message( STATUS "x86 SIMD intrinsics enabled (using SIMDE for non-x86 targets)" ) + add_compile_definitions( TARGET_SIMD_X86 ) +endif() + +if( VVENC_ENABLE_ARM_SIMD ) + message( STATUS "ARM SIMD intrinsics enabled" ) + add_compile_definitions( TARGET_SIMD_ARM ) endif() if( NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR ) diff --git a/include/vvenc/vvencCfg.h b/include/vvenc/vvencCfg.h index e379b2c78..83e1c556d 100644 --- a/include/vvenc/vvencCfg.h +++ b/include/vvenc/vvencCfg.h @@ -769,7 +769,11 @@ typedef struct vvenc_config int8_t m_sliceTypeAdapt; // enable slice type adaptation (STA) bool m_treatAsSubPic; - int m_RCMaxBitrate; // maximum bitrate in bps (default: 0 (RC disabled or least constrained VBR)) +#define VVENC_SET_MAXRATE_FACTOR(f) (-((int)(f*16+0.5))) + int m_RCMaxBitrate; // maximum bitrate in bps (default: 0 (RC disabled or least constrained VBR), + // if negative, the absolute value is interpreted as a 4-bit fixed point multiplier of the target bitrate). + // -24, i.e. -1.1000 binary, means the maxrate would be set to be the 1.5x of the target bitrate. + // for convenience use VVENC_SET_MAXRATE_FACTOR, e.g. VVENC_SET_MAXRATE_FACTOR(1.5), to set the multiplier int m_reservedInt; double m_reservedDouble[9]; diff --git a/source/App/vvencFFapp/EncApp.cpp b/source/App/vvencFFapp/EncApp.cpp index a6a449d45..88b558bcb 100644 --- a/source/App/vvencFFapp/EncApp.cpp +++ b/source/App/vvencFFapp/EncApp.cpp @@ -307,7 +307,7 @@ int EncApp::encode() int64_t frameCount = apputils::VVEncAppCfg::getFrameCount( appCfg.m_inputFileName, vvencCfg.m_SourceWidth, vvencCfg.m_SourceHeight, vvencCfg.m_inputBitDepth[0], appCfg.m_packedYUVInput ); frameCount = std::max( 0, frameCount-appCfg.m_FrameSkip ); int64_t framesToEncode = (vvencCfg.m_framesToBeEncoded == 0 || vvencCfg.m_framesToBeEncoded >= frameCount) ? 
frameCount : vvencCfg.m_framesToBeEncoded; - cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " ); + cStats.init( vvencCfg.m_FrameRate, vvencCfg.m_FrameScale, (int)framesToEncode, vvencCfg.m_verbosity, "vvenc [info]: " ); bool statsInfoReady = false; // loop over input YUV data @@ -370,6 +370,7 @@ int EncApp::encode() if( statsInfoReady ) { msgApp( VVENC_INFO, cStats.getInfoString().c_str() ); + fflush( stdout ); } } @@ -383,6 +384,7 @@ int EncApp::encode() if( appCfg.m_printStats ) { msgApp( VVENC_INFO, cStats.getFinalStats().c_str() ); + fflush( stdout ); } } diff --git a/source/App/vvencapp/vvencapp.cpp b/source/App/vvencapp/vvencapp.cpp index 19612ab1a..39a644b84 100644 --- a/source/App/vvencapp/vvencapp.cpp +++ b/source/App/vvencapp/vvencapp.cpp @@ -357,7 +357,7 @@ int main( int argc, char* argv[] ) int64_t framesToEncode = (vvenccfg.m_framesToBeEncoded == 0 || vvenccfg.m_framesToBeEncoded >= frameCount) ? frameCount : vvenccfg.m_framesToBeEncoded; apputils::Stats cStats; - cStats.init( vvenccfg.m_FrameRate, vvenccfg.m_FrameScale, (int)framesToEncode, "vvenc [info]: " ); + cStats.init( vvenccfg.m_FrameRate, vvenccfg.m_FrameScale, (int)framesToEncode, vvenccfg.m_verbosity, "vvenc [info]: " ); bool statsInfoReady = false; while( !bEof || !bEncodeDone ) @@ -404,6 +404,7 @@ int main( int argc, char* argv[] ) if( statsInfoReady ) { msgApp( nullptr, VVENC_INFO, cStats.getInfoString().c_str() ); + fflush( stdout ); } } @@ -426,6 +427,7 @@ int main( int argc, char* argv[] ) if( vvencappCfg.m_printStats ) { msgApp( nullptr, VVENC_INFO, cStats.getFinalStats().c_str() ); + fflush( stdout ); } } diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.h b/source/Lib/CommonLib/AdaptiveLoopFilter.h index 1aa271491..ad3e8d2b5 100644 --- a/source/Lib/CommonLib/AdaptiveLoopFilter.h +++ b/source/Lib/CommonLib/AdaptiveLoopFilter.h @@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE. 
namespace vvenc { +using namespace x86_simd; + struct AlfClassifier { AlfClassifier() {} diff --git a/source/Lib/CommonLib/AffineGradientSearch.h b/source/Lib/CommonLib/AffineGradientSearch.h index f06995157..ca1a07d6c 100644 --- a/source/Lib/CommonLib/AffineGradientSearch.h +++ b/source/Lib/CommonLib/AffineGradientSearch.h @@ -53,6 +53,8 @@ namespace vvenc { //! \ingroup CommonLib //! \{ +using namespace x86_simd; + class AffineGradientSearch { public: diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h index f86cc846a..adf927e40 100644 --- a/source/Lib/CommonLib/Buffer.h +++ b/source/Lib/CommonLib/Buffer.h @@ -66,6 +66,9 @@ struct vvencYUVBuffer; namespace vvenc { +using namespace x86_simd; +using namespace arm_simd; + // --------------------------------------------------------------------------- // AreaBuf struct // --------------------------------------------------------------------------- @@ -81,6 +84,22 @@ struct PelBufferOps template void _initPelBufOpsX86(); #endif + +#if ENABLE_SIMD_OPT_BUFFER && defined( TARGET_SIMD_ARM ) + void initPelBufOpsARM(); + template + void _initPelBufOpsARM(); +#endif + +#define INCX( ptr, stride ) { ptr++; } +#define INCY( ptr, stride ) { ptr += ( stride ); } +#define OFFSETX( ptr, stride, x ) { ptr += ( x ); } +#define OFFSETY( ptr, stride, y ) { ptr += ( y ) * ( stride ); } +#define OFFSET( ptr, stride, x, y ) { ptr += ( x ) + ( y ) * ( stride ); } +#define GET_OFFSETX( ptr, stride, x ) ( ( ptr ) + ( x ) ) +#define GET_OFFSETY( ptr, stride, y ) ( ( ptr ) + ( y ) * ( stride ) ) +#define GET_OFFSET( ptr, stride, x, y ) ( ( ptr ) + ( x ) + ( y ) * ( stride ) ) // need in loopFilter.cpp + some ARM files + void ( *roundGeo ) ( const Pel* src, Pel* dest, const int numSamples, unsigned rshift, int offset, const ClpRng &clpRng); void ( *addAvg ) ( const Pel* src0, const Pel* src1, Pel* dst, int numsamples, unsigned shift, int offset, const ClpRng& clpRng ); void ( *reco ) ( const Pel* src0, const Pel* src1, 
Pel* dst, int numSamples, const ClpRng& clpRng ); diff --git a/source/Lib/CommonLib/CommonDef.h b/source/Lib/CommonLib/CommonDef.h index 178c9b139..e4b7e0329 100644 --- a/source/Lib/CommonLib/CommonDef.h +++ b/source/Lib/CommonLib/CommonDef.h @@ -636,22 +636,6 @@ static inline T* aligned_malloc(size_t len, size_t alignement) { # define ALWAYS_INLINE #endif -#ifdef TARGET_SIMD_X86 -typedef enum -{ - UNDEFINED = -1, - SCALAR = 0, - SSE41, - SSE42, - AVX, - AVX2, - AVX512 -} X86_VEXT; -#endif - -template inline ValueType leftShiftU (const ValueType value, const unsigned shift) { return value << shift; } -template inline ValueType rightShiftU (const ValueType value, const unsigned shift) { return value >> shift; } - #if defined( _WIN32 ) && defined( TARGET_SIMD_X86 ) static inline unsigned int bit_scan_reverse( int a ) { @@ -672,6 +656,59 @@ static inline unsigned int bit_scan_reverse( int a ) } #endif +#if ENABLE_SIMD_LOG2 +static inline int getLog2( int val ) +{ + return bit_scan_reverse( val ); +} +#else +extern int8_t g_aucLog2[MAX_CU_SIZE + 1]; +static inline int getLog2( int val ) +{ + CHECKD( g_aucLog2[2] != 1, "g_aucLog2[] has not been initialized yet." 
); + if( val > 0 && val < (int) sizeof( g_aucLog2 ) ) + { + return g_aucLog2[val]; + } + return std::log2( val ); +} +#endif + +#if ENABLE_SIMD_OPT + +namespace x86_simd +{ +#ifdef TARGET_SIMD_X86 + typedef enum + { + UNDEFINED = -1, + SCALAR = 0, + SSE41, + SSE42, + AVX, + AVX2, + AVX512 + } X86_VEXT; +#endif +} + +namespace arm_simd +{ +#ifdef TARGET_SIMD_ARM + typedef enum + { + UNDEFINED = -1, + SCALAR = 0, + NEON, + } ARM_VEXT; +#endif // TARGET_SIMD_ARM +} // namespace arm_simd + +#endif //ENABLE_SIMD_OPT + +template inline ValueType leftShiftU (const ValueType value, const unsigned shift) { return value << shift; } +template inline ValueType rightShiftU (const ValueType value, const unsigned shift) { return value >> shift; } + #if ENABLE_SIMD_LOG2 && defined( TARGET_SIMD_X86 ) static inline int floorLog2( int val ) { diff --git a/source/Lib/CommonLib/DepQuant.cpp b/source/Lib/CommonLib/DepQuant.cpp index 8f883a8ec..67ecb3855 100644 --- a/source/Lib/CommonLib/DepQuant.cpp +++ b/source/Lib/CommonLib/DepQuant.cpp @@ -1518,7 +1518,7 @@ namespace DQIntern #if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 ) // if more than one 4x4 coding subblock is available, use SIMD to find first subblock with coefficient larger than threshold - if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > SCALAR ) + if( firstTestPos >= 16 && tuPars.m_log2SbbWidth == 2 && tuPars.m_log2SbbHeight == 2 && read_x86_extension_flags() > x86_simd::SCALAR ) { const int sbbSize = tuPars.m_sbbSize; // move the pointer to the beginning of the current subblock diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h index 7d02c1949..7660fc018 100644 --- a/source/Lib/CommonLib/InterPrediction.h +++ b/source/Lib/CommonLib/InterPrediction.h @@ -56,6 +56,8 @@ POSSIBILITY OF SUCH DAMAGE. 
namespace vvenc { +using namespace x86_simd; + // forward declaration class Mv; diff --git a/source/Lib/CommonLib/InterpolationFilter.cpp b/source/Lib/CommonLib/InterpolationFilter.cpp index 20b522582..112090b75 100644 --- a/source/Lib/CommonLib/InterpolationFilter.cpp +++ b/source/Lib/CommonLib/InterpolationFilter.cpp @@ -1071,12 +1071,16 @@ void InterpolationFilter::xWeightedGeoBlk(const ClpRngs &clpRngs, const CodingUn void InterpolationFilter::initInterpolationFilter( bool enable ) { #if ENABLE_SIMD_OPT_MCIF -#ifdef TARGET_SIMD_X86 if ( enable ) { +#ifdef TARGET_SIMD_X86 initInterpolationFilterX86(); - } #endif + +#ifdef TARGET_SIMD_ARM + initInterpolationFilterARM(); +#endif + } #endif } diff --git a/source/Lib/CommonLib/InterpolationFilter.h b/source/Lib/CommonLib/InterpolationFilter.h index 6e3c83ee4..7fd05e03d 100644 --- a/source/Lib/CommonLib/InterpolationFilter.h +++ b/source/Lib/CommonLib/InterpolationFilter.h @@ -55,6 +55,9 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; +using namespace arm_simd; + #define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision #define IF_FILTER_PREC 6 ///< Log2 of sum of filter taps #define IF_INTERNAL_OFFS (1<<(IF_INTERNAL_PREC-1)) ///< Offset used internally @@ -117,6 +120,12 @@ class InterpolationFilter template void _initInterpolationFilterX86(); #endif + +#ifdef TARGET_SIMD_ARM + void initInterpolationFilterARM(); + template + void _initInterpolationFilterARM(); +#endif void filterN2_2D(const ComponentID compID, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY, const ClpRng& clpRng); void filter4x4 (const ComponentID compID, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, int fracX, int fracY, bool isLast, const ChromaFormat fmt, const ClpRng& clpRng, bool useAltHpelIf = false, int nFilterIdx = 0); diff --git a/source/Lib/CommonLib/IntraPrediction.h b/source/Lib/CommonLib/IntraPrediction.h index 
16b1aa740..78fded61b 100644 --- a/source/Lib/CommonLib/IntraPrediction.h +++ b/source/Lib/CommonLib/IntraPrediction.h @@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + // ==================================================================================================================== // Class definition // ==================================================================================================================== diff --git a/source/Lib/CommonLib/LoopFilter.cpp b/source/Lib/CommonLib/LoopFilter.cpp index d77f4e771..1738e988d 100644 --- a/source/Lib/CommonLib/LoopFilter.cpp +++ b/source/Lib/CommonLib/LoopFilter.cpp @@ -90,15 +90,6 @@ const uint8_t LoopFilter::sm_betaTable[MAX_QP + 1] = // utility functions // ==================================================================================================================== -#define INCX( ptr, stride ) { ptr++; } -#define INCY( ptr, stride ) { ptr += ( stride ); } -#define OFFSETX( ptr, stride, x ) { ptr += ( x ); } -#define OFFSETY( ptr, stride, y ) { ptr += ( y ) * ( stride ); } -#define OFFSET( ptr, stride, x, y ) { ptr += ( x ) + ( y ) * ( stride ); } -#define GET_OFFSETX( ptr, stride, x ) ( ( ptr ) + ( x ) ) -#define GET_OFFSETY( ptr, stride, y ) ( ( ptr ) + ( y ) * ( stride ) ) -#define GET_OFFSET( ptr, stride, x, y ) ( ( ptr ) + ( x ) + ( y ) * ( stride ) ) - #define BsSet( val, compIdx ) ( ( val ) << ( ( compIdx ) << 1 ) ) #define BsGet( val, compIdx ) ( ( ( val ) >> ( ( compIdx ) << 1 ) ) & 3 ) diff --git a/source/Lib/CommonLib/LoopFilter.h b/source/Lib/CommonLib/LoopFilter.h index 9e2276407..3833c25fa 100644 --- a/source/Lib/CommonLib/LoopFilter.h +++ b/source/Lib/CommonLib/LoopFilter.h @@ -54,6 +54,7 @@ POSSIBILITY OF SUCH DAMAGE. 
namespace vvenc { +using namespace x86_simd; #define DEBLOCK_SMALLEST_BLOCK 8 diff --git a/source/Lib/CommonLib/MCTF.h b/source/Lib/CommonLib/MCTF.h index 9956d83cb..2a5e3195f 100644 --- a/source/Lib/CommonLib/MCTF.h +++ b/source/Lib/CommonLib/MCTF.h @@ -54,6 +54,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + class NoMallocThreadPool; //! \ingroup EncoderLib diff --git a/source/Lib/CommonLib/Quant.h b/source/Lib/CommonLib/Quant.h index 4e1a041f5..1d136a3a8 100644 --- a/source/Lib/CommonLib/Quant.h +++ b/source/Lib/CommonLib/Quant.h @@ -56,6 +56,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + // ==================================================================================================================== // Constants // ==================================================================================================================== diff --git a/source/Lib/CommonLib/QuantRDOQ2.cpp b/source/Lib/CommonLib/QuantRDOQ2.cpp index d9a32a99f..17b69cd16 100644 --- a/source/Lib/CommonLib/QuantRDOQ2.cpp +++ b/source/Lib/CommonLib/QuantRDOQ2.cpp @@ -584,7 +584,7 @@ int QuantRDOQ2::xRateDistOptQuantFast( TransformUnit &tu, const ComponentID &com const bool scanFirstBlk = !bUseScalingList && log2CGSize == 4 && cctx.log2CGWidth() == 2; #if ENABLE_SIMD_OPT_QUANT && defined( TARGET_SIMD_X86 ) - const bool isSimd = read_x86_extension_flags() > SCALAR; + const bool isSimd = read_x86_extension_flags() > x86_simd::SCALAR; #endif int subSetId = iScanPos >> log2CGSize; diff --git a/source/Lib/CommonLib/RdCost.cpp b/source/Lib/CommonLib/RdCost.cpp index 7495fa18f..5928bf2d2 100644 --- a/source/Lib/CommonLib/RdCost.cpp +++ b/source/Lib/CommonLib/RdCost.cpp @@ -141,6 +141,9 @@ void RdCost::create() #ifdef TARGET_SIMD_X86 initRdCostX86(); #endif +#ifdef TARGET_SIMD_ARM + initRdCostARM(); +#endif #endif m_costMode = VVENC_COST_STANDARD_LOSSY; diff --git a/source/Lib/CommonLib/RdCost.h b/source/Lib/CommonLib/RdCost.h index 
b87fadfa1..eeed5e2b7 100644 --- a/source/Lib/CommonLib/RdCost.h +++ b/source/Lib/CommonLib/RdCost.h @@ -57,6 +57,9 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; +using namespace arm_simd; + class DistParam; // ==================================================================================================================== @@ -144,6 +147,12 @@ class RdCost template void _initRdCostX86(); #endif + +#ifdef TARGET_SIMD_ARM + void initRdCostARM(); + template + void _initRdCostARM(); +#endif // TARGET_SIMD_ARM void setReshapeParams ( const uint32_t* pPLUT, double chrWght) { m_reshapeLumaLevelToWeightPLUT = pPLUT; m_chromaWeight = chrWght; } void setDistortionWeight ( const ComponentID compID, const double distortionWeight ) { m_distortionWeight[compID] = distortionWeight; } @@ -201,6 +210,11 @@ class RdCost } void getMotionCostIBC(int add) { m_dCostIBC = m_dLambdaMotionSAD + add; } Distortion getBvCostMultiplePredsIBC(int x, int y, bool useIMV); + + static Distortion xGetSAD8 ( const DistParam& pcDtParam ); + static Distortion xGetSAD16 ( const DistParam& pcDtParam ); // needs to be public for xGetSAD_MxN_SIMD ( NOTE: they are all public in vvdec ) + static void xGetSAD16X5 ( const DistParam& pcDtParam, Distortion* cost, bool isCalCentrePos ); // needs to be public for xGetSADX5_16xN_SIMD ( NOTE: they are all public in vvdec ) + private: Distortion xGetSSE_WTD ( const DistParam& pcDtParam ) const; @@ -215,15 +229,12 @@ class RdCost static Distortion xGetSAD ( const DistParam& pcDtParam ); static Distortion xGetSAD4 ( const DistParam& pcDtParam ); - static Distortion xGetSAD8 ( const DistParam& pcDtParam ); - static Distortion xGetSAD16 ( const DistParam& pcDtParam ); static Distortion xGetSAD32 ( const DistParam& pcDtParam ); static Distortion xGetSAD64 ( const DistParam& pcDtParam ); static Distortion xGetSAD128 ( const DistParam& pcDtParam ); static Distortion xGetSADwMask ( const DistParam &pcDtParam ); static void xGetSAD8X5 ( const 
DistParam& pcDtParam, Distortion* cost, bool isCalCentrePos ); - static void xGetSAD16X5 ( const DistParam& pcDtParam, Distortion* cost, bool isCalCentrePos ); static Distortion xCalcHADs2x2 ( const Pel* piOrg, const Pel* piCur, int iStrideOrg, int iStrideCur ); static Distortion xGetHAD2SADs ( const DistParam& pcDtParam ); @@ -255,6 +266,11 @@ class RdCost static Distortion xGetSADwMask_SIMD( const DistParam &pcDtParam ); #endif +#ifdef TARGET_SIMD_ARM + template + static void xGetSADX5_16xN_SIMD ( const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos ); +#endif + unsigned int getBitsMultiplePredsIBC(int x, int y, bool useIMV); public: diff --git a/source/Lib/CommonLib/SampleAdaptiveOffset.h b/source/Lib/CommonLib/SampleAdaptiveOffset.h index 5b81b7f4c..2100c7478 100644 --- a/source/Lib/CommonLib/SampleAdaptiveOffset.h +++ b/source/Lib/CommonLib/SampleAdaptiveOffset.h @@ -53,6 +53,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + template static inline int sgn( T val ) { return ( T( 0 ) < val ) - ( val < T( 0 ) ); diff --git a/source/Lib/CommonLib/TrQuant.h b/source/Lib/CommonLib/TrQuant.h index 875fa004f..3b7261f0c 100644 --- a/source/Lib/CommonLib/TrQuant.h +++ b/source/Lib/CommonLib/TrQuant.h @@ -58,6 +58,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + typedef void FwdTrans(const TCoeff*, TCoeff*, int, int, int, int); typedef void InvTrans(const TCoeff*, TCoeff*, int, int, int, int, const TCoeff, const TCoeff); diff --git a/source/Lib/CommonLib/TrQuant_EMT.h b/source/Lib/CommonLib/TrQuant_EMT.h index 23c640e5c..ba1c6db04 100644 --- a/source/Lib/CommonLib/TrQuant_EMT.h +++ b/source/Lib/CommonLib/TrQuant_EMT.h @@ -52,6 +52,8 @@ POSSIBILITY OF SUCH DAMAGE. 
namespace vvenc { +using namespace x86_simd; + #if ENABLE_SIMD_TRAFO struct TCoeffOps { @@ -61,8 +63,8 @@ struct TCoeffOps void initTCoeffOpsX86(); template void _initTCoeffOpsX86(); - #endif + void( *cpyResi8 ) ( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height ); void( *cpyResi4 ) ( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height ); void( *cpyCoeff8 ) ( const Pel* src, ptrdiff_t stride, TCoeff* dst, unsigned width, unsigned height ); diff --git a/source/Lib/CommonLib/arm/BufferARM.h b/source/Lib/CommonLib/arm/BufferARM.h new file mode 100644 index 000000000..aa6a2f482 --- /dev/null +++ b/source/Lib/CommonLib/arm/BufferARM.h @@ -0,0 +1,385 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. 
+ +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +/** \file BufferARM.h + \brief SIMD averaging. +*/ + +//! \ingroup CommonLib +//! 
\{ + +#define DONT_UNDEF_SIZE_AWARE_PER_EL_OP 1 + + +#include "CommonDefARM.h" +#include "CommonLib/CommonDef.h" +#include "CommonLib/Unit.h" +#include "CommonLib/Buffer.h" +#include "CommonLib/InterpolationFilter.h" + +#if ENABLE_SIMD_OPT_BUFFER +#ifdef TARGET_SIMD_ARM + +namespace vvenc +{ + +template +void applyLut_SIMD( const Pel* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, const Pel* lut ) +{ + if( ( width & 31 ) == 0 ) + { + int16x8x4_t xtmp1; + int16x8x4_t xtmp2; + int16x8x4_t xtmp3; + int16x8x4_t xtmp4; + + for( int y = 0; y < height; y += 4 ) + { + for( int x = 0; x < width; x += 32 ) + { + xtmp1.val[ 0 ][ 0 ] = lut[ src[ x + 0 ] ]; + xtmp1.val[ 1 ][ 0 ] = lut[ src[ x + 1 ] ]; + xtmp1.val[ 2 ][ 0 ] = lut[ src[ x + 2 ] ]; + xtmp1.val[ 3 ][ 0 ] = lut[ src[ x + 3 ] ]; + xtmp1.val[ 0 ][ 1 ] = lut[ src[ x + 4 ] ]; + xtmp1.val[ 1 ][ 1 ] = lut[ src[ x + 5 ] ]; + xtmp1.val[ 2 ][ 1 ] = lut[ src[ x + 6 ] ]; + xtmp1.val[ 3 ][ 1 ] = lut[ src[ x + 7 ] ]; + xtmp1.val[ 0 ][ 2 ] = lut[ src[ x + 8 ] ]; + xtmp1.val[ 1 ][ 2 ] = lut[ src[ x + 9 ] ]; + xtmp1.val[ 2 ][ 2 ] = lut[ src[ x + 10 ] ]; + xtmp1.val[ 3 ][ 2 ] = lut[ src[ x + 11 ] ]; + xtmp1.val[ 0 ][ 3 ] = lut[ src[ x + 12 ] ]; + xtmp1.val[ 1 ][ 3 ] = lut[ src[ x + 13 ] ]; + xtmp1.val[ 2 ][ 3 ] = lut[ src[ x + 14 ] ]; + xtmp1.val[ 3 ][ 3 ] = lut[ src[ x + 15 ] ]; + xtmp1.val[ 0 ][ 4 ] = lut[ src[ x + 16 ] ]; + xtmp1.val[ 1 ][ 4 ] = lut[ src[ x + 17 ] ]; + xtmp1.val[ 2 ][ 4 ] = lut[ src[ x + 18 ] ]; + xtmp1.val[ 3 ][ 4 ] = lut[ src[ x + 19 ] ]; + xtmp1.val[ 0 ][ 5 ] = lut[ src[ x + 20 ] ]; + xtmp1.val[ 1 ][ 5 ] = lut[ src[ x + 21 ] ]; + xtmp1.val[ 2 ][ 5 ] = lut[ src[ x + 22 ] ]; + xtmp1.val[ 3 ][ 5 ] = lut[ src[ x + 23 ] ]; + xtmp1.val[ 0 ][ 6 ] = lut[ src[ x + 24 ] ]; + xtmp1.val[ 1 ][ 6 ] = lut[ src[ x + 25 ] ]; + xtmp1.val[ 2 ][ 6 ] = lut[ src[ x + 26 ] ]; + xtmp1.val[ 3 ][ 6 ] = lut[ src[ x + 27 ] ]; + xtmp1.val[ 0 ][ 7 ] = lut[ src[ x + 28 ] ]; + xtmp1.val[ 1 ][ 7 ] 
= lut[ src[ x + 29 ] ]; + xtmp1.val[ 2 ][ 7 ] = lut[ src[ x + 30 ] ]; + xtmp1.val[ 3 ][ 7 ] = lut[ src[ x + 31 ] ]; + + xtmp2.val[ 0 ][ 0 ] = lut[ src[ x + 1 * srcStride ] ]; + xtmp2.val[ 1 ][ 0 ] = lut[ src[ x + 1 * srcStride + 1 ] ]; + xtmp2.val[ 2 ][ 0 ] = lut[ src[ x + 1 * srcStride + 2 ] ]; + xtmp2.val[ 3 ][ 0 ] = lut[ src[ x + 1 * srcStride + 3 ] ]; + xtmp2.val[ 0 ][ 1 ] = lut[ src[ x + 1 * srcStride + 4 ] ]; + xtmp2.val[ 1 ][ 1 ] = lut[ src[ x + 1 * srcStride + 5 ] ]; + xtmp2.val[ 2 ][ 1 ] = lut[ src[ x + 1 * srcStride + 6 ] ]; + xtmp2.val[ 3 ][ 1 ] = lut[ src[ x + 1 * srcStride + 7 ] ]; + xtmp2.val[ 0 ][ 2 ] = lut[ src[ x + 1 * srcStride + 8 ] ]; + xtmp2.val[ 1 ][ 2 ] = lut[ src[ x + 1 * srcStride + 9 ] ]; + xtmp2.val[ 2 ][ 2 ] = lut[ src[ x + 1 * srcStride + 10 ] ]; + xtmp2.val[ 3 ][ 2 ] = lut[ src[ x + 1 * srcStride + 11 ] ]; + xtmp2.val[ 0 ][ 3 ] = lut[ src[ x + 1 * srcStride + 12 ] ]; + xtmp2.val[ 1 ][ 3 ] = lut[ src[ x + 1 * srcStride + 13 ] ]; + xtmp2.val[ 2 ][ 3 ] = lut[ src[ x + 1 * srcStride + 14 ] ]; + xtmp2.val[ 3 ][ 3 ] = lut[ src[ x + 1 * srcStride + 15 ] ]; + xtmp2.val[ 0 ][ 4 ] = lut[ src[ x + 1 * srcStride + 16 ] ]; + xtmp2.val[ 1 ][ 4 ] = lut[ src[ x + 1 * srcStride + 17 ] ]; + xtmp2.val[ 2 ][ 4 ] = lut[ src[ x + 1 * srcStride + 18 ] ]; + xtmp2.val[ 3 ][ 4 ] = lut[ src[ x + 1 * srcStride + 19 ] ]; + xtmp2.val[ 0 ][ 5 ] = lut[ src[ x + 1 * srcStride + 20 ] ]; + xtmp2.val[ 1 ][ 5 ] = lut[ src[ x + 1 * srcStride + 21 ] ]; + xtmp2.val[ 2 ][ 5 ] = lut[ src[ x + 1 * srcStride + 22 ] ]; + xtmp2.val[ 3 ][ 5 ] = lut[ src[ x + 1 * srcStride + 23 ] ]; + xtmp2.val[ 0 ][ 6 ] = lut[ src[ x + 1 * srcStride + 24 ] ]; + xtmp2.val[ 1 ][ 6 ] = lut[ src[ x + 1 * srcStride + 25 ] ]; + xtmp2.val[ 2 ][ 6 ] = lut[ src[ x + 1 * srcStride + 26 ] ]; + xtmp2.val[ 3 ][ 6 ] = lut[ src[ x + 1 * srcStride + 27 ] ]; + xtmp2.val[ 0 ][ 7 ] = lut[ src[ x + 1 * srcStride + 28 ] ]; + xtmp2.val[ 1 ][ 7 ] = lut[ src[ x + 1 * srcStride + 29 ] ]; + xtmp2.val[ 2 ][ 7 ] = lut[ src[ x 
+ 1 * srcStride + 30 ] ]; + xtmp2.val[ 3 ][ 7 ] = lut[ src[ x + 1 * srcStride + 31 ] ]; + + xtmp3.val[ 0 ][ 0 ] = lut[ src[ x + 2 * srcStride + 0 ] ]; + xtmp3.val[ 1 ][ 0 ] = lut[ src[ x + 2 * srcStride + 1 ] ]; + xtmp3.val[ 2 ][ 0 ] = lut[ src[ x + 2 * srcStride + 2 ] ]; + xtmp3.val[ 3 ][ 0 ] = lut[ src[ x + 2 * srcStride + 3 ] ]; + xtmp3.val[ 0 ][ 1 ] = lut[ src[ x + 2 * srcStride + 4 ] ]; + xtmp3.val[ 1 ][ 1 ] = lut[ src[ x + 2 * srcStride + 5 ] ]; + xtmp3.val[ 2 ][ 1 ] = lut[ src[ x + 2 * srcStride + 6 ] ]; + xtmp3.val[ 3 ][ 1 ] = lut[ src[ x + 2 * srcStride + 7 ] ]; + xtmp3.val[ 0 ][ 2 ] = lut[ src[ x + 2 * srcStride + 8 ] ]; + xtmp3.val[ 1 ][ 2 ] = lut[ src[ x + 2 * srcStride + 9 ] ]; + xtmp3.val[ 2 ][ 2 ] = lut[ src[ x + 2 * srcStride + 10 ] ]; + xtmp3.val[ 3 ][ 2 ] = lut[ src[ x + 2 * srcStride + 11 ] ]; + xtmp3.val[ 0 ][ 3 ] = lut[ src[ x + 2 * srcStride + 12 ] ]; + xtmp3.val[ 1 ][ 3 ] = lut[ src[ x + 2 * srcStride + 13 ] ]; + xtmp3.val[ 2 ][ 3 ] = lut[ src[ x + 2 * srcStride + 14 ] ]; + xtmp3.val[ 3 ][ 3 ] = lut[ src[ x + 2 * srcStride + 15 ] ]; + xtmp3.val[ 0 ][ 4 ] = lut[ src[ x + 2 * srcStride + 16 ] ]; + xtmp3.val[ 1 ][ 4 ] = lut[ src[ x + 2 * srcStride + 17 ] ]; + xtmp3.val[ 2 ][ 4 ] = lut[ src[ x + 2 * srcStride + 18 ] ]; + xtmp3.val[ 3 ][ 4 ] = lut[ src[ x + 2 * srcStride + 19 ] ]; + xtmp3.val[ 0 ][ 5 ] = lut[ src[ x + 2 * srcStride + 20 ] ]; + xtmp3.val[ 1 ][ 5 ] = lut[ src[ x + 2 * srcStride + 21 ] ]; + xtmp3.val[ 2 ][ 5 ] = lut[ src[ x + 2 * srcStride + 22 ] ]; + xtmp3.val[ 3 ][ 5 ] = lut[ src[ x + 2 * srcStride + 23 ] ]; + xtmp3.val[ 0 ][ 6 ] = lut[ src[ x + 2 * srcStride + 24 ] ]; + xtmp3.val[ 1 ][ 6 ] = lut[ src[ x + 2 * srcStride + 25 ] ]; + xtmp3.val[ 2 ][ 6 ] = lut[ src[ x + 2 * srcStride + 26 ] ]; + xtmp3.val[ 3 ][ 6 ] = lut[ src[ x + 2 * srcStride + 27 ] ]; + xtmp3.val[ 0 ][ 7 ] = lut[ src[ x + 2 * srcStride + 28 ] ]; + xtmp3.val[ 1 ][ 7 ] = lut[ src[ x + 2 * srcStride + 29 ] ]; + xtmp3.val[ 2 ][ 7 ] = lut[ src[ x + 2 * srcStride + 30 ] 
]; + xtmp3.val[ 3 ][ 7 ] = lut[ src[ x + 2 * srcStride + 31 ] ]; + + // interleaved assign -> there is only interleaved storing/loading + xtmp4.val[ 0 ][ 0 ] = lut[ src[ x + 3 * srcStride + 0 ] ]; + xtmp4.val[ 1 ][ 0 ] = lut[ src[ x + 3 * srcStride + 1 ] ]; + xtmp4.val[ 2 ][ 0 ] = lut[ src[ x + 3 * srcStride + 2 ] ]; + xtmp4.val[ 3 ][ 0 ] = lut[ src[ x + 3 * srcStride + 3 ] ]; + xtmp4.val[ 0 ][ 1 ] = lut[ src[ x + 3 * srcStride + 4 ] ]; + xtmp4.val[ 1 ][ 1 ] = lut[ src[ x + 3 * srcStride + 5 ] ]; + xtmp4.val[ 2 ][ 1 ] = lut[ src[ x + 3 * srcStride + 6 ] ]; + xtmp4.val[ 3 ][ 1 ] = lut[ src[ x + 3 * srcStride + 7 ] ]; + xtmp4.val[ 0 ][ 2 ] = lut[ src[ x + 3 * srcStride + 8 ] ]; + xtmp4.val[ 1 ][ 2 ] = lut[ src[ x + 3 * srcStride + 9 ] ]; + xtmp4.val[ 2 ][ 2 ] = lut[ src[ x + 3 * srcStride + 10 ] ]; + xtmp4.val[ 3 ][ 2 ] = lut[ src[ x + 3 * srcStride + 11 ] ]; + xtmp4.val[ 0 ][ 3 ] = lut[ src[ x + 3 * srcStride + 12 ] ]; + xtmp4.val[ 1 ][ 3 ] = lut[ src[ x + 3 * srcStride + 13 ] ]; + xtmp4.val[ 2 ][ 3 ] = lut[ src[ x + 3 * srcStride + 14 ] ]; + xtmp4.val[ 3 ][ 3 ] = lut[ src[ x + 3 * srcStride + 15 ] ]; + xtmp4.val[ 0 ][ 4 ] = lut[ src[ x + 3 * srcStride + 16 ] ]; + xtmp4.val[ 1 ][ 4 ] = lut[ src[ x + 3 * srcStride + 17 ] ]; + xtmp4.val[ 2 ][ 4 ] = lut[ src[ x + 3 * srcStride + 18 ] ]; + xtmp4.val[ 3 ][ 4 ] = lut[ src[ x + 3 * srcStride + 19 ] ]; + xtmp4.val[ 0 ][ 5 ] = lut[ src[ x + 3 * srcStride + 20 ] ]; + xtmp4.val[ 1 ][ 5 ] = lut[ src[ x + 3 * srcStride + 21 ] ]; + xtmp4.val[ 2 ][ 5 ] = lut[ src[ x + 3 * srcStride + 22 ] ]; + xtmp4.val[ 3 ][ 5 ] = lut[ src[ x + 3 * srcStride + 23 ] ]; + xtmp4.val[ 0 ][ 6 ] = lut[ src[ x + 3 * srcStride + 24 ] ]; + xtmp4.val[ 1 ][ 6 ] = lut[ src[ x + 3 * srcStride + 25 ] ]; + xtmp4.val[ 2 ][ 6 ] = lut[ src[ x + 3 * srcStride + 26 ] ]; + xtmp4.val[ 3 ][ 6 ] = lut[ src[ x + 3 * srcStride + 27 ] ]; + xtmp4.val[ 0 ][ 7 ] = lut[ src[ x + 3 * srcStride + 28 ] ]; + xtmp4.val[ 1 ][ 7 ] = lut[ src[ x + 3 * srcStride + 29 ] ]; + xtmp4.val[ 
2 ][ 7 ] = lut[ src[ x + 3 * srcStride + 30 ] ]; + xtmp4.val[ 3 ][ 7 ] = lut[ src[ x + 3 * srcStride + 31 ] ]; + + // deinterleaved storing + vst4q_s16( &dst[ x ], xtmp1 ); + vst4q_s16( &dst[ x + 1 * dstStride ], xtmp2 ); + vst4q_s16( &dst[ x + 2 * dstStride ], xtmp3 ); + vst4q_s16( &dst[ x + 3 * dstStride ], xtmp4 ); + } + src += ( srcStride << 2 ); + dst += ( dstStride << 2 ); + } + } + else if( ( width & 15 ) == 0 ) + { + int16x8x2_t xtmp1; + int16x8x2_t xtmp2; + int16x8x2_t xtmp3; + int16x8x2_t xtmp4; + + for( int y = 0; y < height; y += 4 ) + { + for( int x = 0; x < width; x += 16 ) + { + // vld2q_s16( &src[ x ] ); + + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 0 ] ], xtmp1.val[ 0 ], 0 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 ] ], xtmp1.val[ 1 ], 0 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 ] ], xtmp1.val[ 0 ], 1 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 ] ], xtmp1.val[ 1 ], 1 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 4 ] ], xtmp1.val[ 0 ], 2 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 5 ] ], xtmp1.val[ 1 ], 2 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 6 ] ], xtmp1.val[ 0 ], 3 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 7 ] ], xtmp1.val[ 1 ], 3 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 8 ] ], xtmp1.val[ 0 ], 4 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 9 ] ], xtmp1.val[ 1 ], 4 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 10 ] ], xtmp1.val[ 0 ], 5 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 11 ] ], xtmp1.val[ 1 ], 5 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 12 ] ], xtmp1.val[ 0 ], 6 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 13 ] ], xtmp1.val[ 1 ], 6 ); + xtmp1.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 14 ] ], xtmp1.val[ 0 ], 7 ); + xtmp1.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 15 ] ], xtmp1.val[ 1 ], 7 ); + + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 0 ] ], xtmp2.val[ 0 ], 0 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( 
lut[ src[ x + 1 * srcStride + 1 ] ], xtmp2.val[ 1 ], 0 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 2 ] ], xtmp2.val[ 0 ], 1 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 3 ] ], xtmp2.val[ 1 ], 1 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 4 ] ], xtmp2.val[ 0 ], 2 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 5 ] ], xtmp2.val[ 1 ], 2 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 6 ] ], xtmp2.val[ 0 ], 3 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 7 ] ], xtmp2.val[ 1 ], 3 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 8 ] ], xtmp2.val[ 0 ], 4 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 9 ] ], xtmp2.val[ 1 ], 4 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 10 ] ], xtmp2.val[ 0 ], 5 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 11 ] ], xtmp2.val[ 1 ], 5 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 12 ] ], xtmp2.val[ 0 ], 6 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 13 ] ], xtmp2.val[ 1 ], 6 ); + xtmp2.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 14 ] ], xtmp2.val[ 0 ], 7 ); + xtmp2.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 15 ] ], xtmp2.val[ 1 ], 7 ); + + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 0 ] ], xtmp3.val[ 0 ], 0 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 1 ] ], xtmp3.val[ 1 ], 0 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 2 ] ], xtmp3.val[ 0 ], 1 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 3 ] ], xtmp3.val[ 1 ], 1 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 4 ] ], xtmp3.val[ 0 ], 2 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 5 ] ], xtmp3.val[ 1 ], 2 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 6 ] ], 
xtmp3.val[ 0 ], 3 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 7 ] ], xtmp3.val[ 1 ], 3 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 8 ] ], xtmp3.val[ 0 ], 4 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 9 ] ], xtmp3.val[ 1 ], 4 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 10 ] ], xtmp3.val[ 0 ], 5 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 11 ] ], xtmp3.val[ 1 ], 5 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 12 ] ], xtmp3.val[ 0 ], 6 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 13 ] ], xtmp3.val[ 1 ], 6 ); + xtmp3.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 14 ] ], xtmp3.val[ 0 ], 7 ); + xtmp3.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 15 ] ], xtmp3.val[ 1 ], 7 ); + + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 0 ] ], xtmp4.val[ 0 ], 0 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 1 ] ], xtmp4.val[ 1 ], 0 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 2 ] ], xtmp4.val[ 0 ], 1 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 3 ] ], xtmp4.val[ 1 ], 1 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 4 ] ], xtmp4.val[ 0 ], 2 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 5 ] ], xtmp4.val[ 1 ], 2 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 6 ] ], xtmp4.val[ 0 ], 3 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 7 ] ], xtmp4.val[ 1 ], 3 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 8 ] ], xtmp4.val[ 0 ], 4 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 9 ] ], xtmp4.val[ 1 ], 4 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 10 ] ], xtmp4.val[ 0 ], 5 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 11 ] ], xtmp4.val[ 1 ], 5 ); + xtmp4.val[ 0 
] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 12 ] ], xtmp4.val[ 0 ], 6 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 13 ] ], xtmp4.val[ 1 ], 6 ); + xtmp4.val[ 0 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 14 ] ], xtmp4.val[ 0 ], 7 ); + xtmp4.val[ 1 ] = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 15 ] ], xtmp4.val[ 1 ], 7 ); + + vst2q_s16( &dst[ x ], xtmp1 ); + vst2q_s16( &dst[ x + 1 * dstStride ], xtmp2 ); + vst2q_s16( &dst[ x + 2 * dstStride ], xtmp3 ); + vst2q_s16( &dst[ x + 3 * dstStride ], xtmp4 ); + } + src += ( srcStride << 2 ); + dst += ( dstStride << 2 ); + } + } + else if( ( width & 7 ) == 0 ) + { + int16x8_t xtmp1; + int16x8_t xtmp2; + int16x8_t xtmp3; + int16x8_t xtmp4; + + for( int y = 0; y < height; y += 4 ) + { + for( int x = 0; x < width; x += 8 ) + { + xtmp1 = vsetq_lane_s16( lut[ src[ x + 0 ] ], xtmp1, 0 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 1 ] ], xtmp1, 1 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 2 ] ], xtmp1, 2 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 3 ] ], xtmp1, 3 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 4 ] ], xtmp1, 4 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 5 ] ], xtmp1, 5 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 6 ] ], xtmp1, 6 ); + xtmp1 = vsetq_lane_s16( lut[ src[ x + 7 ] ], xtmp1, 7 ); + + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 0 ] ], xtmp2, 0 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 1 ] ], xtmp2, 1 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 2 ] ], xtmp2, 2 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 3 ] ], xtmp2, 3 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 4 ] ], xtmp2, 4 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 5 ] ], xtmp2, 5 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 6 ] ], xtmp2, 6 ); + xtmp2 = vsetq_lane_s16( lut[ src[ x + 1 * srcStride + 7 ] ], xtmp2, 7 ); + + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 0 ] ], xtmp3, 0 ); + xtmp3 = vsetq_lane_s16( lut[ src[ 
x + 2 * srcStride + 1 ] ], xtmp3, 1 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 2 ] ], xtmp3, 2 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 3 ] ], xtmp3, 3 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 4 ] ], xtmp3, 4 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 5 ] ], xtmp3, 5 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 6 ] ], xtmp3, 6 ); + xtmp3 = vsetq_lane_s16( lut[ src[ x + 2 * srcStride + 7 ] ], xtmp3, 7 ); + + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 0 ] ], xtmp4, 0 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 1 ] ], xtmp4, 1 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 2 ] ], xtmp4, 2 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 3 ] ], xtmp4, 3 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 4 ] ], xtmp4, 4 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 5 ] ], xtmp4, 5 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 6 ] ], xtmp4, 6 ); + xtmp4 = vsetq_lane_s16( lut[ src[ x + 3 * srcStride + 7 ] ], xtmp4, 7 ); + + vst1q_s16( &dst[ x ], xtmp1 ); + vst1q_s16( &dst[ x + 1 * dstStride ], xtmp2 ); + vst1q_s16( &dst[ x + 2 * dstStride ], xtmp3 ); + vst1q_s16( &dst[ x + 3 * dstStride ], xtmp4 ); + } + + src += ( srcStride << 2 ); + dst += ( dstStride << 2 ); + } + } + + return; +} + +template +void PelBufferOps::_initPelBufOpsARM() /* installs applyLut_SIMD as the applyLut implementation of this PelBufferOps */ +{ + applyLut = applyLut_SIMD; +} + +template void PelBufferOps::_initPelBufOpsARM(); + +} // namespace vvenc + +#endif // TARGET_SIMD_ARM +#endif // ENABLE_SIMD_OPT_BUFFER +//! 
\} diff --git a/source/Lib/CommonLib/arm/CommonDefARM.cpp b/source/Lib/CommonLib/arm/CommonDefARM.cpp new file mode 100644 index 000000000..cf29a9439 --- /dev/null +++ b/source/Lib/CommonLib/arm/CommonDefARM.cpp @@ -0,0 +1,64 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +/** \file CommonDefARM.cpp +*/ + +#include "CommonDefARM.h" + +namespace vvenc +{ +using namespace arm_simd; + +ARM_VEXT read_arm_extension_flags( ARM_VEXT request ) /* returns the cached extension level; a request other than UNDEFINED overwrites the cache first */ +{ + static ARM_VEXT ext_flags = NEON; // initial level: we assume NEON is always supported for relevant ARM processors (no runtime detection is done here) + + if( request != UNDEFINED ) + { + ext_flags = request; + } + + return ext_flags; +}; + +} // namespace vvenc diff --git a/source/Lib/CommonLib/arm/CommonDefARM.h b/source/Lib/CommonLib/arm/CommonDefARM.h new file mode 100644 index 000000000..006d12617 --- /dev/null +++ b/source/Lib/CommonLib/arm/CommonDefARM.h @@ -0,0 +1,66 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ + +------------------------------------------------------------------------------------------- */ + +/** \file CommonDefARM.h + */ + +#pragma once + +#include "CommonDef.h" + +#ifdef TARGET_SIMD_ARM + +#ifdef USE_NEON +#define SIMDARM NEON +#include +#endif + +namespace vvenc +{ +using namespace arm_simd; + +ARM_VEXT read_arm_extension_flags( ARM_VEXT request = arm_simd::UNDEFINED ); +// std::string read_arm_extension_name(); + +} // namespace vvenc + +#endif // TARGET_SIMD_ARM diff --git a/source/Lib/CommonLib/arm/InitARM.cpp b/source/Lib/CommonLib/arm/InitARM.cpp new file mode 100644 index 000000000..d97d716b3 --- /dev/null +++ b/source/Lib/CommonLib/arm/InitARM.cpp @@ -0,0 +1,115 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. 
+ +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +/* + * \ingroup CommonLib + * \file InitARM.cpp + * \brief Initialize encoder SIMD functions. 
+ */ + +#include "CommonDefARM.h" +#include "CommonLib/CommonDef.h" +#include "CommonLib/InterpolationFilter.h" +#include "CommonLib/TrQuant.h" +#include "CommonLib/RdCost.h" +#include "CommonLib/Buffer.h" +#include "CommonLib/TrQuant_EMT.h" +#include "CommonLib/IntraPrediction.h" +#include "CommonLib/LoopFilter.h" +#include "CommonLib/Picture.h" + +#include "CommonLib/AdaptiveLoopFilter.h" +#include "CommonLib/SampleAdaptiveOffset.h" + +namespace vvenc +{ + +#ifdef TARGET_SIMD_ARM + +#if ENABLE_SIMD_OPT_MCIF +void InterpolationFilter::initInterpolationFilterARM() /* calls _initInterpolationFilterARM() only when read_arm_extension_flags() reports NEON; any other level is a no-op */ +{ + auto vext = read_arm_extension_flags(); + switch( vext ) + { + case NEON: + _initInterpolationFilterARM(); + break; + default: + break; + } +} +#endif + +#if ENABLE_SIMD_OPT_BUFFER +void PelBufferOps::initPelBufOpsARM() /* calls _initPelBufOpsARM() only when read_arm_extension_flags() reports NEON; any other level is a no-op */ +{ + auto vext = read_arm_extension_flags(); + switch( vext ) + { + case NEON: + _initPelBufOpsARM(); + break; + default: + break; + } +} +#endif + +#if ENABLE_SIMD_OPT_DIST +void RdCost::initRdCostARM() /* calls _initRdCostARM() only when read_arm_extension_flags() reports NEON; any other level is a no-op */ +{ + auto vext = read_arm_extension_flags(); + switch( vext ) + { + case NEON: + _initRdCostARM(); + break; + default: + break; + } +} +#endif + +#endif // TARGET_SIMD_ARM + +} // namespace vvenc diff --git a/source/Lib/CommonLib/arm/InterpolationFilterARM.h b/source/Lib/CommonLib/arm/InterpolationFilterARM.h new file mode 100644 index 000000000..fe631ebfd --- /dev/null +++ b/source/Lib/CommonLib/arm/InterpolationFilterARM.h @@ -0,0 +1,295 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ + +------------------------------------------------------------------------------------------- */ + +/** + * \file + * \brief NEON (ARM SIMD) implementations of InterpolationFilter kernels + */ +// ==================================================================================================================== +// Includes +// ==================================================================================================================== + +#include "CommonDefARM.h" +#include "CommonLib/CommonDef.h" +#include "../InterpolationFilter.h" + + +namespace vvenc +{ + +#ifdef TARGET_SIMD_ARM +#if __ARM_ARCH >= 8 + +template +static void simdInterpolateN2_2D( const ClpRng& clpRng, const Pel* src, const int srcStride, Pel* dst, const int dstStride, int width, int height, TFilterCoeff const *ch, TFilterCoeff const *cv ) /* 2-tap separable filter: each row is filtered horizontally with ch[ 1 ], then blended with the previous row's horizontal result using cv[ 1 ]; the row == -1 pass only primes the previous-row buffers and writes no output */ +{ + const int shift1st = IF_FILTER_PREC_BILINEAR - ( IF_INTERNAL_PREC_BILINEAR - clpRng.bd ); + const int offset1st = 1 << ( shift1st - 1 ); + + const int shift2nd = 4; + const int offset2nd = 1 << ( shift2nd - 1 ); + + int16x8_t mmOffset1 = vdupq_n_s16( offset1st ); + int16x8_t mmOffset2 = vdupq_n_s16( offset2nd ); + int16x8_t mmCoeffH = vdupq_n_s16( ch[ 1 ] ); + int16x8_t mmCoeffV = vdupq_n_s16( cv[ 1 ] ); + + int16x8_t mmLastH[ 16 ]; + + int16x8_t mmLast4H; + + // workaround for over-sensitive compilers; NOTE(review): only entry 0 is initialised, entries >= 1 are likewise loaded at row == -1 before their first store (the loaded value is only used once row >= 0) + mmLastH[ 0 ] = vdupq_n_s16( 0 ); + + int16x8_t shift1inv = vdupq_n_s16( -shift1st ); + int16x8_t shift2inv = vdupq_n_s16( -shift2nd ); + + for( int row = -1; row < height; row++ ) + { + int16x8_t mmPix = vld1q_s16( src ); + int16x8_t mmPix1 = vld1q_s16( src + 1 ); + + int16x8_t mmFiltered = vmlaq_n_s16( mmOffset1, mmPix, 16 ); + + mmFiltered = vmlaq_s16( mmFiltered, vsubq_s16( mmPix1, mmPix ), mmCoeffH ); + mmFiltered = vshlq_s16( mmFiltered, shift1inv ); + + if( row >= 0 ) + { + int16x8_t mmFiltered2 = vmlaq_n_s16( mmOffset2, mmLast4H, 16 ); + mmFiltered2 = vmlaq_s16( mmFiltered2, vsubq_s16( mmFiltered, mmLast4H ), mmCoeffV ); + mmFiltered2 = vshlq_s16( 
mmFiltered2, shift2inv ); + + vst1q_lane_s64( (int64_t*) dst, (int64x2_t) mmFiltered2, 0 ); + } + + mmLast4H = mmFiltered; + + for( int x = 4; x < width; x += 8 ) + { + int16x8_t mmPix = vld1q_s16( src + x ); + int16x8_t mmPix1 = vld1q_s16( src + x + 1 ); + + int16x8_t mmFiltered = vmlaq_n_s16( mmOffset1, mmPix, 16 ); + mmFiltered = vmlaq_s16( mmFiltered, vsubq_s16( mmPix1, mmPix ), mmCoeffH ); + mmFiltered = vshlq_s16( mmFiltered, shift1inv ); + + int idx = x >> 3; + int16x8_t mLast = mmLastH[ idx ]; + mmLastH[ idx ] = mmFiltered; + + if( row >= 0 ) + { + int16x8_t mmFiltered2 = vmlaq_n_s16( mmOffset2, mLast, 16 ); + mmFiltered2 = vmlaq_s16( mmFiltered2, vsubq_s16( mmFiltered, mLast ), mmCoeffV ); + mmFiltered2 = vshlq_s16( mmFiltered2, shift2inv ); + + vst1q_s16( ( dst + x ), mmFiltered2 ); + } + } + if( row >= 0 ) + dst += dstStride; + + src += srcStride; + } +} + +template +void simdFilter16xX_N8( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV ) /* 8-tap separable filter for 16-pel-wide blocks: horizontal pass over height + 7 rows, vertical pass over a sliding window of the last 8 horizontally filtered rows; width is not read, two 8-column halves are always processed */ +{ + OFFSET( src, srcStride, -3, -3 ); + + int offset1st, offset2nd; + int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) ); + const int shift1st = IF_FILTER_PREC - headRoom; + int shift2nd = IF_FILTER_PREC; + int extHeight = height + 7; + // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6, as quoted by the original note), shift1st stays + // non-negative for bit depths of 8->20; NOTE(review): headRoom is clamped to >= 2 by the std::max above, so it cannot become negative here + + // shift1st -= headRoom; + offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st ); + + if( isLast ) + { + shift2nd += headRoom; + offset2nd = 1 << ( shift2nd - 1 ); + offset2nd += IF_INTERNAL_OFFS << IF_FILTER_PREC; + } + else + { + offset2nd = 0; + } + const int32x4_t voffset1 = vdupq_n_s32( offset1st ); + + const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() ); + const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() ); + + int64x1x2_t vcoeff0 = vld2_s64( (int64_t*) 
coeffH ); + int16x8_t vsum; + int32x4_t vsuma, vsumb; + + int32x4_t vsrcv[ 2 ][ 9 ]; + + int32x4_t invshift1st = vdupq_n_s32( -shift1st ); + int32x4_t invshift2nd = vdupq_n_s32( -shift2nd ); + + for( int row = 0; row < extHeight; row++ ) + { + int32x4_t vsrc0, vsrc1; + int16x4_t vsrca00, vsrca01, vsrca10, vsrca11; + int16x4_t vsrcb00, vsrcb01, vsrcb10, vsrcb11; + + vsrca00 = vld1_s16( &src[ 0 ] ); + vsrca01 = vld1_s16( &src[ 1 ] ); + vsrca10 = vld1_s16( &src[ 2 ] ); + vsrca11 = vld1_s16( &src[ 3 ] ); + + for( int j = 0; j < 2; j++ ) + { + vsrcb00 = vld1_s16( &src[ ( j << 3 ) + 4 ] ); + vsrcb01 = vld1_s16( &src[ ( j << 3 ) + 5 ] ); + vsrcb10 = vld1_s16( &src[ ( j << 3 ) + 6 ] ); + vsrcb11 = vld1_s16( &src[ ( j << 3 ) + 7 ] ); + + vsuma[ 0 ] = vaddvq_s32( vmull_s16( vsrca00, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsuma[ 1 ] = vaddvq_s32( vmull_s16( vsrca01, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsuma[ 2 ] = vaddvq_s32( vmull_s16( vsrca10, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsuma[ 3 ] = vaddvq_s32( vmull_s16( vsrca11, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + + vsumb[ 0 ] = vaddvq_s32( vmull_s16( vsrcb00, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsumb[ 1 ] = vaddvq_s32( vmull_s16( vsrcb01, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsumb[ 2 ] = vaddvq_s32( vmull_s16( vsrcb10, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + vsumb[ 3 ] = vaddvq_s32( vmull_s16( vsrcb11, vreinterpret_s16_s64( vcoeff0.val[ 0 ] ) ) ); + + vsrc1[ 0 ] = vaddvq_s32( vmull_s16( vsrcb00, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc1[ 1 ] = vaddvq_s32( vmull_s16( vsrcb01, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc1[ 2 ] = vaddvq_s32( vmull_s16( vsrcb10, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc1[ 3 ] = vaddvq_s32( vmull_s16( vsrcb11, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + + vsrca00 = vld1_s16( &src[ ( j << 3 ) + 8 ] ); + vsrca01 = vld1_s16( &src[ ( j << 3 ) + 9 ] ); + vsrca10 = vld1_s16( &src[ ( j << 3 ) + 10 
] ); + vsrca11 = vld1_s16( &src[ ( j << 3 ) + 11 ] ); + + vsrc0[ 0 ] = vaddvq_s32( vmull_s16( vsrca00, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc0[ 1 ] = vaddvq_s32( vmull_s16( vsrca01, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc0[ 2 ] = vaddvq_s32( vmull_s16( vsrca10, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + vsrc0[ 3 ] = vaddvq_s32( vmull_s16( vsrca11, vreinterpret_s16_s64( vcoeff0.val[ 1 ] ) ) ); + + vsuma = vaddq_s32( vsuma, vsrc1 ); + vsumb = vaddq_s32( vsumb, vsrc0 ); + + vsuma = vaddq_s32( vsuma, voffset1 ); + vsumb = vaddq_s32( vsumb, voffset1 ); + + vsuma = vshlq_s32( vsuma, invshift1st ); + vsumb = vshlq_s32( vsumb, invshift1st ); + + vsum = vqmovn_high_s32( vqmovn_s32( vsuma ), vsumb ); + + if( row < 7 ) + { + vsrcv[ j ][ row + 1 ] = (int32x4_t) vsum; + } + else + { + vsrcv[ j ][ 8 ] = (int32x4_t) vsum; + vsuma = vsumb = vdupq_n_s32( offset2nd ); + + for( int i = 0; i < 8; i += 2 ) + { + vsrc0 = vsrcv[ j ][ i + 1 ]; + vsrc1 = vsrcv[ j ][ i + 2 ]; + int16x4_t vsrc0l = vget_low_s16( (int16x8_t) vsrc0 ); // 0a 0b 0c 0d + int16x4_t vsrc0h = vget_high_s16( (int16x8_t) vsrc0 ); // 0e 0f 0g 0h + int16x4_t vsrc1l = vget_low_s16( (int16x8_t) vsrc1 ); // 1a 1b 1c 1d + int16x4_t vsrc1h = vget_high_s16( (int16x8_t) vsrc1 ); // 1e 1f 1g 1h + vsuma = vmlal_n_s16( vsuma, vsrc0l, coeffV[ i ] ); // 0a * c0 + offset2nd, 0b * c0 + offset2nd, ... + vsuma = vmlal_n_s16( vsuma, vsrc1l, coeffV[ i + 1 ] ); // 1a * c1 + 0a * c0 + offset2nd, 1b * c1 + 0b * c0 + offset2nd, ... 
+ vsumb = vmlal_n_s16( vsumb, vsrc0h, coeffV[ i ] ); + vsumb = vmlal_n_s16( vsumb, vsrc1h, coeffV[ i + 1 ] ); + vsrcv[ j ][ i ] = vsrc0; + vsrcv[ j ][ i + 1 ] = vsrc1; + } + vsuma = vshlq_s32( vsuma, invshift2nd ); + vsumb = vshlq_s32( vsumb, invshift2nd ); + + vsum = vqmovn_high_s32( vqmovn_s32( vsuma ), vsumb ); + + if( isLast ) // clip to [ clpRng.min(), clpRng.max() ] + { + vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) ); + } + + vst1q_s16( &dst[ j << 3 ], vsum ); + + INCY( dst, j * dstStride ); // NOTE(review): INCY is defined outside this chunk; presumably dst advances one row only after the second 8-column half (j == 1) - confirm + } + } + + INCY( src, srcStride ); + } +} + +template +void InterpolationFilter::_initInterpolationFilterARM() +{ + m_filter16x16[ 0 ][ 0 ] = simdFilter16xX_N8; + m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8; + + m_filterN2_2D = simdInterpolateN2_2D; +} + +#else // !( __ARM_ARCH >= 8 ) + +template +void InterpolationFilter::_initInterpolationFilterARM() +{} + +#endif // !( __ARM_ARCH >= 8 ) + +template void InterpolationFilter::_initInterpolationFilterARM(); + +#endif // #ifdef TARGET_SIMD_ARM + +} // namespace vvenc diff --git a/source/Lib/CommonLib/arm/RdCostARM.h b/source/Lib/CommonLib/arm/RdCostARM.h new file mode 100644 index 000000000..0d5be3728 --- /dev/null +++ b/source/Lib/CommonLib/arm/RdCostARM.h @@ -0,0 +1,265 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ + +------------------------------------------------------------------------------------------- */ + +/** \file RdCostARM.h + \brief RD cost computation class, SIMD version +*/ + +#include +#include + +#include "CommonDefARM.h" +#include "CommonLib/CommonDef.h" +#include "../RdCost.h" + +namespace vvenc +{ + +#ifdef TARGET_SIMD_ARM +#if __ARM_ARCH >= 8 + +template +Distortion xGetSAD_MxN_SIMD( const DistParam& rcDtParam ) +{ + if( rcDtParam.bitDepth > 10 ) + return isWdt16 ? RdCost::xGetSAD16( rcDtParam ) : RdCost::xGetSAD8( rcDtParam ); + + // assert( rcDtParam.iCols == iWidth); + const short* pSrc1 = (const short*) rcDtParam.org.buf; + const short* pSrc2 = (const short*) rcDtParam.cur.buf; + const int iRows = rcDtParam.org.height; + const int iSubShift = rcDtParam.subShift; + const ptrdiff_t iStrideSrc1 = rcDtParam.org.stride << iSubShift; + const ptrdiff_t iStrideSrc2 = rcDtParam.cur.stride << iSubShift; + + uint32_t uiSum = 0; + + int16x8_t vsum16 = vdupq_n_s16( 0 ); + + for( int i = 0; i < ( iRows >> 3 ); i++ ) + { + // 0 + int16x8_t vsrc1 = vld1q_s16( pSrc1 ); + int16x8_t vsrc2 = vld1q_s16( pSrc2 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + + if( isWdt16 ) + { + vsrc1 = vld1q_s16( pSrc1 + 8 ); + vsrc2 = vld1q_s16( pSrc2 + 8 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + } + + pSrc1 += iStrideSrc1; + pSrc2 += iStrideSrc2; + + // 1 + vsrc1 = vld1q_s16( pSrc1 ); + vsrc2 = vld1q_s16( pSrc2 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + + if( isWdt16 ) + { + vsrc1 = vld1q_s16( pSrc1 + 8 ); + vsrc2 = vld1q_s16( pSrc2 + 8 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + } + + pSrc1 += iStrideSrc1; + pSrc2 += iStrideSrc2; + + // 2 + vsrc1 = vld1q_s16( pSrc1 ); + vsrc2 = vld1q_s16( pSrc2 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + + if( isWdt16 ) + { + vsrc1 = vld1q_s16( pSrc1 + 8 ); + vsrc2 = vld1q_s16( pSrc2 + 8 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + } + + pSrc1 += iStrideSrc1; + pSrc2 += iStrideSrc2; + + // 3 + vsrc1 
= vld1q_s16( pSrc1 ); + vsrc2 = vld1q_s16( pSrc2 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + + if( isWdt16 ) + { + vsrc1 = vld1q_s16( pSrc1 + 8 ); + vsrc2 = vld1q_s16( pSrc2 + 8 ); + + vsum16 = vabaq_s16( vsum16, vsrc1, vsrc2 ); + } + + pSrc1 += iStrideSrc1; + pSrc2 += iStrideSrc2; + } + + uiSum = vaddlvq_s16( vsum16 ); + uiSum <<= iSubShift; + return uiSum >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ); +} + +template +void xGetSADX5_16xN_SIMDImp( const DistParam& rcDtParam, Distortion* cost ) +{ + int i, j; + const Pel* piOrg = rcDtParam.org.buf; + const Pel* piCur = rcDtParam.cur.buf - 4; + int height = rcDtParam.org.height; + int iSubShift = rcDtParam.subShift; + int iSubStep = ( 1 << iSubShift ); + ptrdiff_t iStrideCur = rcDtParam.cur.stride * iSubStep; + ptrdiff_t iStrideOrg = rcDtParam.org.stride * iSubStep; + + int16x8_t sum0 = vdupq_n_s16( 0 ); + int16x8_t sum1 = vdupq_n_s16( 0 ); + int16x8_t sum2 = vdupq_n_s16( 0 ); + int16x8_t sum3 = vdupq_n_s16( 0 ); + int16x8_t sum4 = vdupq_n_s16( 0 ); + + for( i = 0; i < height; i += iSubStep ) + { + for( j = 0; j < 16; j += 8 ) + { + int16x8_t s0 = vld1q_s16( piOrg + j + 0 ); + int16x8_t s1 = vld1q_s16( piCur + j + 0 ); + int16x8_t s2 = vcombine_s16( vld1_s16( piOrg + j + 8 ), vdup_n_s16( 0 ) ); + int16x8_t s3 = vcombine_s16( vld1_s16( piCur + j + 8 ), vdup_n_s16( 0 ) ); + + int16x8_t org0, org1, org2, org3, org4; + org0 = s0; + org1 = vextq_s16( s0, s2, 1 ); + if( isCalCentrePos ) + org2 = vextq_s16( s0, s2, 2 ); + org3 = vextq_s16( s0, s2, 3 ); + org4 = vextq_s16( s0, s2, 4 ); + + int16x8_t cur0, cur1, cur2, cur3, cur4; + cur4 = s1; + cur0 = vextq_s16( s1, s3, 4 ); + cur1 = vextq_s16( s1, s3, 3 ); + if( isCalCentrePos ) + cur2 = vextq_s16( s1, s3, 2 ); + cur3 = vextq_s16( s1, s3, 1 ); + + sum0 = vabaq_s16( sum0, org0, cur0 ); // komplett insane + sum1 = vabaq_s16( sum1, org1, cur1 ); + if( isCalCentrePos ) + sum2 = vabaq_s16( sum2, org2, cur2 ); + sum3 = vabaq_s16( sum3, org3, cur3 ); + sum4 = 
vabaq_s16( sum4, org4, cur4 ); + } + + INCY( piOrg, iStrideOrg ); + INCY( piCur, iStrideCur ); + } + + int32x4_t sum = { vaddlvq_s16( sum0 ), vaddlvq_s16( sum1 ), vaddlvq_s16( sum3 ), vaddlvq_s16( sum4 ) }; + + int32x4_t sumTwo; + if( isCalCentrePos ) + sumTwo = vdupq_n_s32( vaddlvq_s16( sum2 ) ); + + // vshlq_n_s32 doesnt work because iSubShift ist not a const. + sum = vshlq_s32( sum, vdupq_n_s32( iSubShift ) ); + if( isCalCentrePos ) + sumTwo = vshlq_s32( sumTwo, vdupq_n_s32( iSubShift ) ); + + sum = vshrq_n_s32( sum, ( 1 + ( DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) ) ) ); + if( isCalCentrePos ) + sumTwo = vshrq_n_s32( sumTwo, ( 1 + ( DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) ) ) ); + + vst1q_lane_u64( (uint64_t*) &cost[ 0 ], (uint64x2_t) sum, 0 ); + if( isCalCentrePos ) + cost[ 2 ] = vgetq_lane_s32( sumTwo, 0 ); + vst1q_lane_u64( (uint64_t*) &cost[ 3 ], (uint64x2_t) sum, 1 ); +} + +template +void RdCost::xGetSADX5_16xN_SIMD(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) +{ + if( rcDtParam.bitDepth > 10 ) + { + RdCost::xGetSAD16X5( rcDtParam, cost, isCalCentrePos ); + return; + } + + if( isCalCentrePos ) + xGetSADX5_16xN_SIMDImp( rcDtParam, cost ); + else + xGetSADX5_16xN_SIMDImp( rcDtParam, cost ); +} + +template +void RdCost::_initRdCostARM() +{ + m_afpDistortFunc[0][DF_SAD8 ] = xGetSAD_MxN_SIMD; + m_afpDistortFunc[0][DF_SAD16 ] = xGetSAD_MxN_SIMD; + m_afpDistortFuncX5[1] = xGetSADX5_16xN_SIMD; +} + +#else // !__ARM_ARCH >= 8 + +template +void RdCost::_initRdCostARM() +{} + +#endif // !__ARM_ARCH >= 8 + +template void RdCost::_initRdCostARM(); + +#endif // TARGET_SIMD_ARM + +} // namespace vvenc diff --git a/source/Lib/CommonLib/arm/neon/Buffer_neon.cpp b/source/Lib/CommonLib/arm/neon/Buffer_neon.cpp new file mode 100644 index 000000000..f89806f63 --- /dev/null +++ b/source/Lib/CommonLib/arm/neon/Buffer_neon.cpp @@ -0,0 +1,43 @@ +/* ----------------------------------------------------------------------------- +The 
copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +#include "../BufferARM.h" diff --git a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp new file mode 100644 index 000000000..eb6520c2e --- /dev/null +++ b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp @@ -0,0 +1,43 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------------------- */ + +#include "../InterpolationFilterARM.h" diff --git a/source/Lib/CommonLib/arm/neon/RdCost_neon.cpp b/source/Lib/CommonLib/arm/neon/RdCost_neon.cpp new file mode 100644 index 000000000..0d8616eb1 --- /dev/null +++ b/source/Lib/CommonLib/arm/neon/RdCost_neon.cpp @@ -0,0 +1,43 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2019-2023, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ + +------------------------------------------------------------------------------------------- */ + +#include "../RdCostARM.h" diff --git a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h index 234302159..e0574e79f 100644 --- a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h +++ b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h @@ -49,6 +49,8 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { +using namespace x86_simd; + template void simdDeriveClassificationBlk(AlfClassifier *classifier, const CPelBuf& srcLuma, const Area& blkDst, const Area& blk, const int shift, const int vbCTUHeight, int vbPos) { diff --git a/source/Lib/CommonLib/x86/CommonDefX86.h b/source/Lib/CommonLib/x86/CommonDefX86.h index df537dc7c..ad2529f80 100644 --- a/source/Lib/CommonLib/x86/CommonDefX86.h +++ b/source/Lib/CommonLib/x86/CommonDefX86.h @@ -84,10 +84,13 @@ POSSIBILITY OF SUCH DAMAGE. namespace vvenc { + +using namespace x86_simd; + const std::string& vext_to_string( X86_VEXT vext ); X86_VEXT string_to_vext( const std::string& ext_name ); -X86_VEXT read_x86_extension_flags( X86_VEXT request = UNDEFINED ); +X86_VEXT read_x86_extension_flags( X86_VEXT request = x86_simd::UNDEFINED ); const std::string& read_x86_extension_name(); diff --git a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp index 50de7a132..b11f75589 100644 --- a/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp +++ b/source/Lib/EncoderLib/EncAdaptiveLoopFilter.cpp @@ -623,7 +623,7 @@ static alf_float_t calcErrorForCoeffsLin_13_SSE( const AlfCovariance::TKE& E, co const AlfCovariance& AlfCovariance::operator+= ( const AlfCovariance& src ) { #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_ALF - if( numCoeff == 13 && read_x86_extension_flags() > SCALAR ) + if( numCoeff == 13 && read_x86_extension_flags() > x86_simd::SCALAR ) { for( int b0 = 0; b0 < numBins; b0++ ) { @@ -737,7 +737,7 @@ alf_float_t 
AlfCovariance::calcErrorForCoeffs( const int *clip, const int if( numCoeff == 13 ) { #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_ALF - if( read_x86_extension_flags() > SCALAR ) + if( read_x86_extension_flags() > x86_simd::SCALAR ) return calcErrorForCoeffsLin_13_SSE( E, y, coeff, invFactor ); else #endif @@ -3172,7 +3172,7 @@ void EncAdaptiveLoopFilter::getPreBlkStats(AlfCovariance* alfCovariance, const A const int halfFilterLength = shape.filterLength >> 1; #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_ALF - const bool useSimd = read_x86_extension_flags() > SCALAR; + const bool useSimd = read_x86_extension_flags() > x86_simd::SCALAR; #else const bool useSimd = false; #endif @@ -6135,7 +6135,7 @@ void EncAdaptiveLoopFilter::getBlkStatsCcAlf(AlfCovariance &alfCovariance, const int effStride = recStride << getComponentScaleY(compID, m_chromaFormat); #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_OPT_ALF - const bool useSimd = read_x86_extension_flags() > SCALAR; + const bool useSimd = read_x86_extension_flags() > x86_simd::SCALAR; #endif Pel ELocal[MAX_NUM_CC_ALF_CHROMA_COEFF][16]; diff --git a/source/Lib/apputils/IStreamIO.h b/source/Lib/apputils/IStreamIO.h index c73b33b46..6a3b2c934 100644 --- a/source/Lib/apputils/IStreamIO.h +++ b/source/Lib/apputils/IStreamIO.h @@ -673,6 +673,22 @@ class IStreamToAbbr const std::vector>* toMap; }; +template class FloatRoundingOffset +{ +}; + +template<> class FloatRoundingOffset +{ +public: + static const int offset = 0; +}; + +template<> class FloatRoundingOffset +{ +public: + static const int offset = 1; +}; + template inline std::istream& operator >> ( std::istream& in, IStreamToAbbr& toValue ) { @@ -698,7 +714,10 @@ inline std::istream& operator >> ( std::istream& in, IStreamToAbbr& toValue double value = strtod(str.c_str(), NULL); // convert input string to double value *= map.value; // scale depending on given abbreviation/scaling factor - *toValue.dstVal = (T)value; + double roundDir = value < 0 ? 
-1 : ( value > 0 ? 1 : 0 ); + double roundOffset = ( FloatRoundingOffset::value>::offset / 2.0 ); + value += roundDir * roundOffset; + *toValue.dstVal = ( T ) value; return in; } } diff --git a/source/Lib/apputils/Stats.h b/source/Lib/apputils/Stats.h index d1d62b669..8c2176f1e 100644 --- a/source/Lib/apputils/Stats.h +++ b/source/Lib/apputils/Stats.h @@ -54,6 +54,12 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#if defined (_WIN32) || defined (WIN32) || defined (_WIN64) || defined (WIN64) + #include +#elif __linux || __APPLE__ + #include +#endif + #include "vvenc/vvenc.h" @@ -165,10 +171,11 @@ class Stats { } - int init( int framerate, int framescale, int maxFrames, std::string prependString = "" ) + int init( int framerate, int framescale, int maxFrames, vvencMsgLevel verbosity, std::string prependString = "" ) { m_framerate = (framerate/(double)framescale); m_maxFrames = maxFrames; + m_verbosity = verbosity; m_preString = prependString; m_bytes = 0; m_bytesCur = 0; @@ -181,6 +188,12 @@ class Stats m_AUStats[VVENC_P_SLICE].reset(m_framerate); m_AUStats[VVENC_B_SLICE].reset(m_framerate); +#if defined (_WIN32) || defined (WIN32) || defined (_WIN64) || defined (WIN64) + m_istty = _isatty( _fileno(stdout)); +#elif __linux || __APPLE__ + m_istty = isatty( fileno(stdout)); +#endif + return 0; } @@ -207,20 +220,29 @@ class Stats return 0; } - std::string getInfoString() + std::string getInfoString( bool finalInfo = false ) { std::stringstream css; - m_tEnd = std::chrono::steady_clock::now(); + m_tEnd = std::chrono::steady_clock::now(); m_tGlobEnd = std::chrono::steady_clock::now(); if( m_bytesCur ) { - double bitrate = (m_bytesCur*8 * m_framerate / (double)m_framesCur ); - double dTime = (double)std::chrono::duration_cast(m_tEnd-m_tStart).count() / 1000.0; - double dGlobTime = (double)std::chrono::duration_cast(m_tGlobEnd-m_tGlobStart).count() / 1000.0; - double dFps = dTime ? (double)m_framesCur / dTime : 0; - double dFpsAvg = dGlobTime ? 
(double)m_frames / dGlobTime : 0; + double dTime = (double)std::chrono::duration_cast(m_tEnd-m_tStart).count() / 1000.0; + double dGlobTime = (double)std::chrono::duration_cast(m_tGlobEnd-m_tGlobStart).count() / 1000.0; + double bitrateAvg = m_bytes*8 * m_framerate/(double)m_frames / 1000.0; + double dFpsAvg = dGlobTime ? (double)m_frames / dGlobTime : 0; + + double bitrate = finalInfo ? bitrateAvg : (m_bytesCur*8 * m_framerate / (double)m_framesCur/ 1000.0 ); + double dFps = finalInfo ? dFpsAvg : (dTime ? (double)m_framesCur / dTime : 0); + + if( bitrate > (double)m_maxratekbps ) + { + m_maxratekbps = static_cast(bitrate); + } + int setwBR = std::max( 8, (int)std::log10(m_maxratekbps) + 4 ); + if ( m_verbosity <= VVENC_INFO && m_istty ) css << "\r"; css << m_preString << "stats: "; if( m_maxFrames > 0 ) { @@ -234,11 +256,12 @@ class Stats css << " frame= " << std::setfill(' ') << std::setw(4) << m_frames; } - css << " fps= " << std::setfill(' ') << std::setw(4) << dFps << " avg_fps= " << std::setfill(' ') << std::setw(4) << dFpsAvg; - css << std::fixed << std::setprecision(2) << " bitrate= " << std::setfill(' ') << std::setw(7) << bitrate/1000.0 << " kbps"; - - bitrate = m_bytes*8 * m_framerate/(double)m_frames; - css << " avg_bitrate= " << std::setfill(' ') << std::setw(7) << bitrate/1000.0 << " kbps"; + css << std::fixed << std::setprecision(1); + css << " fps= " << std::setfill(' ') << std::setw(5) << dFps; + css << " avg_fps= " << std::setfill(' ') << std::setw(5) << dFpsAvg; + css << std::fixed << std::setprecision(2); + css << " bitrate= " << std::setfill(' ') << std::setw(setwBR) << bitrate << " kbps"; + css << " avg_bitrate= " << std::setfill(' ') << std::setw(setwBR) << bitrateAvg << " kbps"; int sec = std::ceil(dGlobTime); int days = sec/86400; @@ -269,7 +292,9 @@ class Stats css << std::setfill('0') << std::setw(2) << min << "m:"; css << std::setfill('0') << std::setw(2) << sec << "s"; } - css << std::setprecision(-1) << std::endl; + css << 
std::setprecision(-1) << " "; + + if ( m_verbosity > VVENC_INFO || 0 == m_istty ) css << std::endl; } m_bytesCur = 0; @@ -290,7 +315,9 @@ class Stats double dGlobTime = (double)std::chrono::duration_cast(m_tGlobEnd-m_tGlobStart).count() / 1000.0; double dFpsAvg = dGlobTime ? (double)m_frames / dGlobTime : 0; - css << std::endl << m_preString << "stats summary:"; + css << getInfoString( true ); + css << std::endl; + css << m_preString << "stats summary:"; css << " frame= " << m_frames; if( m_maxFrames > 0 ) { @@ -326,10 +353,13 @@ class Stats double m_framerate = 1.0; int m_maxFrames = 0; + vvencMsgLevel m_verbosity = VVENC_INFO; std::string m_preString; + int m_istty = 0; uint64_t m_bytes = 0; uint64_t m_bytesCur = 0; + int m_maxratekbps = 0; int m_frames = 0; int m_framesCur = 0; diff --git a/source/Lib/apputils/VVEncAppCfg.h b/source/Lib/apputils/VVEncAppCfg.h index 9aff214b6..98abe766f 100644 --- a/source/Lib/apputils/VVEncAppCfg.h +++ b/source/Lib/apputils/VVEncAppCfg.h @@ -380,6 +380,16 @@ const std::vector> BitrateAbrevToIntMap = { "bps", 1 } // bit/sec }; +const std::vector> BitrateOrScaleAbrevToIntMap = +{ + { "Mbps", 1000000 }, // mega bit/sec + { "M", 1000000 }, + { "kbps", 1000 }, // kilo bit/sec + { "k", 1000 }, + { "bps", 1 }, // bit/sec + { "x", -16 } // negative value: multiplier of target bitrate, with a fixed-point accuracy of 4 bit +}; + //// ==================================================================================================================== //// string <-> enum //// ==================================================================================================================== @@ -494,8 +504,8 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) IStreamToRefVec toNumTiles ( { &c->m_numTileCols, &c->m_numTileRows }, true, 'x' ); IStreamToFunc toInputFormatBitdepth ( setInputBitDepthAndColorSpace, this, c, &BitColorSpaceToIntMap, YUV420_8 ); - IStreamToAbbr toBitrate ( &c->m_RCTargetBitrate, 
&BitrateAbrevToIntMap ); - IStreamToAbbr toMaxRate ( &c->m_RCMaxBitrate, &BitrateAbrevToIntMap ); + IStreamToAbbr toBitrate ( &c->m_RCTargetBitrate, &BitrateAbrevToIntMap ); + IStreamToAbbr toMaxRate ( &c->m_RCMaxBitrate, &BitrateOrScaleAbrevToIntMap ); IStreamToEnum toDecRefreshType ( &c->m_DecodingRefreshType, &DecodingRefreshTypeToEnumMap ); IStreamToEnum toAud ( &c->m_AccessUnitDelimiter, &FlagToIntMap ); @@ -618,7 +628,8 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("bitrate,b", toBitrate, "bitrate for rate control (0: constant-QP encoding without rate control; otherwise\n" "bits/second; use e.g. 1.5M, 1.5Mbps, 1500k, 1500kbps, 1500000bps, 1500000)") ("maxrate,m", toMaxRate, "approximate maximum instantaneous bitrate for constrained VBR in rate control (0:\n" - "no rate cap; use e.g. 3.5M, 3.5Mbps, 3500k, 3500kbps, 3500000bps, 3500000)") + "no rate cap; use e.g. 3.5M, 3.5Mbps, 3500k, 3500kbps, 3500000bps, 3500000), use suffix 'x' " + "to specify as a multiple of target bitrate") ("passes,p", c->m_RCNumPasses, "number of encoding passes with rate control (1: single-pass, -1, 2: two-pass RC)") ("pass", c->m_RCPass, "rate control pass for two-pass rate control (-1: both, 1: first, 2: second pass)") ("rcstatsfile", m_RCStatsFileName, "rate control statistics file name") @@ -1134,7 +1145,8 @@ int parse( int argc, char* argv[], vvenc_config* c, std::ostream& rcOstr ) ("bitrate", toBitrate, "bitrate for rate control (0: constant-QP encoding without rate control, otherwise " "bits/second; use e.g. 1.5M, 1.5Mbps, 1500k, 1500kbps, 1500000bps, 1500000)") ("maxrate", toMaxRate, "approximate maximum instantaneous bitrate for constrained VBR in rate control (0: " - "no rate cap; use e.g. 3.5M, 3.5Mbps, 3500k, 3500kbps, 3500000bps, 3500000)") + "no rate cap; use e.g. 
3.5M, 3.5Mbps, 3500k, 3500kbps, 3500000bps, 3500000), use suffix 'x' " + "to specify as a multiple of target bitrate") ("qpa", toQPA, "Enable perceptually motivated QP adaptation, XPSNR based (0:off, 1:on)", true) ; } diff --git a/source/Lib/vvenc/CMakeLists.txt b/source/Lib/vvenc/CMakeLists.txt index 96bcfc471..b224d9f76 100644 --- a/source/Lib/vvenc/CMakeLists.txt +++ b/source/Lib/vvenc/CMakeLists.txt @@ -42,15 +42,21 @@ if( VVENC_ENABLE_X86_SIMD ) #file( GLOB SSE42_SRC_FILES "../CommonLib/x86/sse42/*.cpp" ) endif() +if( VVENC_ENABLE_ARM_SIMD ) + file( GLOB ARM_SRC_FILES "../CommonLib/arm/*.cpp" ) + file( GLOB ARM_INC_FILES "../CommonLib/arm/*.h" ) + + file( GLOB ARM_NEON_SRC_FILES "../CommonLib/arm/neon/*.cpp" ) +endif() + # get public/extern include files file( GLOB PUBLIC_INC_FILES "../../../include/${LIB_NAME}/*.h" ) -# get all source files -set( SRC_FILES ${BASE_SRC_FILES} ${X86_SRC_FILES} ) - -# get all include files -file( GLOB PRIVATE_INC_FILES ${BASE_INC_FILES} ${X86_INC_FILES} ) +# get all private include files +set( PRIVATE_INC_FILES ${BASE_INC_FILES} ${X86_INC_FILES} ${ARM_INC_FILES} ) +# get all source files +set( SRC_FILES ${BASE_SRC_FILES} ${X86_SRC_FILES} ${ARM_SRC_FILES} ) set( INC_FILES ${PRIVATE_INC_FILES} ${PUBLIC_INC_FILES} ) # NATVIS files for Visual Studio @@ -69,7 +75,7 @@ add_compile_definitions( ${LIB_NAME_UC}_SOURCE ) # set PRIVATE include directories for all targets in this directory include_directories( $ $ ) -include_directories( . .. ../DecoderLib ../EncoderLib ../CommonLib ../CommonLib/x86 ../apputils ) +include_directories( . .. 
../DecoderLib ../EncoderLib ../CommonLib ../CommonLib/x86 ../CommonLib/arm ../apputils ) include_directories( SYSTEM ../../../thirdparty ) # set common warning flags @@ -122,12 +128,27 @@ if( VVENC_ENABLE_X86_SIMD ) set_target_properties( ${LIB_NAME}_x86_simd PROPERTIES FOLDER lib ) endif() +if( VVENC_ENABLE_ARM_SIMD ) + # set needed compile definitions + set_property( SOURCE ${ARM_NEON_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_NEON ) + + add_library( ${LIB_NAME}_arm_simd OBJECT ${ARM_NEON_SRC_FILES} ) + # NEON is enabled by default for all files, so don't need to disable LTO + # set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES + # INTERPROCEDURAL_OPTIMIZATION OFF + # INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF + # INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF + # INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF ) + + set_target_properties( ${LIB_NAME}_arm_simd PROPERTIES FOLDER lib ) +endif() + # set resource file for MSVC compilers if( MSVC ) set( RESOURCE_FILE ${LIB_NAME}.rc ) endif() -add_library( ${LIB_NAME} ${SRC_FILES} $<$:$> ${INC_FILES} ${NATVIS_FILES} ${RESOURCE_FILE} ) +add_library( ${LIB_NAME} ${SRC_FILES} $<$:$> $<$:$> ${INC_FILES} ${NATVIS_FILES} ${RESOURCE_FILE} ) target_compile_definitions( ${LIB_NAME} PUBLIC $<$,SHARED_LIBRARY>:${LIB_NAME_UC}_DYN_LINK> ) diff --git a/source/Lib/vvenc/vvencCfg.cpp b/source/Lib/vvenc/vvencCfg.cpp index 9765a5045..24a32987e 100644 --- a/source/Lib/vvenc/vvencCfg.cpp +++ b/source/Lib/vvenc/vvencCfg.cpp @@ -762,10 +762,14 @@ VVENC_DECL bool vvenc_init_config_parameter( vvenc_config *c ) vvenc_confirmParameter( c, c->m_pictureTimingSEIEnabled, "Enabling pictureTiming SEI requires rate control" ); vvenc_confirmParameter( c, c->m_RCMaxBitrate > 0, "Specifying a maximum bitrate requires rate control" ); } - else if ( c->m_RCMaxBitrate <= 0 ) + else if ( c->m_RCMaxBitrate == 0 ) { c->m_RCMaxBitrate = INT32_MAX; } + else if( c->m_RCMaxBitrate < 0 ) + { + c->m_RCMaxBitrate = ( -c->m_RCMaxBitrate * c->m_RCTargetBitrate 
+ 8 ) >> 4; + } vvenc_confirmParameter( c, c->m_HdrMode < VVENC_HDR_OFF || c->m_HdrMode > VVENC_SDR_BT470BG, "Sdr/Hdr Mode must be in the range 0 - 8" ); diff --git a/source/Lib/vvenc/vvencimpl.cpp b/source/Lib/vvenc/vvencimpl.cpp index 0f957e853..a12a75f80 100644 --- a/source/Lib/vvenc/vvencimpl.cpp +++ b/source/Lib/vvenc/vvencimpl.cpp @@ -803,23 +803,28 @@ const char* VVEncImpl::setSIMDExtension( const char* simdId ) THROW( "requested SIMD level (" << simdReqStr << ") not supported by current CPU (max " << read_x86_extension_name() << ")." ); } -# if ENABLE_SIMD_OPT_BUFFER +#if ENABLE_SIMD_OPT_BUFFER + #if defined( TARGET_SIMD_X86 ) g_pelBufOP.initPelBufOpsX86(); -# endif -# if ENABLE_SIMD_TRAFO - g_tCoeffOps.initTCoeffOpsX86(); -# endif + #endif + #if defined( TARGET_SIMD_ARM ) + g_pelBufOP.initPelBufOpsARM(); + #endif +#endif +#if ENABLE_SIMD_TRAFO + g_tCoeffOps.initTCoeffOpsX86(); +#endif return read_x86_extension_name().c_str(); } -# if HANDLE_EXCEPTION +#if HANDLE_EXCEPTION catch( Exception& e ) { MsgLog msg; msg.log( VVENC_ERROR, "\n%s\n", e.what() ); return nullptr; } -# endif // HANDLE_EXCEPTION +#endif // HANDLE_EXCEPTION #else // !TARGET_SIMD_X86 if( !simdReqStr.empty() && simdReqStr != "SCALAR" ) {