diff --git a/setup.py b/setup.py index 13bf9d044..93d53639c 100644 --- a/setup.py +++ b/setup.py @@ -215,8 +215,9 @@ def run(self): ], 'ort': [ 'onnx>=1.10.2', - 'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine != "aarch64"', - 'onnxruntime-gpu; python_version>="3.10" and platform_machine != "aarch64"', + # Windows reports x86-64 as "AMD64"; accept both spellings so the extra still resolves there. + 'onnxruntime-gpu==1.10.0; python_version<"3.10" and (platform_machine == "x86_64" or platform_machine == "AMD64")', + 'onnxruntime-gpu; python_version>="3.10" and (platform_machine == "x86_64" or platform_machine == "AMD64")', ], 'nvidia': ['py3nvml>=0.2.6'], 'amd': ['amdsmi'], diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt index 907f616fa..1022aed3d 100644 --- a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt @@ -4,121 +4,120 @@ cmake_minimum_required(VERSION 3.18) project(cuda_decode_performance) - - # Check architecture - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") - message(WARNING "Skipping Cuda decode Performance build. 
This build only supports x86_64 arch.") - else() - find_package(CUDA QUIET) - if(CUDA_FOUND) - set(CMAKE_CXX_STANDARD 17) - set(CMAKE_CXX_STANDARD_REQUIRED ON) - - set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) - set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) - set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) - set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) - set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) - - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - find_package(PkgConfig REQUIRED) - pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) - pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) - pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) - pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) - - set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) - find_library(AVCODEC_LIBRARY NAMES avcodec - HINTS - ${PC_AVCODEC_LIBDIR} - ${PC_AVCODEC_LIBRARY_DIRS} - ) - find_library(AVFORMAT_LIBRARY NAMES avformat - HINTS - ${PC_AVFORMAT_LIBDIR} - ${PC_AVFORMAT_LIBRARY_DIRS} - ) - find_library(AVUTIL_LIBRARY NAMES avutil - HINTS - ${PC_AVUTIL_LIBDIR} - ${PC_AVUTIL_LIBRARY_DIRS} - ) - find_library(SWRESAMPLE_LIBRARY NAMES swresample - HINTS - ${PC_SWRESAMPLE_LIBDIR} - ${PC_SWRESAMPLE_LIBRARY_DIRS} - ) - set(AVCODEC_LIB ${AVCODEC_LIBRARY}) - set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) - set(AVUTIL_LIB ${AVUTIL_LIBRARY}) - set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) - endif() - - set(APP_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp - ) - - set(NV_DEC_SOURCES - ${NV_DEC_DIR}/NvDecoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp - ) - - set(NV_DEC_HDRS - ${NV_DEC_DIR}/NvDecoder.h - ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h - ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h - ${NVCODEC_UTILS_DIR}/NvCodecUtils.h - ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h - ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h - 
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h - ) - - source_group( "headers" FILES ${NV_DEC_HDRS} ) - source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) - set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") - find_package(CUDA) - set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") - if ( CMAKE_COMPILER_IS_GNUCC ) - if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) - list(APPEND CUDA_NVCC_FLAGS -std=c++11) - endif() - endif() - - # Check if the file exists - if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) - execute_process( - COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so - RESULT_VARIABLE result - ) - if(result) - message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") - endif() - endif () - - find_library(CUVID_LIB nvcuvid - HINTS - "/usr/local/lib/" - "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" - ) - - cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) - - set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} - ${NVCODEC_PUBLIC_INTERFACE_DIR} - ${NVCODEC_UTILS_DIR} - ${NV_CODEC_DIR} - ${NV_APPDEC_COMMON_DIR} - ${NV_FFMPEG_HDRS} - ${THIRD_PARTY_SAMPLE_DIR} - ) - - target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} - ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) - - install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib) - endif() - - endif() +# Check architecture +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") + message(WARNING "Skipping Cuda decode Performance build. 
This build only supports x86_64 arch.") +else() + find_package(CUDA QUIET) + if(CUDA_FOUND) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + + set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) + set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) + set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) + set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) + set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + find_package(PkgConfig REQUIRED) + pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) + pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) + pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) + pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) + + set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) + find_library(AVCODEC_LIBRARY NAMES avcodec + HINTS + ${PC_AVCODEC_LIBDIR} + ${PC_AVCODEC_LIBRARY_DIRS} + ) + find_library(AVFORMAT_LIBRARY NAMES avformat + HINTS + ${PC_AVFORMAT_LIBDIR} + ${PC_AVFORMAT_LIBRARY_DIRS} + ) + find_library(AVUTIL_LIBRARY NAMES avutil + HINTS + ${PC_AVUTIL_LIBDIR} + ${PC_AVUTIL_LIBRARY_DIRS} + ) + find_library(SWRESAMPLE_LIBRARY NAMES swresample + HINTS + ${PC_SWRESAMPLE_LIBDIR} + ${PC_SWRESAMPLE_LIBRARY_DIRS} + ) + set(AVCODEC_LIB ${AVCODEC_LIBRARY}) + set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) + set(AVUTIL_LIB ${AVUTIL_LIBRARY}) + set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) + endif() + + set(APP_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp + ) + + set(NV_DEC_SOURCES + ${NV_DEC_DIR}/NvDecoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp + ) + + set(NV_DEC_HDRS + ${NV_DEC_DIR}/NvDecoder.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h + ${NVCODEC_UTILS_DIR}/NvCodecUtils.h + ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h + 
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h + ) + + source_group( "headers" FILES ${NV_DEC_HDRS} ) + source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) + set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") + find_package(CUDA) + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") + if ( CMAKE_COMPILER_IS_GNUCC ) + if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) + list(APPEND CUDA_NVCC_FLAGS -std=c++11) + endif() + endif() + + # Check if the file exists + if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) + execute_process( + COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so + RESULT_VARIABLE result + ) + if(result) + message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") + endif() + endif () + + find_library(CUVID_LIB nvcuvid + HINTS + "/usr/local/lib/" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" + ) + + cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) + + set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} + ${NVCODEC_PUBLIC_INTERFACE_DIR} + ${NVCODEC_UTILS_DIR} + ${NV_CODEC_DIR} + ${NV_APPDEC_COMMON_DIR} + ${NV_FFMPEG_HDRS} + ${THIRD_PARTY_SAMPLE_DIR} + ) + + target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} + ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) + + install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib) + endif() + +endif() diff --git a/third_party/Makefile b/third_party/Makefile index 383a206dd..63ca48f36 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -19,19 +19,16 @@ 
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm # Build targets. -ifeq ($(shell uname -m), aarch64) -all: cuda -cuda_with_msccl: cuda cuda_msccl -cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_gpuburn megatron_lm megatron_deepspeed -cpu: common cpu_perftest -common: cpu_stream fio -else all: cuda rocm cuda_with_msccl: cuda cuda_msccl cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm -cpu: common cpu_perftest -common: cpu_hpl cpu_stream fio +cpu: common cpu_perftest cpu_stream +common: fio + +# non aarch64 specific targets +ifneq ($(shell uname -m), aarch64) +common: fio cpu_hpl directx_amd: directx_amf_encoding_latency endif diff --git a/third_party/stream-tests/Makefile b/third_party/stream-tests/Makefile index 8a86c0c59..a652defd9 100644 --- a/third_party/stream-tests/Makefile +++ b/third_party/stream-tests/Makefile @@ -1,29 +1,28 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000 -ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 -ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 -NEO2FLAGS= -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2 +GENFLAGS := -DSTREAM_ARRAY_SIZE=400000000 +ZEN3FLAGS := -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 +ZEN4FLAGS := -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 +NEO2FLAGS := -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2 -GEN_OUTPUT= streamx86.exe -ZEN3_OUTPUT= streamZen3.exe -ZEN4_OUTPUT= streamZen4.exe -NEO2_OUTPUT= streamNeo2.exe +GEN_OUTPUT := streamx86.exe +ZEN3_OUTPUT := streamZen3.exe +ZEN4_OUTPUT := streamZen4.exe +NEO2_OUTPUT := streamNeo2.exe ARCH := $(shell uname -m) ifeq ($(ARCH), aarch64) -CFLAGS = -Ofast -fopenmp -DNTIMES=200 -CC=gcc +CFLAGS := -Ofast -fopenmp -DNTIMES=200 +CC := gcc all: NEO2 else -CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang -CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 +CC := /opt/AMD/aocc-compiler-4.0.0/bin/clang +CFLAGS := -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 all: ZEN3 ZEN4 X86 endif - ZEN3: stream.c $(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT) ZEN4: