From 479491279e724619b1f48c1a3c40eeb081a0a872 Mon Sep 17 00:00:00 2001 From: pdr Date: Wed, 6 Nov 2024 15:16:12 -0800 Subject: [PATCH] Dockerfile - Add support for arm64 build (#660) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for arm64 build: - Updated dockerfile for arm64 build - extend cpu stream compilation for neoverse  - handle onnxruntime-gpu installation - third party builds filtering based on arch - disable cuda decode perf build for non x86 --- .github/workflows/build-image.yml | 6 +- dockerfile/cuda12.4.dockerfile | 46 ++-- setup.py | 4 +- .../cpu_stream_performance.py | 4 +- .../cuda_decode_performance/CMakeLists.txt | 226 +++++++++--------- third_party/Makefile | 13 +- third_party/stream-tests/Makefile | 34 ++- 7 files changed, 187 insertions(+), 146 deletions(-) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index bc6a6cf4a..fdfe9114c 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -28,21 +28,25 @@ jobs: - name: cuda12.4 dockerfile: cuda12.4 tags: superbench/main:cuda12.4 + platforms: linux/amd64 # TODO: linux/arm64 runner: [self-hosted] build_args: "NUM_MAKE_JOBS=16" - name: cuda12.2 dockerfile: cuda12.2 tags: superbench/main:cuda12.2 + platforms: linux/amd64 runner: [self-hosted] build_args: "NUM_MAKE_JOBS=16" - name: cuda11.1.1 dockerfile: cuda11.1.1 tags: superbench/main:cuda11.1.1,superbench/superbench:latest + platforms: linux/amd64 runner: ubuntu-latest build_args: "NUM_MAKE_JOBS=8" - name: rocm6.2 dockerfile: rocm6.2.x tags: superbench/main:rocm6.2 + platforms: linux/amd64 runner: [self-hosted] build_args: "NUM_MAKE_JOBS=16" steps: @@ -125,7 +129,7 @@ jobs: id: docker_build uses: docker/build-push-action@v2 with: - platforms: linux/amd64 + platforms: ${{ matrix.platforms }} context: . file: ${{ steps.metadata.outputs.dockerfile }} push: ${{ github.event_name != 'pull_request' }} diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile index 3ec82f39d..560f0908a 100644 --- a/dockerfile/cuda12.4.dockerfile +++ b/dockerfile/cuda12.4.dockerfile @@ -19,6 +19,7 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3 LABEL maintainer="SuperBench" ENV DEBIAN_FRONTEND=noninteractive + RUN apt-get update && \ apt-get install -y --no-install-recommends \ autoconf \ @@ -60,11 +61,13 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* /tmp/* ARG NUM_MAKE_JOBS= +ARG TARGETPLATFORM +ARG TARGETARCH # Install Docker ENV DOCKER_VERSION=20.10.8 -RUN cd /tmp && \ - wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ +RUN TARGETARCH_HW=$(uname -m) && \ + wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ rm docker.tgz @@ -80,40 +83,43 @@ RUN mkdir -p /root/.ssh && \ # Install OFED ENV OFED_VERSION=23.07-0.5.1.2 -RUN cd /tmp && \ - wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ - tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ - MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ +RUN TARGETARCH_HW=$(uname -m) && \ + cd /tmp && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \ + MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* # Install HPC-X ENV HPCX_VERSION=v2.18 -RUN cd /opt && \ +RUN TARGETARCH_HW=$(uname -m) && \ + cd /opt && \ rm -rf hpcx && \ - wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64.tbz -O hpcx.tbz && \ + wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW}.tbz -O hpcx.tbz && \ tar xf hpcx.tbz && \ - mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64 hpcx && \ + mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW} hpcx && \ rm hpcx.tbz -# Install Intel MLC -RUN cd /tmp && \ +# Installs specific to amd64 platform +RUN if [ "$TARGETARCH" = "amd64" ]; then \ + # Install Intel MLC + cd /tmp && \ wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \ tar xzf mlc.tgz Linux/mlc && \ cp ./Linux/mlc /usr/local/bin/ && \ - rm -rf ./Linux mlc.tgz - -# Install AOCC compiler -RUN cd /tmp && \ + rm -rf ./Linux mlc.tgz && \ + # Install AOCC compiler wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \ - rm -rf aocc-compiler-4.0.0_1_amd64.deb - -# Install AMD BLIS -RUN cd /tmp && \ + rm -rf aocc-compiler-4.0.0_1_amd64.deb && \ + # Install AMD BLIS wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \ tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \ mv amd-blis /opt/AMD && \ - rm -rf aocl-blis-linux-aocc-4.0.tar.gz + rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \ + else \ + echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \ + fi # Install NCCL 2.23.4 RUN cd /tmp && \ diff --git a/setup.py b/setup.py index 686bef0b9..93d53639c 100644 --- a/setup.py +++ b/setup.py @@ -215,8 +215,8 @@ def run(self): ], 'ort': [ 'onnx>=1.10.2', - 'onnxruntime-gpu==1.10.0; python_version<"3.10"', - 'onnxruntime-gpu; python_version>="3.10"', + 'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine == "x86_64"', + 'onnxruntime-gpu; python_version>="3.10" and platform_machine == "x86_64"', ], 'nvidia': ['py3nvml>=0.2.6'], 'amd': ['amdsmi'], diff --git a/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py b/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py index 6045e8868..57b4eb7db 100644 --- a/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py +++ b/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py @@ -23,7 +23,7 @@ def __init__(self, name, parameters=''): super().__init__(name, parameters) self._bin_name = 'streamZen3.exe' - self.__cpu_arch = ['other', 'zen3', 'zen4'] + self.__cpu_arch = ['other', 'zen3', 'zen4', 'neo2'] def add_parser_arguments(self): """Add the specified arguments.""" @@ -80,6 +80,8 @@ def _preprocess(self): exe = 'streamZen3.exe' elif self._args.cpu_arch == 'zen4': exe = 'streamZen4.exe' + elif self._args.cpu_arch == 'neo2': + exe = 'streamNeo2.exe' else: exe = 'streamx86.exe' diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt index 83cb15067..1022aed3d 100644 --- a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt @@ -4,114 +4,120 @@ cmake_minimum_required(VERSION 3.18) project(cuda_decode_performance) -find_package(CUDA QUIET) -if(CUDA_FOUND) - set(CMAKE_CXX_STANDARD 17) - set(CMAKE_CXX_STANDARD_REQUIRED ON) - - set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) - set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) - set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) - set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) - set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) - - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - find_package(PkgConfig REQUIRED) - pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) - pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) - pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) - pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) - - set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) - find_library(AVCODEC_LIBRARY NAMES avcodec - HINTS - ${PC_AVCODEC_LIBDIR} - ${PC_AVCODEC_LIBRARY_DIRS} - ) - find_library(AVFORMAT_LIBRARY NAMES avformat - HINTS - ${PC_AVFORMAT_LIBDIR} - ${PC_AVFORMAT_LIBRARY_DIRS} - ) - find_library(AVUTIL_LIBRARY NAMES avutil - HINTS - ${PC_AVUTIL_LIBDIR} - ${PC_AVUTIL_LIBRARY_DIRS} - ) - find_library(SWRESAMPLE_LIBRARY NAMES swresample - HINTS - ${PC_SWRESAMPLE_LIBDIR} - ${PC_SWRESAMPLE_LIBRARY_DIRS} - ) - set(AVCODEC_LIB ${AVCODEC_LIBRARY}) - set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) - set(AVUTIL_LIB ${AVUTIL_LIBRARY}) - set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) - endif() - - set(APP_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp - ) - - set(NV_DEC_SOURCES - ${NV_DEC_DIR}/NvDecoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp - ) - - set(NV_DEC_HDRS - ${NV_DEC_DIR}/NvDecoder.h - ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h - ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h - ${NVCODEC_UTILS_DIR}/NvCodecUtils.h - ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h - ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h - ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h - ) - - source_group( "headers" FILES ${NV_DEC_HDRS} ) - source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) - set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") - find_package(CUDA) - set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") - if ( CMAKE_COMPILER_IS_GNUCC ) - if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) - list(APPEND CUDA_NVCC_FLAGS -std=c++11) - endif() - endif() - - # Check if the file exists - if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) - execute_process( - COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so - RESULT_VARIABLE result - ) - if(result) - message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") - endif() - endif () - - find_library(CUVID_LIB nvcuvid - HINTS - "/usr/local/lib/" - "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" - ) - - cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) - - set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} - ${NVCODEC_PUBLIC_INTERFACE_DIR} - ${NVCODEC_UTILS_DIR} - ${NV_CODEC_DIR} - ${NV_APPDEC_COMMON_DIR} - ${NV_FFMPEG_HDRS} - ${THIRD_PARTY_SAMPLE_DIR} - ) - - target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} - ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) - - install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib) +# Check architecture +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") + message(WARNING "Skipping Cuda decode Performance build. This build only supports x86_64 arch.") +else() + find_package(CUDA QUIET) + if(CUDA_FOUND) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + + set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) + set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) + set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) + set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) + set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + find_package(PkgConfig REQUIRED) + pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) + pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) + pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) + pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) + + set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) + find_library(AVCODEC_LIBRARY NAMES avcodec + HINTS + ${PC_AVCODEC_LIBDIR} + ${PC_AVCODEC_LIBRARY_DIRS} + ) + find_library(AVFORMAT_LIBRARY NAMES avformat + HINTS + ${PC_AVFORMAT_LIBDIR} + ${PC_AVFORMAT_LIBRARY_DIRS} + ) + find_library(AVUTIL_LIBRARY NAMES avutil + HINTS + ${PC_AVUTIL_LIBDIR} + ${PC_AVUTIL_LIBRARY_DIRS} + ) + find_library(SWRESAMPLE_LIBRARY NAMES swresample + HINTS + ${PC_SWRESAMPLE_LIBDIR} + ${PC_SWRESAMPLE_LIBRARY_DIRS} + ) + set(AVCODEC_LIB ${AVCODEC_LIBRARY}) + set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) + set(AVUTIL_LIB ${AVUTIL_LIBRARY}) + set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) + endif() + + set(APP_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp + ) + + set(NV_DEC_SOURCES + ${NV_DEC_DIR}/NvDecoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp + ) + + set(NV_DEC_HDRS + ${NV_DEC_DIR}/NvDecoder.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h + ${NVCODEC_UTILS_DIR}/NvCodecUtils.h + ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h + ) + + source_group( "headers" FILES ${NV_DEC_HDRS} ) + source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) + set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") + find_package(CUDA) + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") + if ( CMAKE_COMPILER_IS_GNUCC ) + if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) + list(APPEND CUDA_NVCC_FLAGS -std=c++11) + endif() + endif() + + # Check if the file exists + if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) + execute_process( + COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so + RESULT_VARIABLE result + ) + if(result) + message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") + endif() + endif () + + find_library(CUVID_LIB nvcuvid + HINTS + "/usr/local/lib/" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" + ) + + cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) + + set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} + ${NVCODEC_PUBLIC_INTERFACE_DIR} + ${NVCODEC_UTILS_DIR} + ${NV_CODEC_DIR} + ${NV_APPDEC_COMMON_DIR} + ${NV_FFMPEG_HDRS} + ${THIRD_PARTY_SAMPLE_DIR} + ) + + target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} + ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) + + install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib) + endif() + endif() diff --git a/third_party/Makefile b/third_party/Makefile index 7abac4fb4..63ca48f36 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -18,14 +18,19 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm -# Build all targets. +# Build targets. all: cuda rocm cuda_with_msccl: cuda cuda_msccl cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm -cpu: common cpu_perftest -common: cpu_hpl cpu_stream fio +cpu: common cpu_perftest cpu_stream +common: fio + +# non aarch64 specific targets +ifneq ($(shell uname -m), aarch64) +common: fio cpu_hpl directx_amd: directx_amf_encoding_latency +endif # Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed. sb_micro_path: @@ -59,7 +64,7 @@ else endif if [ -d cuda-samples ]; then rm -rf cuda-samples; fi git clone -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git - cd ./$(TEST_PATH) && make clean && make TARGET_ARCH=x86_64 SMS=$(ARCHS) + cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS) cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/ # Build nccl-tests from commit 8274cb4 of default branch. diff --git a/third_party/stream-tests/Makefile b/third_party/stream-tests/Makefile index a5ed5ff35..a652defd9 100644 --- a/third_party/stream-tests/Makefile +++ b/third_party/stream-tests/Makefile @@ -1,16 +1,27 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang -CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 -GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000 -ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 -ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 -GEN_OUTPUT= streamx86.exe -ZEN3_OUTPUT= streamZen3.exe -ZEN4_OUTPUT= streamZen4.exe +GENFLAGS := -DSTREAM_ARRAY_SIZE=400000000 +ZEN3FLAGS := -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 +ZEN4FLAGS := -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 +NEO2FLAGS := -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2 +GEN_OUTPUT := streamx86.exe +ZEN3_OUTPUT := streamZen3.exe +ZEN4_OUTPUT := streamZen4.exe +NEO2_OUTPUT := streamNeo2.exe + +ARCH := $(shell uname -m) + +ifeq ($(ARCH), aarch64) +CFLAGS := -Ofast -fopenmp -DNTIMES=200 +CC := gcc +all: NEO2 +else +CC := /opt/AMD/aocc-compiler-4.0.0/bin/clang +CFLAGS := -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 all: ZEN3 ZEN4 X86 +endif ZEN3: stream.c $(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT) @@ -18,6 +29,13 @@ ZEN4: $(CC) $(CFLAGS) $(ZEN4FLAGS) stream.c -o $(ZEN4_OUTPUT) X86: $(CC) $(CFLAGS) $(GENFLAGS) stream.c -o $(GEN_OUTPUT) +NEO2: + $(CC) $(CFLAGS) $(NEO2FLAGS) stream.c -o $(NEO2_OUTPUT) +ifeq ($(ARCH), aarch64) +clean: + rm $(NEO2_OUTPUT) +else clean: rm $(GEN_OUTPUT) $(ZEN3_OUTPUT) $(ZEN4_OUTPUT) +endif