diff --git a/superbench/benchmarks/micro_benchmarks/cuda_common.cmake b/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
index d5cadba9f..a2bf7ee81 100644
--- a/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
+++ b/superbench/benchmarks/micro_benchmarks/cuda_common.cmake
@@ -33,6 +33,6 @@ if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
     list(APPEND NVCC_ARCHS_SUPPORTED 86)
   endif()
   if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.8)
-    list(APPEND NVCC_ARCHS_SUPPORTED 90)
+    list(APPEND NVCC_ARCHS_SUPPORTED 89 90)
   endif()
 endif()
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py b/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py
index 8982905cb..fac19a8b7 100644
--- a/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py
@@ -47,6 +47,8 @@ def __init__(self, name, parameters=''):
         # Skip FP64 for RTX Turing/Ampere and Tesla T4/GA10x due to very limited FP64 TFLOP rate
         self.__kernel_map[7.5] = {k: self.__kernel_map[7.0][k] for k in self.__kernel_map[7.0] if 'fp64' not in k}
         self.__kernel_map[8.6] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'fp64' not in k}
+        # Skip FP64 for Ada Lovelace L4/L40 due to very limited FP64 TFLOP rate
+        self.__kernel_map[8.9] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'fp64' not in k}
         # Skip INT4 for Hopper due to no native CUDA/Tensor Cores
         self.__kernel_map[9.0] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
         self.__parse_logline = [
diff --git a/third_party/Makefile b/third_party/Makefile
index 69623af8d..0a47bd455 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -33,7 +33,7 @@ sb_micro_path:
 # Build cutlass.
 cuda_cutlass:
 ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
-	$(eval ARCHS := "70;75;80;86;90")
+	$(eval ARCHS := "70;75;80;86;89;90")
 else
 	$(eval ARCHS := "70;75;80;86")
 endif
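
For context, the Python change above extends a dictionary-comprehension pattern: each compute capability keys a dict of GEMM kernels, and newer capabilities are derived from the 8.0 entry by filtering out unsupported precisions. Below is a minimal, self-contained sketch of that pattern; the kernel names are illustrative placeholders, not SuperBench's real tables.

# Illustrative sketch of the per-capability kernel filtering pattern.
# Kernel names here are hypothetical placeholders, not the benchmark's real map.
kernel_map = {
    8.0: {'fp64': 'dgemm', 'fp32': 'sgemm', 'fp16_tc': 'hgemm_tc', 'int4_tc': 'igemm_int4_tc'},
}
# Ada Lovelace (sm_89): drop FP64 kernels, since FP64 throughput is too low to benchmark.
kernel_map[8.9] = {k: v for k, v in kernel_map[8.0].items() if 'fp64' not in k}
# Hopper (sm_90): drop INT4 Tensor Core kernels instead.
kernel_map[9.0] = {k: v for k, v in kernel_map[8.0].items() if 'int4_tc' not in k}

print(kernel_map[8.9])  # {'fp32': 'sgemm', 'fp16_tc': 'hgemm_tc', 'int4_tc': 'igemm_int4_tc'}
print(kernel_map[9.0])  # {'fp64': 'dgemm', 'fp32': 'sgemm', 'fp16_tc': 'hgemm_tc'}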