From 3459eacad4ef4bc0bc193bbe51d05d87a8fbf2d7 Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Wed, 30 Oct 2024 11:40:19 -0700
Subject: [PATCH 1/6] Init cpu copy.

---
 .../micro_benchmarks/cpu_copy_performance.py  | 113 +++++++
 .../cpu_copy_performance/CMakeLists.txt       |  44 +++
 .../cpu_copy_performance/cpu_copy.cu          | 289 ++++++++++++++++++
 3 files changed, 446 insertions(+)
 create mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
 create mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
 create mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu

diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
new file mode 100644
index 000000000..3b4d52c6d
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
@@ -0,0 +1,113 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the CPU Copy Bandwidth Performance benchmark."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class CpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
+    """The CPU copy bandwidth performance benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'cpu_copy'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            default=256 * 1024**2,
+            required=False,
+            help='Size of data buffer in bytes.',
+        )
+
+        self._parser.add_argument(
+            '--num_warm_up',
+            type=int,
+            default=20,
+            required=False,
+            help='Number of warm up rounds.',
+        )
+
+        self._parser.add_argument(
+            '--num_loops',
+            type=int,
+            default=100,
+            required=False,
+            help='Number of data buffer copies performed.',
+        )
+
+        self._parser.add_argument(
+            '--check_data',
+            action='store_true',
+            help='Enable data checking.',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeeds.
+        """
+        if not super()._preprocess():
+            return False
+
+        # TODO: enable hugepages?
+
+        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
+
+        args = '--size %d --num_warm_up %d --num_loops %d' % (
+            self._args.size, self._args.num_warm_up, self._args.num_loops
+        )
+
+        if self._args.check_data:
+            args += ' --check_data'
+
+        self._commands = ['%s %s' % (self.__bin_path, args)]
+
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
+
+        try:
+            for output_line in raw_output.strip().splitlines():
+                self._result.add_result(output_line.strip())
+        except BaseException as e:
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
+                    self._curr_run_index, self._name, raw_output, str(e)
+                )
+            )
+            return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('cpu-copy-bw', CpuCopyBwBenchmark)
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
new file mode 100644
index 000000000..2929afa50
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.18)
+
+project(cpu_copy LANGUAGES CXX)
+
+find_package(CUDAToolkit QUIET)
+
+# CUDA environment
+if(CUDAToolkit_FOUND)
+    message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
+
+    include(../cuda_common.cmake)
+    add_executable(cpu_copy cpu_copy.cu)
+    set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
+    target_link_libraries(cpu_copy numa)
+else()
+    # ROCm environment
+    include(../rocm_common.cmake)
+    find_package(hip QUIET)
+    if(hip_FOUND)
+        message(STATUS "Found ROCm: " ${HIP_VERSION})
+
+        # Convert CUDA code to HIP code in cpp
+        execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
+
+        # Link HIP device lib
+        add_executable(cpu_copy cpu_copy.cpp)
+
+        include(CheckSymbolExists)
+        check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
+        if(${HIP_UNCACHED_MEMORY})
+            target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
+        endif()
+
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+        target_link_libraries(cpu_copy numa hip::device)
+    else()
+        message(FATAL_ERROR "No CUDA or ROCm environment found.")
+    endif()
+endif()
+
+install(TARGETS cpu_copy RUNTIME DESTINATION bin)
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
new file mode 100644
index 000000000..0c205a3d5
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
@@ -0,0 +1,289 @@
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <cstring> // for memcpy
+#include <iomanip> // for setting precision
+#include <iostream>
+
+#include <getopt.h>
+#include <numa.h>
+#include <stdio.h>
+
+// Options accepted by this program.
+struct Opts {
+    // Data buffer size for copy benchmark.
+    uint64_t size = 0;
+
+    // Number of warm up rounds to run.
+    uint64_t num_warm_up = 0;
+
+    // Number of loops to run.
+    uint64_t num_loops = 0;
+
+    // Whether to check data after copy.
+    bool check_data = false;
+};
+
+/**
+ * @brief Print the usage instructions for this program.
+ *
+ * This function outputs the correct way to execute the program,
+ * including any necessary command-line arguments and their descriptions.
+ */
+void PrintUsage() {
+    std::cout << "Usage: cpu_copy "
+              << "--size <size> "
+              << "--num_warm_up <num_warm_up> "
+              << "--num_loops <num_loops> "
+              << "[--check_data]" << std::endl;
+}
+
+/**
+ * @brief Checks if the system has CPUs available for a given NUMA node.
+ *
+ * This function determines whether there are CPUs available for the specified
+ * NUMA (Non-Uniform Memory Access) node. NUMA nodes are used in systems with
+ * multiple processors to optimize memory access times.
+ *
+ * @param node The identifier of the NUMA node to check.
+ * @return true if there are CPUs available for the specified NUMA node, false otherwise.
+ */
+bool HasCPUsForNumaNode(int node) {
+    struct bitmask *bm = numa_allocate_cpumask();
+    if (numa_node_to_cpus(node, bm) < 0) {
+        perror("numa_node_to_cpus");
+        numa_bitmask_free(bm);
+        return false; // On error
+    }
+
+    // The node has usable CPUs if any bit in its CPU mask is set.
+    bool has_cpus = numa_bitmask_weight(bm) > 0;
+    numa_bitmask_free(bm);
+    return has_cpus;
+}
+
+/**
+ * @brief Parses command-line options for the CPU copy performance benchmark.
+ *
+ * This function processes the command-line arguments provided to the benchmark
+ * and sets the appropriate configuration options based on the input.
+ *
+ * @param argc The number of command-line arguments.
+ * @param argv The array of command-line arguments.
+ * @param opts Pointer to the Opts structure that receives the parsed options.
+ * @return An integer indicating the success or failure of the option parsing.
+ *         Returns 0 on success, and a non-zero value on failure.
+ */
+int ParseOpts(int argc, char **argv, Opts *opts) {
+    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
+    const struct option options[] = {
+        {"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
+        {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
+        {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
+        {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)},
+        {nullptr, 0, nullptr, 0}};
+    int getopt_ret = 0;
+    int opt_idx = 0;
+    bool size_specified = false;
+    bool num_warm_up_specified = false;
+    bool num_loops_specified = false;
+    bool parse_err = false;
+
+    while (true) {
+        getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
+        if (getopt_ret == -1) {
+            if (!size_specified || !num_warm_up_specified || !num_loops_specified) {
+                parse_err = true;
+            }
+            break;
+        } else if (getopt_ret == '?') {
+            parse_err = true;
+            break;
+        }
+        switch (opt_idx) {
+        case static_cast<int>(OptIdx::kSize):
+            if (1 != sscanf(optarg, "%lu", &(opts->size))) {
+                std::cerr << "Invalid size: " << optarg << std::endl;
+                parse_err = true;
+            } else {
+                size_specified = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kNumWarmUp):
+            if (1 != sscanf(optarg, "%lu", &(opts->num_warm_up))) {
+                std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
+                parse_err = true;
+            } else {
+                num_warm_up_specified = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kNumLoops):
+            if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) {
+                std::cerr << "Invalid num_loops: " << optarg << std::endl;
+                parse_err = true;
+            } else {
+                num_loops_specified = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kEnableCheckData):
+            opts->check_data = true;
+            break;
+        default:
+            parse_err = true;
+        }
+        if (parse_err) {
+            break;
+        }
+    }
+
+    if (parse_err) {
+        PrintUsage();
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * @brief Benchmark the memory copy performance between two NUMA nodes.
+ *
+ * This function measures the performance of copying memory from a source NUMA node to a destination NUMA node.
+ *
+ * @param src_node The source NUMA node from which memory will be copied.
+ * @param dst_node The destination NUMA node to which memory will be copied.
+ * @param opts A reference to an Opts structure containing various options and configurations for the benchmark.
+ * @return The time taken by the copy in nanoseconds, 0 on setup failure, or -1 if the data check fails.
+ */
+double BenchmarkNUMACopy(int src_node, int dst_node, Opts &opts) {
+    int ret = 0;
+
+    // Set CPU affinity to the source NUMA node
+    ret = numa_run_on_node(src_node);
+    if (ret != 0) {
+        std::cerr << "Failed to set CPU affinity to NUMA node " << src_node << std::endl;
+        return 0;
+    }
+
+    // Allocate memory on the source and destination NUMA nodes
+    char *src = (char *)numa_alloc_onnode(opts.size, src_node);
+    if (!src) {
+        std::cerr << "Memory allocation failed on node " << src_node << std::endl;
+        return 0;
+    }
+
+    char *dst = (char *)numa_alloc_onnode(opts.size, dst_node);
+    if (!dst) {
+        std::cerr << "Memory allocation failed on node " << dst_node << std::endl;
+        numa_free(src, opts.size);
+        return 0;
+    }
+
+    // Initialize the source memory with some data
+    memset(src, 1, opts.size);
+
+    // Measure the time taken for memcpy between nodes
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // Perform the memory copy
+    memcpy(dst, src, opts.size);
+
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;
+
+    // Calculate the total copy time in nanoseconds
+    double total_time_ns = diff.count() * 1e9; // Convert seconds to nanoseconds
+
+    if (opts.check_data) {
+        // Check the data integrity after the copy, before the buffers are freed
+        if (memcmp(src, dst, opts.size) != 0) {
+            std::cerr << "Data integrity check failed for copy from node " << src_node << " to node " << dst_node
+                      << std::endl;
+            numa_free(src, opts.size);
+            numa_free(dst, opts.size);
+            return -1;
+        }
+    }
+
+    // Free the allocated memory
+    numa_free(src, opts.size);
+    numa_free(dst, opts.size);
+
+    return total_time_ns;
+}
+
+/**
+ * @brief Runs the CPU copy benchmark between a pair of NUMA nodes.
+ *
+ * This function runs the warm up rounds followed by the timed copy loops between the given source and
+ * destination NUMA nodes, and returns the maximum copy time observed across the timed loops.
+ *
+ * @param src_node The source NUMA node from which data will be copied.
+ * @param dst_node The destination NUMA node to which data will be copied.
+ * @param opts A reference to an Opts object containing various options and configurations for the benchmark.
+ * @return The maximum copy time in nanoseconds observed across the timed loops.
+ */
+double RunCPUCopyBenchmark(int src_node, int dst_node, Opts &opts) {
+    double max_time_ns = 0;
+
+    // Run warm up rounds
+    for (uint64_t i = 0; i < opts.num_warm_up; i++) {
+        BenchmarkNUMACopy(src_node, dst_node, opts);
+    }
+
+    for (uint64_t i = 0; i < opts.num_loops; i++) {
+        double time_used_ns = BenchmarkNUMACopy(src_node, dst_node, opts);
+        max_time_ns = std::max(max_time_ns, time_used_ns);
+    }
+
+    return max_time_ns;
+}
+
+int main(int argc, char **argv) {
+    Opts opts;
+    int ret = -1;
+    ret = ParseOpts(argc, argv, &opts);
+    if (0 != ret) {
+        return ret;
+    }
+
+    // Check if the system has multiple NUMA nodes
+    if (-1 == numa_available()) {
+        std::cerr << "NUMA is not available on this system!" << std::endl;
+        return 1;
+    }
+
+    int num_of_numa_nodes = numa_num_configured_nodes();
+
+    if (num_of_numa_nodes < 2) {
+        std::cerr << "System has fewer than 2 NUMA nodes. Benchmark is not applicable." << std::endl;
+        return 1;
+    }
+
+    // Run the benchmark
+    for (int src_node = 0; src_node < num_of_numa_nodes; src_node++) {
+        if (!HasCPUsForNumaNode(src_node)) {
+            // Skip the NUMA node if there are no CPUs available
+            continue;
+        }
+
+        for (int dst_node = 0; dst_node < num_of_numa_nodes; dst_node++) {
+            if (src_node == dst_node) {
+                // Skip the same NUMA node
+                continue;
+            }
+
+            if (!HasCPUsForNumaNode(dst_node)) {
+                // Skip the NUMA node if there are no CPUs available
+                continue;
+            }
+
+            double time_used_ns = RunCPUCopyBenchmark(src_node, dst_node, opts);
+            double bw = opts.size / (time_used_ns / 1e9) / 1e6; // MB/s
+            double latency = time_used_ns / opts.size;          // ns/byte
+
+            // Output the result
+            std::cout << "cpu_copy_bw/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9) << bw
+                      << std::endl;
+            std::cout << "cpu_copy_latency/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9)
+                      << latency << std::endl;
+        }
+    }
+
+    return 0;
+}
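For reference, the registered 'cpu-copy-bw' benchmark can be driven the same way as the other micro-benchmark examples in this series (compare examples/benchmarks/nvbandwidth.py in patch 3 below). A minimal sketch only: Platform.CPU and the example file placement are assumptions, not part of this patch.

    from superbench.benchmarks import BenchmarkRegistry, Platform
    from superbench.common.utils import logger

    if __name__ == '__main__':
        # 'cpu-copy-bw' is the name passed to register_benchmark() above.
        context = BenchmarkRegistry.create_benchmark_context(
            'cpu-copy-bw',
            platform=Platform.CPU,
            parameters='--size 268435456 --num_warm_up 20 --num_loops 100',
        )
        benchmark = BenchmarkRegistry.launch_benchmark(context)
        if benchmark:
            logger.info(
                'benchmark: {}, return code: {}, result: {}'.format(
                    benchmark.name, benchmark.return_code, benchmark.result
                )
            )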
From 4c9546ca4562a5914c3f9a62d78e0bb6edbf5877 Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Wed, 30 Oct 2024 11:43:48 -0700
Subject: [PATCH 2/6] Revert "Init cpu copy."

This reverts commit 3459eacad4ef4bc0bc193bbe51d05d87a8fbf2d7.
---
 .../micro_benchmarks/cpu_copy_performance.py  | 113 -------
 .../cpu_copy_performance/CMakeLists.txt       |  44 ---
 .../cpu_copy_performance/cpu_copy.cu          | 289 ------------------
 3 files changed, 446 deletions(-)
 delete mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
 delete mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
 delete mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
From fea87c9dacffa416056cf6b146df75ad7d23541b Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Sat, 30 Nov 2024 12:50:47 -0800
Subject: [PATCH 3/6] Fix bug: nvbandwidth benchmark needs to handle 'N/A' values in nvbandwidth output.

---
 examples/benchmarks/nvbandwidth.py            |  2 +-
 .../micro_benchmarks/nvbandwidth.py           | 26 ++++++++++-------
 superbench/config/default.yaml                | 16 ++++++++++
 .../micro_benchmarks/test_nvbandwidth.py      |  2 +-
 4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/examples/benchmarks/nvbandwidth.py b/examples/benchmarks/nvbandwidth.py
index 45b836734..c7d020e38 100644
--- a/examples/benchmarks/nvbandwidth.py
+++ b/examples/benchmarks/nvbandwidth.py
@@ -13,7 +13,7 @@
 if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(
         'nvbandwidth',
-        platform=Platform.CPU,
+        platform=Platform.CUDA,
         parameters=(
             '--buffer_size 128 '
             '--test_cases 0,1,19,20 '
diff --git a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
index 81a032195..2da4eda2a 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -38,13 +38,11 @@ def add_parser_arguments(self):
 
         self._parser.add_argument(
             '--test_cases',
+            nargs='+',
             type=str,
-            default='',
+            default=[],
             required=False,
-            help=(
-                'Specify the test case(s) to run, either by name or index. By default, all test cases are executed. '
-                'Example: --test_cases 0,1,2,19,20'
-            ),
+            help='Specify the test case(s) to run, either by name or index. By default, all test cases are executed.',
         )
 
         self._parser.add_argument(
@@ -92,7 +90,7 @@ def _preprocess(self):
         command += f' --bufferSize {self._args.buffer_size}'
 
         if self._args.test_cases:
-            command += ' --testcase ' + ' '.join([testcase.strip() for testcase in self._args.test_cases.split(',')])
+            command += ' --testcase ' + ' '.join(self._args.test_cases)
 
         if self._args.skip_verification:
             command += ' --skipVerification'
@@ -157,21 +155,29 @@ def _process_raw_line(self, line, parse_status):
         if parse_status['test_name'] and parse_status['benchmark_type'] and matrix_row_pattern.match(line):
             row_data = line.split()
             row_index = row_data[0]
+
             for col_index, value in enumerate(row_data[1:], start=1):
+                # Skip 'N/A' values
+                if value == 'N/A':
+                    continue
+
                 col_header = parse_status['matrix_header'][col_index - 1]
                 test_name = parse_status['test_name']
                 benchmark_type = parse_status['benchmark_type']
                 metric_name = f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'
                 parse_status['results'][metric_name] = float(value)
+
             return
 
         # Parse summary results
         summary_match = summary_pattern.search(line)
         if summary_match:
-            value = float(summary_match.group(2))
-            test_name = parse_status['test_name']
-            benchmark_type = parse_status['benchmark_type']
-            parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = value
+            value = summary_match.group(2)
+            # Skip 'N/A' values
+            if value != 'N/A':
+                test_name = parse_status['test_name']
+                benchmark_type = parse_status['benchmark_type']
+                parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = float(value)
 
         # Reset parsing state for next test
         parse_status['test_name'] = ''
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 601136e9f..fdf758632 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -134,6 +134,22 @@ superbench:
         copy_type:
           - sm
           - dma
+    nvbandwidth:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        buffer_size: 128
+        test_cases:
+          - host_to_device_memcpy_ce
+          - device_to_host_memcpy_ce
+          - host_to_device_memcpy_sm
+          - device_to_host_memcpy_sm
+        num_loops: 6
+        skip_verification: false
+        disable_affinity: false
+        use_mean: false
     kernel-launch:
       <<: *default_local_mode
     gemm-flops:
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
index f6c82a030..f32f0f50d 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
@@ -34,7 +34,7 @@ def test_nvbandwidth_preprocess(self):
         # Test preprocess with specified parameters
         parameters = (
             '--buffer_size 256 '
-            '--test_cases 0,1,2,19,20 '
+            '--test_cases 0 1 2 19 20 '
            '--skip_verification '
             '--disable_affinity '
             '--use_mean '
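To make the new 'N/A' handling concrete, here is a small standalone sketch of the matrix-row parsing with hypothetical output values (the real logic lives in _process_raw_line and takes the column labels from parse_status['matrix_header']):

    # Hypothetical matrix row: row index followed by one value per column.
    row_data = '0 371.84 N/A 372.10'.split()
    results = {}
    for col_index, value in enumerate(row_data[1:], start=1):
        if value == 'N/A':  # self-to-self paths are reported as N/A and skipped
            continue
        results[f'cpu{row_data[0]}_gpu{col_index - 1}'] = float(value)
    print(results)  # {'cpu0_gpu0': 371.84, 'cpu0_gpu2': 372.1}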
From d444c630c04b22bacca56725c11d37117eb3e4c6 Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Wed, 4 Dec 2024 17:32:12 -0800
Subject: [PATCH 4/6] Fix comments.

---
 superbench/benchmarks/micro_benchmarks/nvbandwidth.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
index 2da4eda2a..1f7fa9544 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -42,7 +42,11 @@ def add_parser_arguments(self):
             type=str,
             default=[],
             required=False,
-            help='Specify the test case(s) to run, either by name or index. By default, all test cases are executed.',
+            help=(
+                'Specify the test case(s) to execute, either by name or index. '
+                'To view the available test case names or indices, run the command nvbandwidth on the host. '
+                'If no specific test case is specified, all test cases will be executed by default.'
+            ),
         )
 
         self._parser.add_argument(

From c7d7efff92fd1c4e3e7ec483680a946a105e605a Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Thu, 5 Dec 2024 13:44:39 -0800
Subject: [PATCH 5/6] Fix comments.

---
 examples/benchmarks/nvbandwidth.py                    | 2 +-
 superbench/benchmarks/micro_benchmarks/nvbandwidth.py | 2 --
 tests/benchmarks/micro_benchmarks/test_nvbandwidth.py | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/benchmarks/nvbandwidth.py b/examples/benchmarks/nvbandwidth.py
index c7d020e38..e89edfbfc 100644
--- a/examples/benchmarks/nvbandwidth.py
+++ b/examples/benchmarks/nvbandwidth.py
@@ -16,7 +16,7 @@
         platform=Platform.CUDA,
         parameters=(
             '--buffer_size 128 '
-            '--test_cases 0,1,19,20 '
+            '--test_cases 0 1 19 20 '
             '--skip_verification '
             '--disable_affinity '
             '--use_mean '
diff --git a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
index 1f7fa9544..110966b86 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -159,7 +159,6 @@ def _process_raw_line(self, line, parse_status):
         if parse_status['test_name'] and parse_status['benchmark_type'] and matrix_row_pattern.match(line):
             row_data = line.split()
             row_index = row_data[0]
-
             for col_index, value in enumerate(row_data[1:], start=1):
                 # Skip 'N/A' values
                 if value == 'N/A':
@@ -170,7 +169,6 @@ def _process_raw_line(self, line, parse_status):
                 benchmark_type = parse_status['benchmark_type']
                 metric_name = f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'
                 parse_status['results'][metric_name] = float(value)
-
             return
 
         # Parse summary results
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
index f32f0f50d..630e77179 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
@@ -34,7 +34,7 @@ def test_nvbandwidth_preprocess(self):
         # Test preprocess with specified parameters
         parameters = (
             '--buffer_size 256 '
-            '--test_cases 0  1 2 19 20 '
+            '--test_cases 0 1 2 19 20 '
             '--skip_verification '
             '--disable_affinity '
             '--use_mean '
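The switch to nargs='+' above means argparse collects the space-separated test cases directly into a list, which _preprocess() then joins back for the command line. A quick sketch of the behavior:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--test_cases', nargs='+', type=str, default=[], required=False)
    args = parser.parse_args('--test_cases 0 1 19 20'.split())
    print(args.test_cases)            # ['0', '1', '19', '20']
    print(' '.join(args.test_cases))  # '0 1 19 20', as appended after '--testcase'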
From bd6aab25e52b8dbd7db66747b7299c013ec33edd Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Fri, 13 Dec 2024 10:39:25 -0800
Subject: [PATCH 6/6] Add function to handle waived and unsupported test
 cases.

---
 examples/benchmarks/nvbandwidth.py            |   2 +-
 .../micro_benchmarks/nvbandwidth.py           | 128 +++++++++++-----
 .../micro_benchmarks/test_nvbandwidth.py      |   4 +-
 3 files changed, 96 insertions(+), 38 deletions(-)

diff --git a/examples/benchmarks/nvbandwidth.py b/examples/benchmarks/nvbandwidth.py
index e89edfbfc..afdb46ddf 100644
--- a/examples/benchmarks/nvbandwidth.py
+++ b/examples/benchmarks/nvbandwidth.py
@@ -16,7 +16,7 @@
         platform=Platform.CUDA,
         parameters=(
             '--buffer_size 128 '
-            '--test_cases 0 1 19 20 '
+            '--test_cases host_to_device_memcpy_ce device_to_host_bidirectional_memcpy_ce '
             '--skip_verification '
             '--disable_affinity '
             '--use_mean '
diff --git a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
index 110966b86..2f6a9c3c0 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -4,15 +4,23 @@
 """Module of the NV Bandwidth Test."""
 
 import os
+import subprocess
 import re
 
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
 
 
 class NvBandwidthBenchmark(MicroBenchmarkWithInvoke):
     """The NV Bandwidth Test benchmark class."""
+    # Regular expressions for summary line and matrix header detection
+    re_block_start_pattern = re.compile(r'^Running\s+(.+)$')
+    re_matrix_header_line = re.compile(r'^(memcpy|memory latency)')
+    re_matrix_row_pattern = re.compile(r'^\s*\d')
+    re_summary_pattern = re.compile(r'SUM (\S+) (\d+\.\d+)')
+    re_unsupported_pattern = re.compile(r'ERROR: Testcase (\S+) not found!')
+
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -43,8 +51,8 @@ def add_parser_arguments(self):
             default=[],
             required=False,
             help=(
-                'Specify the test case(s) to execute, either by name or index. '
-                'To view the available test case names or indices, run the command nvbandwidth on the host. '
+                'Specify the test case(s) to execute by name only. '
+                'To view the available test case names, run the command "nvbandwidth -l" on the host. '
                 'If no specific test case is specified, all test cases will be executed by default.'
             ),
         )
 
         self._parser.add_argument(
@@ -95,6 +103,8 @@ def _preprocess(self):
 
         if self._args.test_cases:
             command += ' --testcase ' + ' '.join(self._args.test_cases)
+        else:
+            self._args.test_cases = self._get_all_test_cases()
 
         if self._args.skip_verification:
             command += ' --skipVerification'
@@ -113,78 +123,79 @@ def _preprocess(self):
         return True
 
     def _process_raw_line(self, line, parse_status):
-        """Process a single line of raw output from the nvbandwidth benchmark.
-
-        This function updates the `parse_status` dictionary with parsed results from the given `line`.
-        It detects the start of a test, parses matrix headers and rows, and extracts summary results.
+        """Process a raw line of text and update the parse status accordingly.
 
         Args:
-            line (str): A single line of raw output from the benchmark.
-            parse_status (dict): A dictionary to maintain the current parsing state and results. It should contain:
-                - 'test_name' (str): The name of the current test being parsed.
-                - 'benchmark_type' (str): 'bw' or 'lat'. It also indicates if matrix data is being parsed.
-                - 'matrix_header' (list): The header of the matrix being parsed.
-                - 'results' (dict): A dictionary to store the parsed results.
+            line (str): The raw line of text to be processed.
+            parse_status (dict): A dictionary containing the current parsing status,
+                which will be updated based on the content of the line.
 
         Returns:
             None
         """
-        # Regular expressions for summary line and matrix header detection
-        block_start_pattern = re.compile(r'^Running\s+(.+)$')
-        summary_pattern = re.compile(r'SUM (\S+) (\d+\.\d+)')
-        matrix_header_line = re.compile(r'^(memcpy|memory latency)')
-        matrix_row_pattern = re.compile(r'^\s*\d')
-
         line = line.strip()
 
+        # Detect unsupported test cases
+        if self.re_unsupported_pattern.match(line):
+            parse_status['unsupported_testcases'].add(self.re_unsupported_pattern.match(line).group(1).lower())
+            return
+
         # Detect the start of a test
-        if block_start_pattern.match(line):
-            parse_status['test_name'] = block_start_pattern.match(line).group(1).lower()[:-1]
+        if self.re_block_start_pattern.match(line):
+            parse_status['test_name'] = self.re_block_start_pattern.match(line).group(1).lower()[:-1]
+            parse_status['executed_testcases'].add(parse_status['test_name'])
             return
 
         # Detect the start of matrix data
-        if parse_status['test_name'] and matrix_header_line.match(line):
+        if parse_status['test_name'] and self.re_matrix_header_line.match(line):
             parse_status['benchmark_type'] = 'bw' if 'bandwidth' in line else 'lat'
+            # Parse the row and column names
+            tmp_idx = line.find('(row)')
+            parse_status['matrix_row'] = line[tmp_idx - 3:tmp_idx].lower()
+            tmp_idx = line.find('(column)')
+            parse_status['matrix_col'] = line[tmp_idx - 3:tmp_idx].lower()
             return
 
         # Parse the matrix header
         if (
             parse_status['test_name'] and parse_status['benchmark_type'] and not parse_status['matrix_header']
-            and matrix_row_pattern.match(line)
+            and self.re_matrix_row_pattern.match(line)
         ):
             parse_status['matrix_header'] = line.split()
             return
 
         # Parse matrix rows
-        if parse_status['test_name'] and parse_status['benchmark_type'] and matrix_row_pattern.match(line):
+        if parse_status['test_name'] and parse_status['benchmark_type'] and self.re_matrix_row_pattern.match(line):
             row_data = line.split()
             row_index = row_data[0]
             for col_index, value in enumerate(row_data[1:], start=1):
-                # Skip 'N/A' values
+                # Skip 'N/A' values; 'N/A' indicates a self-to-self path.
                 if value == 'N/A':
                     continue
 
                 col_header = parse_status['matrix_header'][col_index - 1]
                 test_name = parse_status['test_name']
                 benchmark_type = parse_status['benchmark_type']
-                metric_name = f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'
+                row_name = parse_status['matrix_row']
+                col_name = parse_status['matrix_col']
+                metric_name = f'{test_name}_{row_name}{row_index}_{col_name}{col_header}_{benchmark_type}'
                 parse_status['results'][metric_name] = float(value)
             return
 
         # Parse summary results
-        summary_match = summary_pattern.search(line)
-        if summary_match:
-            value = summary_match.group(2)
-            # Skip 'N/A' values
-            if value != 'N/A':
-                test_name = parse_status['test_name']
-                benchmark_type = parse_status['benchmark_type']
-                parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = float(value)
+        if self.re_summary_pattern.match(line):
+            value = self.re_summary_pattern.match(line).group(2)
+            test_name = parse_status['test_name']
+            benchmark_type = parse_status['benchmark_type']
+            parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = float(value)
 
         # Reset parsing state for next test
         parse_status['test_name'] = ''
         parse_status['benchmark_type'] = None
         parse_status['matrix_header'].clear()
+        parse_status['matrix_row'] = ''
+        parse_status['matrix_col'] = ''
+        return
 
     def _process_raw_result(self, cmd_idx, raw_output):
         """Function to parse raw results and save the summarized results.
@@ -203,22 +214,45 @@ def _process_raw_result(self, cmd_idx, raw_output):
             content = raw_output.splitlines()
             parsing_status = {
                 'results': {},
+                'executed_testcases': set(),
+                'unsupported_testcases': set(),
                 'benchmark_type': None,
                 'matrix_header': [],
                 'test_name': '',
+                'matrix_row': '',
+                'matrix_col': '',
             }
 
             for line in content:
                 self._process_raw_line(line, parsing_status)
 
+            return_code = ReturnCode.SUCCESS
+            # Log unsupported test cases
+            for testcase in parsing_status['unsupported_testcases']:
+                logger.warning(f'Test case {testcase} is not supported.')
+                return_code = ReturnCode.INVALID_ARGUMENT
+                self._result.add_raw_data(testcase, 'Not supported', self._args.log_raw_data)
+
+            # Check if the test case was waived
+            for testcase in self._args.test_cases:
+                if (
+                    testcase not in parsing_status['unsupported_testcases']
+                    and testcase not in parsing_status['executed_testcases']
+                ):
+                    logger.warning(f'Test case {testcase} was waived.')
+                    self._result.add_raw_data(testcase, 'waived', self._args.log_raw_data)
+                    return_code = ReturnCode.INVALID_ARGUMENT
+
             if not parsing_status['results']:
                 self._result.add_raw_data('nvbandwidth', 'No valid results found', self._args.log_raw_data)
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
                 return False
 
             # Store parsed results
             for metric, value in parsing_status['results'].items():
                 self._result.add_result(metric, value)
 
+            self._result.set_return_code(return_code)
             return True
         except Exception as e:
             logger.error(
                 'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                     self._curr_run_index, self._name, raw_output, str(e)
                 )
             )
             self._result.add_result('abort', 1)
             return False
 
+    @staticmethod
+    def _get_all_test_cases():
+        """Return the list of all test case names reported by `nvbandwidth -l`."""
+        command = 'nvbandwidth -l'
+        test_case_pattern = re.compile(r'(\d+),\s+([\w_]+):')
+
+        try:
+            # Execute the command and capture output
+            result = subprocess.run(command, shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            # Check the return code
+            if result.returncode != 0:
+                logger.error(f'{command} failed with return code {result.returncode}')
+                return []
+
+            if result.stderr:
+                logger.error(f'{command} failed with {result.stderr}')
+                return []
+
+            # Parse the output
+            return [name for _, name in test_case_pattern.findall(result.stdout)]
+        except Exception as e:
+            logger.error(f'Failed to get all test case names: {e}')
+            return []
+
 
 BenchmarkRegistry.register_benchmark('nvbandwidth', NvBandwidthBenchmark, platform=Platform.CUDA)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
index 630e77179..7edf69d5f 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
@@ -34,7 +34,7 @@ def test_nvbandwidth_preprocess(self):
         # Test preprocess with specified parameters
         parameters = (
             '--buffer_size 256 '
-            '--test_cases 0 1 2 19 20 '
+            '--test_cases host_to_device_memcpy_ce device_to_host_bidirectional_memcpy_ce '
             '--skip_verification '
             '--disable_affinity '
             '--use_mean '
@@ -47,7 +47,7 @@ def test_nvbandwidth_preprocess(self):
         # Check command
         assert (1 == len(benchmark._commands))
         assert ('--bufferSize 256' in benchmark._commands[0])
-        assert ('--testcase 0 1 2 19 20' in benchmark._commands[0])
+        assert ('--testcase host_to_device_memcpy_ce device_to_host_bidirectional_memcpy_ce' in benchmark._commands[0])
         assert ('--skipVerification' in benchmark._commands[0])
         assert ('--disableAffinity' in benchmark._commands[0])
         assert ('--useMean' in benchmark._commands[0])
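For reference, _get_all_test_cases() assumes that 'nvbandwidth -l' lists each case as '<index>, <name>:'; the sample listing below is hypothetical and only illustrates what the regex extracts:

    import re

    test_case_pattern = re.compile(r'(\d+),\s+([\w_]+):')
    sample_output = (
        '0, host_to_device_memcpy_ce:\n'
        '      Host to device memcpy using the copy engine\n'
        '1, device_to_host_memcpy_ce:\n'
        '      Device to host memcpy using the copy engine\n'
    )
    names = [name for _, name in test_case_pattern.findall(sample_output)]
    print(names)  # ['host_to_device_memcpy_ce', 'device_to_host_memcpy_ce']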