From 3459eacad4ef4bc0bc193bbe51d05d87a8fbf2d7 Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Wed, 30 Oct 2024 11:40:19 -0700
Subject: [PATCH 1/6] Init cpu copy.

---
 .../micro_benchmarks/cpu_copy_performance.py  | 113 +++++++
 .../cpu_copy_performance/CMakeLists.txt       |  44 +++
 .../cpu_copy_performance/cpu_copy.cu          | 289 ++++++++++++++++++
 3 files changed, 446 insertions(+)
 create mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
 create mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
 create mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu

diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
new file mode 100644
index 000000000..3b4d52c6d
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
@@ -0,0 +1,113 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the CPU Copy Bandwidth Performance benchmark."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class CpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
+    """The CPU copy bandwidth performance benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'cpu_copy'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            default=256 * 1024**2,
+            required=False,
+            help='Size of data buffer in bytes.',
+        )
+
+        self._parser.add_argument(
+            '--num_warm_up',
+            type=int,
+            default=20,
+            required=False,
+            help='Number of warm up rounds.',
+        )
+
+        self._parser.add_argument(
+            '--num_loops',
+            type=int,
+            default=100,
+            required=False,
+            help='Number of data buffer copies performed.',
+        )
+
+        self._parser.add_argument(
+            '--check_data',
+            action='store_true',
+            help='Enable data checking.',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeeds.
+        """
+        if not super()._preprocess():
+            return False
+
+        # TODO: enable hugepages?
+
+        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
+
+        args = '--size %d --num_warm_up %d --num_loops %d' % (
+            self._args.size, self._args.num_warm_up, self._args.num_loops
+        )
+
+        if self._args.check_data:
+            args += ' --check_data'
+
+        self._commands = ['%s %s' % (self.__bin_path, args)]
+
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
+
+        try:
+            for output_line in raw_output.strip().splitlines():
+                self._result.add_result(output_line.strip())
+        except BaseException as e:
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
+                    self._curr_run_index, self._name, raw_output, str(e)
+                )
+            )
+            return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('cpu-copy-bw', CpuCopyBwBenchmark)
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
new file mode 100644
index 000000000..2929afa50
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.18)
+
+project(cpu_copy LANGUAGES CXX)
+
+find_package(CUDAToolkit QUIET)
+
+# CUDA environment
+if(CUDAToolkit_FOUND)
+    message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
+
+    include(../cuda_common.cmake)
+    add_executable(cpu_copy cpu_copy.cu)
+    set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
+    target_link_libraries(cpu_copy numa)
+else()
+    # ROCm environment
+    include(../rocm_common.cmake)
+    find_package(hip QUIET)
+    if(hip_FOUND)
+        message(STATUS "Found ROCm: " ${HIP_VERSION})
+
+        # Convert CUDA code to HIP code in cpp
+        execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
+
+        # Link HIP device lib
+        add_executable(cpu_copy cpu_copy.cpp)
+
+        include(CheckSymbolExists)
+        check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
+        if(${HIP_UNCACHED_MEMORY})
+            target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
+        endif()
+
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+        target_link_libraries(cpu_copy numa hip::device)
+    else()
+        message(FATAL_ERROR "No CUDA or ROCm environment found.")
+    endif()
+endif()
+
+install(TARGETS cpu_copy RUNTIME DESTINATION bin)
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
new file mode 100644
index 000000000..0c205a3d5
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
@@ -0,0 +1,289 @@
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <cstring> // for memcpy
+#include <iomanip> // for setting precision
+#include <iostream>
+
+#include <getopt.h>
+#include <numa.h>
+#include <stdio.h>
+
+// Options accepted by this program.
+struct Opts {
+    // Data buffer size for copy benchmark.
+    uint64_t size = 0;
+
+    // Number of warm up rounds to run.
+    uint64_t num_warm_up = 0;
+
+    // Number of loops to run.
+    uint64_t num_loops = 0;
+
+    // Whether to check data after copy.
+    bool check_data = false;
+};
+
+/**
+ * @brief Print the usage instructions for this program.
+ *
+ * This function outputs the correct way to execute the program,
+ * including any necessary command-line arguments and their descriptions.
+ */
+void PrintUsage() {
+    std::cout << "Usage: cpu_copy "
+              << "--size <size> "
+              << "--num_warm_up <num_warm_up> "
+              << "--num_loops <num_loops> "
+              << "[--check_data]" << std::endl;
+}
+
+/**
+ * @brief Checks if the system has CPUs available for a given NUMA node.
+ *
+ * This function determines whether there are CPUs available for the specified
+ * NUMA (Non-Uniform Memory Access) node. NUMA nodes are used in systems with
+ * multiple processors to optimize memory access times.
+ *
+ * @param node The identifier of the NUMA node to check.
+ * @return true if there are CPUs available for the specified NUMA node, false otherwise.
+ */
+bool HasCPUsForNumaNode(int node) {
+    struct bitmask *bm = numa_allocate_cpumask();
+    if (numa_node_to_cpus(node, bm) < 0) {
+        perror("numa_node_to_cpus");
+        numa_bitmask_free(bm);
+        return false; // On error
+    }
+
+    // The node has usable CPUs if any bit in its CPU mask is set.
+    bool has_cpus = numa_bitmask_weight(bm) > 0;
+    numa_bitmask_free(bm);
+    return has_cpus;
+}
+
+/**
+ * @brief Parses command-line options for the CPU copy performance benchmark.
+ *
+ * This function processes the command-line arguments provided to the benchmark
+ * and sets the appropriate configuration options based on the input.
+ *
+ * @param argc The number of command-line arguments.
+ * @param argv The array of command-line arguments.
+ * @param opts Pointer to the Opts structure that receives the parsed options.
+ * @return An integer indicating the success or failure of the option parsing.
+ *         Returns 0 on success, and a non-zero value on failure.
+ */
+int ParseOpts(int argc, char **argv, Opts *opts) {
+    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
+    const struct option options[] = {
+        {"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
+        {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
+        {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
+        {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)},
+        {nullptr, 0, nullptr, 0}};
+    int getopt_ret = 0;
+    int opt_idx = 0;
+    bool size_specified = false;
+    bool num_warm_up_specified = false;
+    bool num_loops_specified = false;
+    bool parse_err = false;
+
+    while (true) {
+        getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
+        if (getopt_ret == -1) {
+            if (!size_specified || !num_warm_up_specified || !num_loops_specified) {
+                parse_err = true;
+            }
+            break;
+        } else if (getopt_ret == '?') {
+            parse_err = true;
+            break;
+        }
+        switch (opt_idx) {
+        case static_cast<int>(OptIdx::kSize):
+            if (1 != sscanf(optarg, "%lu", &(opts->size))) {
+                std::cerr << "Invalid size: " << optarg << std::endl;
+                parse_err = true;
+            } else {
+                size_specified = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kNumWarmUp):
+            if (1 != sscanf(optarg, "%lu", &(opts->num_warm_up))) {
+                std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
+                parse_err = true;
+            } else {
+                num_warm_up_specified = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kNumLoops):
+            if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) {
+                std::cerr << "Invalid num_loops: " << optarg << std::endl;
+                parse_err = true;
+            } else {
+                num_loops_specified = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kEnableCheckData):
+            opts->check_data = true;
+            break;
+        default:
+            parse_err = true;
+        }
+        if (parse_err) {
+            break;
+        }
+    }
+
+    if (parse_err) {
+        PrintUsage();
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * @brief Benchmark the memory copy performance between two NUMA nodes.
+ *
+ * This function measures the performance of copying memory from a source NUMA node to a destination NUMA node.
+ *
+ * @param src_node The source NUMA node from which memory will be copied.
+ * @param dst_node The destination NUMA node to which memory will be copied.
+ * @param opts A reference to an Opts structure containing various options and configurations for the benchmark.
+ * @return The time taken by the copy in nanoseconds, 0 on setup failure, or -1 if the data check fails.
+ */
+double BenchmarkNUMACopy(int src_node, int dst_node, Opts &opts) {
+    int ret = 0;
+
+    // Set CPU affinity to the source NUMA node
+    ret = numa_run_on_node(src_node);
+    if (ret != 0) {
+        std::cerr << "Failed to set CPU affinity to NUMA node " << src_node << std::endl;
+        return 0;
+    }
+
+    // Allocate memory on the source and destination NUMA nodes
+    char *src = (char *)numa_alloc_onnode(opts.size, src_node);
+    if (!src) {
+        std::cerr << "Memory allocation failed on node " << src_node << std::endl;
+        return 0;
+    }
+
+    char *dst = (char *)numa_alloc_onnode(opts.size, dst_node);
+    if (!dst) {
+        std::cerr << "Memory allocation failed on node " << dst_node << std::endl;
+        numa_free(src, opts.size);
+        return 0;
+    }
+
+    // Initialize the source memory with some data
+    memset(src, 1, opts.size);
+
+    // Measure the time taken for memcpy between nodes
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // Perform the memory copy
+    memcpy(dst, src, opts.size);
+
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;
+
+    // Calculate the total copy time in nanoseconds
+    double total_time_ns = diff.count() * 1e9; // Convert seconds to nanoseconds
+
+    if (opts.check_data) {
+        // Check the data integrity after the copy, before the buffers are freed
+        if (memcmp(src, dst, opts.size) != 0) {
+            std::cerr << "Data integrity check failed for copy from node " << src_node << " to node " << dst_node
+                      << std::endl;
+            numa_free(src, opts.size);
+            numa_free(dst, opts.size);
+            return -1;
+        }
+    }
+
+    // Free the allocated memory
+    numa_free(src, opts.size);
+    numa_free(dst, opts.size);
+
+    return total_time_ns;
+}
+
+/**
+ * @brief Runs the CPU copy benchmark between a pair of NUMA nodes.
+ *
+ * This function runs the warm up rounds followed by the timed copy loops between the given source and
+ * destination NUMA nodes, and returns the maximum copy time observed across the timed loops.
+ *
+ * @param src_node The source NUMA node from which data will be copied.
+ * @param dst_node The destination NUMA node to which data will be copied.
+ * @param opts A reference to an Opts object containing various options and configurations for the benchmark.
+ * @return The maximum copy time in nanoseconds observed across the timed loops.
+ */
+double RunCPUCopyBenchmark(int src_node, int dst_node, Opts &opts) {
+    double max_time_ns = 0;
+
+    // Run warm up rounds
+    for (uint64_t i = 0; i < opts.num_warm_up; i++) {
+        BenchmarkNUMACopy(src_node, dst_node, opts);
+    }
+
+    for (uint64_t i = 0; i < opts.num_loops; i++) {
+        double time_used_ns = BenchmarkNUMACopy(src_node, dst_node, opts);
+        max_time_ns = std::max(max_time_ns, time_used_ns);
+    }
+
+    return max_time_ns;
+}
+
+int main(int argc, char **argv) {
+    Opts opts;
+    int ret = -1;
+    ret = ParseOpts(argc, argv, &opts);
+    if (0 != ret) {
+        return ret;
+    }
+
+    // Check if the system has multiple NUMA nodes
+    if (-1 == numa_available()) {
+        std::cerr << "NUMA is not available on this system!" << std::endl;
+        return 1;
+    }
+
+    int num_of_numa_nodes = numa_num_configured_nodes();
+
+    if (num_of_numa_nodes < 2) {
+        std::cerr << "System has fewer than 2 NUMA nodes. Benchmark is not applicable." << std::endl;
+        return 1;
+    }
+
+    // Run the benchmark
+    for (int src_node = 0; src_node < num_of_numa_nodes; src_node++) {
+        if (!HasCPUsForNumaNode(src_node)) {
+            // Skip the NUMA node if there are no CPUs available
+            continue;
+        }
+
+        for (int dst_node = 0; dst_node < num_of_numa_nodes; dst_node++) {
+            if (src_node == dst_node) {
+                // Skip the same NUMA node
+                continue;
+            }
+
+            if (!HasCPUsForNumaNode(dst_node)) {
+                // Skip the NUMA node if there are no CPUs available
+                continue;
+            }
+
+            double time_used_ns = RunCPUCopyBenchmark(src_node, dst_node, opts);
+            double bw = opts.size / (time_used_ns / 1e9) / 1e6; // MB/s
+            double latency = time_used_ns / opts.size;          // ns/byte
+
+            // Output the result
+            std::cout << "cpu_copy_bw/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9) << bw
+                      << std::endl;
+            std::cout << "cpu_copy_latency/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9)
+                      << latency << std::endl;
+        }
+    }
+
+    return 0;
+}
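For reference, the registered 'cpu-copy-bw' benchmark can be driven the same way as the other micro-benchmark examples in this series (compare examples/benchmarks/nvbandwidth.py in patch 3 below). A minimal sketch only: Platform.CPU and the example file placement are assumptions, not part of this patch.

    from superbench.benchmarks import BenchmarkRegistry, Platform
    from superbench.common.utils import logger

    if __name__ == '__main__':
        # 'cpu-copy-bw' is the name passed to register_benchmark() above.
        context = BenchmarkRegistry.create_benchmark_context(
            'cpu-copy-bw',
            platform=Platform.CPU,
            parameters='--size 268435456 --num_warm_up 20 --num_loops 100',
        )
        benchmark = BenchmarkRegistry.launch_benchmark(context)
        if benchmark:
            logger.info(
                'benchmark: {}, return code: {}, result: {}'.format(
                    benchmark.name, benchmark.return_code, benchmark.result
                )
            )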
From 4c9546ca4562a5914c3f9a62d78e0bb6edbf5877 Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Wed, 30 Oct 2024 11:43:48 -0700
Subject: [PATCH 2/6] Revert "Init cpu copy."

This reverts commit 3459eacad4ef4bc0bc193bbe51d05d87a8fbf2d7.
---
 .../micro_benchmarks/cpu_copy_performance.py  | 113 -------
 .../cpu_copy_performance/CMakeLists.txt       |  44 ---
 .../cpu_copy_performance/cpu_copy.cu          | 289 ------------------
 3 files changed, 446 deletions(-)
 delete mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
 delete mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
 delete mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
From fea87c9dacffa416056cf6b146df75ad7d23541b Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Sat, 30 Nov 2024 12:50:47 -0800
Subject: [PATCH 3/6] Fix bug: nvbandwidth benchmark needs to handle 'N/A' values in nvbandwidth output.

---
 examples/benchmarks/nvbandwidth.py            |  2 +-
 .../micro_benchmarks/nvbandwidth.py           | 26 ++++++++++-------
 superbench/config/default.yaml                | 16 ++++++++++
 .../micro_benchmarks/test_nvbandwidth.py      |  2 +-
 4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/examples/benchmarks/nvbandwidth.py b/examples/benchmarks/nvbandwidth.py
index 45b836734..c7d020e38 100644
--- a/examples/benchmarks/nvbandwidth.py
+++ b/examples/benchmarks/nvbandwidth.py
@@ -13,7 +13,7 @@
 if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(
         'nvbandwidth',
-        platform=Platform.CPU,
+        platform=Platform.CUDA,
         parameters=(
             '--buffer_size 128 '
             '--test_cases 0,1,19,20 '
diff --git a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
index 81a032195..2da4eda2a 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -38,13 +38,11 @@ def add_parser_arguments(self):
 
         self._parser.add_argument(
             '--test_cases',
+            nargs='+',
             type=str,
-            default='',
+            default=[],
             required=False,
-            help=(
-                'Specify the test case(s) to run, either by name or index. By default, all test cases are executed. '
-                'Example: --test_cases 0,1,2,19,20'
-            ),
+            help='Specify the test case(s) to run, either by name or index. By default, all test cases are executed.',
         )
 
         self._parser.add_argument(
@@ -92,7 +90,7 @@ def _preprocess(self):
         command += f' --bufferSize {self._args.buffer_size}'
 
         if self._args.test_cases:
-            command += ' --testcase ' + ' '.join([testcase.strip() for testcase in self._args.test_cases.split(',')])
+            command += ' --testcase ' + ' '.join(self._args.test_cases)
 
         if self._args.skip_verification:
             command += ' --skipVerification'
@@ -157,21 +155,29 @@ def _process_raw_line(self, line, parse_status):
         if parse_status['test_name'] and parse_status['benchmark_type'] and matrix_row_pattern.match(line):
             row_data = line.split()
             row_index = row_data[0]
+
             for col_index, value in enumerate(row_data[1:], start=1):
+                # Skip 'N/A' values
+                if value == 'N/A':
+                    continue
+
                 col_header = parse_status['matrix_header'][col_index - 1]
                 test_name = parse_status['test_name']
                 benchmark_type = parse_status['benchmark_type']
                 metric_name = f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'
                 parse_status['results'][metric_name] = float(value)
+
             return
 
         # Parse summary results
         summary_match = summary_pattern.search(line)
         if summary_match:
-            value = float(summary_match.group(2))
-            test_name = parse_status['test_name']
-            benchmark_type = parse_status['benchmark_type']
-            parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = value
+            value = summary_match.group(2)
+            # Skip 'N/A' values
+            if value != 'N/A':
+                test_name = parse_status['test_name']
+                benchmark_type = parse_status['benchmark_type']
+                parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = float(value)
 
         # Reset parsing state for next test
         parse_status['test_name'] = ''
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 601136e9f..fdf758632 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -134,6 +134,22 @@ superbench:
         copy_type:
           - sm
           - dma
+    nvbandwidth:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        buffer_size: 128
+        test_cases:
+          - host_to_device_memcpy_ce
+          - device_to_host_memcpy_ce
+          - host_to_device_memcpy_sm
+          - device_to_host_memcpy_sm
+        num_loops: 6
+        skip_verification: false
+        disable_affinity: false
+        use_mean: false
     kernel-launch:
       <<: *default_local_mode
     gemm-flops:
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
index f6c82a030..f32f0f50d 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
@@ -34,7 +34,7 @@ def test_nvbandwidth_preprocess(self):
         # Test preprocess with specified parameters
         parameters = (
             '--buffer_size 256 '
-            '--test_cases 0,1,2,19,20 '
+            '--test_cases 0 1 2 19 20 '
            '--skip_verification '
             '--disable_affinity '
             '--use_mean '
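To make the new 'N/A' handling concrete, here is a small standalone sketch of the matrix-row parsing with hypothetical output values (the real logic lives in _process_raw_line and takes the column labels from parse_status['matrix_header']):

    # Hypothetical matrix row: row index followed by one value per column.
    row_data = '0 371.84 N/A 372.10'.split()
    results = {}
    for col_index, value in enumerate(row_data[1:], start=1):
        if value == 'N/A':  # self-to-self paths are reported as N/A and skipped
            continue
        results[f'cpu{row_data[0]}_gpu{col_index - 1}'] = float(value)
    print(results)  # {'cpu0_gpu0': 371.84, 'cpu0_gpu2': 372.1}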
From d444c630c04b22bacca56725c11d37117eb3e4c6 Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Wed, 4 Dec 2024 17:32:12 -0800
Subject: [PATCH 4/6] Fix comments.

---
 superbench/benchmarks/micro_benchmarks/nvbandwidth.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
index 2da4eda2a..1f7fa9544 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -42,7 +42,11 @@ def add_parser_arguments(self):
             type=str,
             default=[],
             required=False,
-            help='Specify the test case(s) to run, either by name or index. By default, all test cases are executed.',
+            help=(
+                'Specify the test case(s) to execute, either by name or index. '
+                'To view the available test case names or indices, run the command nvbandwidth on the host. '
+                'If no specific test case is specified, all test cases will be executed by default.'
+            ),
         )
 
         self._parser.add_argument(

From c7d7efff92fd1c4e3e7ec483680a946a105e605a Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Thu, 5 Dec 2024 13:44:39 -0800
Subject: [PATCH 5/6] Fix comments.

---
 examples/benchmarks/nvbandwidth.py                    | 2 +-
 superbench/benchmarks/micro_benchmarks/nvbandwidth.py | 2 --
 tests/benchmarks/micro_benchmarks/test_nvbandwidth.py | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/benchmarks/nvbandwidth.py b/examples/benchmarks/nvbandwidth.py
index c7d020e38..e89edfbfc 100644
--- a/examples/benchmarks/nvbandwidth.py
+++ b/examples/benchmarks/nvbandwidth.py
@@ -16,7 +16,7 @@
         platform=Platform.CUDA,
         parameters=(
             '--buffer_size 128 '
-            '--test_cases 0,1,19,20 '
+            '--test_cases 0 1 19 20 '
             '--skip_verification '
             '--disable_affinity '
             '--use_mean '
diff --git a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
index 1f7fa9544..110966b86 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -159,7 +159,6 @@ def _process_raw_line(self, line, parse_status):
         if parse_status['test_name'] and parse_status['benchmark_type'] and matrix_row_pattern.match(line):
             row_data = line.split()
             row_index = row_data[0]
-
             for col_index, value in enumerate(row_data[1:], start=1):
                 # Skip 'N/A' values
                 if value == 'N/A':
@@ -170,7 +169,6 @@ def _process_raw_line(self, line, parse_status):
                 benchmark_type = parse_status['benchmark_type']
                 metric_name = f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'
                 parse_status['results'][metric_name] = float(value)
-
             return
 
         # Parse summary results
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
index f32f0f50d..630e77179 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
@@ -34,7 +34,7 @@ def test_nvbandwidth_preprocess(self):
         # Test preprocess with specified parameters
         parameters = (
             '--buffer_size 256 '
-            '--test_cases 0  1 2 19 20 '
+            '--test_cases 0 1 2 19 20 '
             '--skip_verification '
             '--disable_affinity '
             '--use_mean '
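The switch to nargs='+' above means argparse collects the space-separated test cases directly into a list, which _preprocess() then joins back for the command line. A quick sketch of the behavior:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--test_cases', nargs='+', type=str, default=[], required=False)
    args = parser.parse_args('--test_cases 0 1 19 20'.split())
    print(args.test_cases)            # ['0', '1', '19', '20']
    print(' '.join(args.test_cases))  # '0 1 19 20', as appended after '--testcase'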
From bd6aab25e52b8dbd7db66747b7299c013ec33edd Mon Sep 17 00:00:00 2001
From: hongtaozhang
Date: Fri, 13 Dec 2024 10:39:25 -0800
Subject: [PATCH 6/6] Add function to handle waived and unsupported test
 cases.

---
 examples/benchmarks/nvbandwidth.py            |   2 +-
 .../micro_benchmarks/nvbandwidth.py           | 128 +++++++++++-----
 .../micro_benchmarks/test_nvbandwidth.py      |   4 +-
 3 files changed, 96 insertions(+), 38 deletions(-)

diff --git a/examples/benchmarks/nvbandwidth.py b/examples/benchmarks/nvbandwidth.py
index e89edfbfc..afdb46ddf 100644
--- a/examples/benchmarks/nvbandwidth.py
+++ b/examples/benchmarks/nvbandwidth.py
@@ -16,7 +16,7 @@
         platform=Platform.CUDA,
         parameters=(
             '--buffer_size 128 '
-            '--test_cases 0 1 19 20 '
+            '--test_cases host_to_device_memcpy_ce device_to_host_bidirectional_memcpy_ce '
             '--skip_verification '
             '--disable_affinity '
             '--use_mean '
diff --git a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
index 110966b86..2f6a9c3c0 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -4,15 +4,23 @@
 """Module of the NV Bandwidth Test."""
 
 import os
+import subprocess
 import re
 
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
 
 
 class NvBandwidthBenchmark(MicroBenchmarkWithInvoke):
     """The NV Bandwidth Test benchmark class."""
+    # Regular expressions for summary line and matrix header detection
+    re_block_start_pattern = re.compile(r'^Running\s+(.+)$')
+    re_matrix_header_line = re.compile(r'^(memcpy|memory latency)')
+    re_matrix_row_pattern = re.compile(r'^\s*\d')
+    re_summary_pattern = re.compile(r'SUM (\S+) (\d+\.\d+)')
+    re_unsupported_pattern = re.compile(r'ERROR: Testcase (\S+) not found!')
+
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -43,8 +51,8 @@ def add_parser_arguments(self):
             default=[],
             required=False,
             help=(
-                'Specify the test case(s) to execute, either by name or index. '
-                'To view the available test case names or indices, run the command nvbandwidth on the host. '
+                'Specify the test case(s) to execute by name only. '
+                'To view the available test case names, run the command "nvbandwidth -l" on the host. '
                 'If no specific test case is specified, all test cases will be executed by default.'
             ),
         )
 
         self._parser.add_argument(
@@ -95,6 +103,8 @@ def _preprocess(self):
 
         if self._args.test_cases:
             command += ' --testcase ' + ' '.join(self._args.test_cases)
+        else:
+            self._args.test_cases = self._get_all_test_cases()
 
         if self._args.skip_verification:
             command += ' --skipVerification'
@@ -113,78 +123,79 @@ def _preprocess(self):
         return True
 
     def _process_raw_line(self, line, parse_status):
-        """Process a single line of raw output from the nvbandwidth benchmark.
-
-        This function updates the `parse_status` dictionary with parsed results from the given `line`.
-        It detects the start of a test, parses matrix headers and rows, and extracts summary results.
+        """Process a raw line of text and update the parse status accordingly.
 
         Args:
-            line (str): A single line of raw output from the benchmark.
-            parse_status (dict): A dictionary to maintain the current parsing state and results. It should contain:
-                - 'test_name' (str): The name of the current test being parsed.
-                - 'benchmark_type' (str): 'bw' or 'lat'. It also indicates if matrix data is being parsed.
-                - 'matrix_header' (list): The header of the matrix being parsed.
-                - 'results' (dict): A dictionary to store the parsed results.
+            line (str): The raw line of text to be processed.
+            parse_status (dict): A dictionary containing the current parsing status,
+                which will be updated based on the content of the line.
 
         Returns:
             None
         """
-        # Regular expressions for summary line and matrix header detection
-        block_start_pattern = re.compile(r'^Running\s+(.+)$')
-        summary_pattern = re.compile(r'SUM (\S+) (\d+\.\d+)')
-        matrix_header_line = re.compile(r'^(memcpy|memory latency)')
-        matrix_row_pattern = re.compile(r'^\s*\d')
-
         line = line.strip()
 
+        # Detect unsupported test cases
+        if self.re_unsupported_pattern.match(line):
+            parse_status['unsupported_testcases'].add(self.re_unsupported_pattern.match(line).group(1).lower())
+            return
+
         # Detect the start of a test
-        if block_start_pattern.match(line):
-            parse_status['test_name'] = block_start_pattern.match(line).group(1).lower()[:-1]
+        if self.re_block_start_pattern.match(line):
+            parse_status['test_name'] = self.re_block_start_pattern.match(line).group(1).lower()[:-1]
+            parse_status['executed_testcases'].add(parse_status['test_name'])
             return
 
         # Detect the start of matrix data
-        if parse_status['test_name'] and matrix_header_line.match(line):
+        if parse_status['test_name'] and self.re_matrix_header_line.match(line):
             parse_status['benchmark_type'] = 'bw' if 'bandwidth' in line else 'lat'
+            # Parse the row and column names
+            tmp_idx = line.find('(row)')
+            parse_status['matrix_row'] = line[tmp_idx - 3:tmp_idx].lower()
+            tmp_idx = line.find('(column)')
+            parse_status['matrix_col'] = line[tmp_idx - 3:tmp_idx].lower()
             return
 
         # Parse the matrix header
         if (
             parse_status['test_name'] and parse_status['benchmark_type'] and not parse_status['matrix_header']
-            and matrix_row_pattern.match(line)
+            and self.re_matrix_row_pattern.match(line)
         ):
             parse_status['matrix_header'] = line.split()
             return
 
         # Parse matrix rows
-        if parse_status['test_name'] and parse_status['benchmark_type'] and matrix_row_pattern.match(line):
+        if parse_status['test_name'] and parse_status['benchmark_type'] and self.re_matrix_row_pattern.match(line):
             row_data = line.split()
             row_index = row_data[0]
             for col_index, value in enumerate(row_data[1:], start=1):
-                # Skip 'N/A' values
+                # Skip 'N/A' values; 'N/A' indicates a self-to-self path.
                 if value == 'N/A':
                     continue
 
                 col_header = parse_status['matrix_header'][col_index - 1]
                 test_name = parse_status['test_name']
                 benchmark_type = parse_status['benchmark_type']
-                metric_name = f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'
+                row_name = parse_status['matrix_row']
+                col_name = parse_status['matrix_col']
+                metric_name = f'{test_name}_{row_name}{row_index}_{col_name}{col_header}_{benchmark_type}'
                 parse_status['results'][metric_name] = float(value)
             return
 
         # Parse summary results
-        summary_match = summary_pattern.search(line)
-        if summary_match:
-            value = summary_match.group(2)
-            # Skip 'N/A' values
-            if value != 'N/A':
-                test_name = parse_status['test_name']
-                benchmark_type = parse_status['benchmark_type']
-                parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = float(value)
+        if self.re_summary_pattern.match(line):
+            value = self.re_summary_pattern.match(line).group(2)
+            test_name = parse_status['test_name']
+            benchmark_type = parse_status['benchmark_type']
+            parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = float(value)
 
         # Reset parsing state for next test
         parse_status['test_name'] = ''
         parse_status['benchmark_type'] = None
         parse_status['matrix_header'].clear()
+        parse_status['matrix_row'] = ''
+        parse_status['matrix_col'] = ''
+        return
 
     def _process_raw_result(self, cmd_idx, raw_output):
         """Function to parse raw results and save the summarized results.
@@ -203,22 +214,45 @@ def _process_raw_result(self, cmd_idx, raw_output):
             content = raw_output.splitlines()
             parsing_status = {
                 'results': {},
+                'executed_testcases': set(),
+                'unsupported_testcases': set(),
                 'benchmark_type': None,
                 'matrix_header': [],
                 'test_name': '',
+                'matrix_row': '',
+                'matrix_col': '',
             }
 
             for line in content:
                 self._process_raw_line(line, parsing_status)
 
+            return_code = ReturnCode.SUCCESS
+            # Log unsupported test cases
+            for testcase in parsing_status['unsupported_testcases']:
+                logger.warning(f'Test case {testcase} is not supported.')
+                return_code = ReturnCode.INVALID_ARGUMENT
+                self._result.add_raw_data(testcase, 'Not supported', self._args.log_raw_data)
+
+            # Check if the test case was waived
+            for testcase in self._args.test_cases:
+                if (
+                    testcase not in parsing_status['unsupported_testcases']
+                    and testcase not in parsing_status['executed_testcases']
+                ):
+                    logger.warning(f'Test case {testcase} was waived.')
+                    self._result.add_raw_data(testcase, 'waived', self._args.log_raw_data)
+                    return_code = ReturnCode.INVALID_ARGUMENT
+
             if not parsing_status['results']:
                 self._result.add_raw_data('nvbandwidth', 'No valid results found', self._args.log_raw_data)
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
                 return False
 
             # Store parsed results
             for metric, value in parsing_status['results'].items():
                 self._result.add_result(metric, value)
 
+            self._result.set_return_code(return_code)
             return True
         except Exception as e:
             logger.error(
                 'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                     self._curr_run_index, self._name, raw_output, str(e)
                 )
             )
             self._result.add_result('abort', 1)
             return False
 
+    @staticmethod
+    def _get_all_test_cases():
+        """Return the list of all test case names reported by `nvbandwidth -l`."""
+        command = 'nvbandwidth -l'
+        test_case_pattern = re.compile(r'(\d+),\s+([\w_]+):')
+
+        try:
+            # Execute the command and capture output
+            result = subprocess.run(command, shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            # Check the return code
+            if result.returncode != 0:
+                logger.error(f'{command} failed with return code {result.returncode}')
+                return []
+
+            if result.stderr:
+                logger.error(f'{command} failed with {result.stderr}')
+                return []
+
+            # Parse the output
+            return [name for _, name in test_case_pattern.findall(result.stdout)]
+        except Exception as e:
+            logger.error(f'Failed to get all test case names: {e}')
+            return []
+
 
 BenchmarkRegistry.register_benchmark('nvbandwidth', NvBandwidthBenchmark, platform=Platform.CUDA)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
index 630e77179..7edf69d5f 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py
@@ -34,7 +34,7 @@ def test_nvbandwidth_preprocess(self):
         # Test preprocess with specified parameters
         parameters = (
             '--buffer_size 256 '
-            '--test_cases 0 1 2 19 20 '
+            '--test_cases host_to_device_memcpy_ce device_to_host_bidirectional_memcpy_ce '
             '--skip_verification '
             '--disable_affinity '
             '--use_mean '
@@ -47,7 +47,7 @@ def test_nvbandwidth_preprocess(self):
         # Check command
         assert (1 == len(benchmark._commands))
         assert ('--bufferSize 256' in benchmark._commands[0])
-        assert ('--testcase 0 1 2 19 20' in benchmark._commands[0])
+        assert ('--testcase host_to_device_memcpy_ce device_to_host_bidirectional_memcpy_ce' in benchmark._commands[0])
         assert ('--skipVerification' in benchmark._commands[0])
         assert ('--disableAffinity' in benchmark._commands[0])
         assert ('--useMean' in benchmark._commands[0])
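For reference, _get_all_test_cases() assumes that 'nvbandwidth -l' lists each case as '<index>, <name>:'; the sample listing below is hypothetical and only illustrates what the regex extracts:

    import re

    test_case_pattern = re.compile(r'(\d+),\s+([\w_]+):')
    sample_output = (
        '0, host_to_device_memcpy_ce:\n'
        '      Host to device memcpy using the copy engine\n'
        '1, device_to_host_memcpy_ce:\n'
        '      Device to host memcpy using the copy engine\n'
    )
    names = [name for _, name in test_case_pattern.findall(sample_output)]
    print(names)  # ['host_to_device_memcpy_ce', 'device_to_host_memcpy_ce']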