From 3459eacad4ef4bc0bc193bbe51d05d87a8fbf2d7 Mon Sep 17 00:00:00 2001
From: hongtaozhang <hongtaozhang@microsoft.com>
Date: Wed, 30 Oct 2024 11:40:19 -0700
Subject: [PATCH 1/6] Init cpu copy.

---
 .../micro_benchmarks/cpu_copy_performance.py  | 113 +++++++
 .../cpu_copy_performance/CMakeLists.txt       |  44 +++
 .../cpu_copy_performance/cpu_copy.cu          | 289 ++++++++++++++++++
 3 files changed, 446 insertions(+)
 create mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
 create mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
 create mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu

diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
new file mode 100644
index 000000000..3b4d52c6d
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
@@ -0,0 +1,113 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the GPU Copy Bandwidth Performance benchmark."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class CpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
+    """The CPU copy bandwidth performance benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'cpu_copy'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            default=256 * 1024**2,
+            required=False,
+            help='Size of data buffer in bytes.',
+        )
+
+        self._parser.add_argument(
+            '--num_warm_up',
+            type=int,
+            default=20,
+            required=False,
+            help='Number of warm up rounds',
+        )
+
+        self._parser.add_argument(
+            '--num_loops',
+            type=int,
+            default=100,
+            required=False,
+            help='Number of data buffer copies performed.',
+        )
+
+        self._parser.add_argument(
+            '--check_data',
+            action='store_true',
+            help='Enable data checking',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        # TODO: enable hugepages?
+
+        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
+
+        args = '--size %d --num_warm_up %d --num_loops %d' % (
+            self._args.size, self._args.num_warm_up, self._args.num_loops
+        )
+
+        if self._args.check_data:
+            args += ' --check_data'
+
+        self._commands = ['%s %s' % (self.__bin_path, args)]
+
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
+
+        try:
+            for output_line in raw_output.strip().splitlines():
+                self._result.add_result(output_line.strip())
+        except BaseException as e:
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
+                    self._curr_run_index, self._name, raw_output, str(e)
+                )
+            )
+            return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('cpu-copy-bw', CpuCopyBwBenchmark)
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
new file mode 100644
index 000000000..2929afa50
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.18)
+
+project(cpu_copy LANGUAGES CXX)
+
+find_package(CUDAToolkit QUIET)
+
+# Cuda environment
+if(CUDAToolkit_FOUND)
+    message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
+
+    include(../cuda_common.cmake)
+    add_executable(cpu_copy cpu_copy.cu)
+    set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
+    target_link_libraries(cpu_copy numa)
+else()
+    # ROCm environment
+    include(../rocm_common.cmake)
+    find_package(hip QUIET)
+    if(hip_FOUND)
+        message(STATUS "Found ROCm: " ${HIP_VERSION})
+
+        # Convert cuda code to hip code in cpp
+        execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
+
+        # link hip device lib
+        add_executable(cpu_copy cpu_copy.cpp)
+
+        include(CheckSymbolExists)
+        check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
+        if(${HIP_UNCACHED_MEMORY})
+            target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
+        endif()
+
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+        target_link_libraries(cpu_copy numa hip::device)
+    else()
+        message(FATAL_ERROR "No CUDA or ROCm environment found.")
+    endif()
+endif()
+
+install(TARGETS cpu_copy RUNTIME DESTINATION bin)
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
new file mode 100644
index 000000000..0c205a3d5
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
@@ -0,0 +1,289 @@
+#include <chrono>
+#include <cstring> // for memcpy
+#include <getopt.h>
+#include <iomanip> // for setting precision
+#include <iostream>
+#include <numa.h>
+#include <numeric>
+#include <vector>
+
+// Options accepted by this program.
+struct Opts {
+    // Data buffer size for copy benchmark.
+    uint64_t size = 0;
+
+    // Number of warm up rounds to run.
+    uint64_t num_warm_up = 0;
+
+    // Number of loops to run.
+    uint64_t num_loops = 0;
+
+    // Whether check data after copy.
+    bool check_data = false;
+};
+
+/**
+ * @brief Print the usage instructions for this program.
+ *
+ * This function outputs the correct way to execute the program,
+ * including any necessary command-line arguments and their descriptions.
+ */
+void PrintUsage() {
+    std::cout << "Usage: gpu_copy "
+              << "--size <size> "
+              << "--num_warm_up <num_warm_up> "
+              << "--num_loops <num_loops> "
+              << "[--check_data]" << std::endl;
+}
+
+/**
+ * @brief Checks if the system has CPUs available for a given NUMA node.
+ *
+ * This function determines whether there are CPUs available for the specified
+ * NUMA (Non-Uniform Memory Access) node. NUMA nodes are used in systems with
+ * multiple processors to optimize memory access times.
+ *
+ * @param node_id The identifier of the NUMA node to check.
+ * @return true if there are CPUs available for the specified NUMA node, false otherwise.
+ */
+bool HasCPUsForNumaNode(int node) {
+    struct bitmask *bm = numa_allocate_nodemask();
+    std::vector<int> cpus;
+    if (numa_node_to_cpus(node, bm) < 0) {
+        perror("numa_node_to_cpus");
+        numa_bitmask_free(bm);
+        return false; // On error
+    }
+
+    for (int i = 0; i < numa_bitmask_weight(bm); i++) {
+        if (numa_bitmask_isbitset(bm, i)) {
+            numa_bitmask_free(bm);
+            return true;
+        }
+    }
+    return false;
+}
+
+/**
+ * @brief Parses command-line options for the CPU copy performance benchmark.
+ *
+ * This function processes the command-line arguments provided to the benchmark
+ * and sets the appropriate configuration options based on the input.
+ *
+ * @param argc The number of command-line arguments.
+ * @param argv The array of command-line arguments.
+ * @return An integer indicating the success or failure of the option parsing.
+ *         Returns 0 on success, and a non-zero value on failure.
+ */
+/**/
+int ParseOpts(int argc, char **argv, Opts *opts) {
+    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
+    const struct option options[] = {{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
+                                     {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
+                                     {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
+                                     {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)}};
+    int getopt_ret = 0;
+    int opt_idx = 0;
+    bool size_specified = false;
+    bool num_warm_up_specified = false;
+    bool num_loops_specified = false;
+    bool parse_err = false;
+
+    while (true) {
+        getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
+        if (getopt_ret == -1) {
+            if (!size_specified || !num_warm_up_specified || !num_loops_specified) {
+                parse_err = true;
+            }
+            break;
+        } else if (getopt_ret == '?') {
+            parse_err = true;
+            break;
+        }
+        switch (opt_idx) {
+        case static_cast<int>(OptIdx::kSize):
+            if (1 != sscanf(optarg, "%lu", &(opts->size))) {
+                std::cerr << "Invalid size: " << optarg << std::endl;
+                parse_err = true;
+            } else {
+                size_specified = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kNumWarmUp):
+            if (1 != sscanf(optarg, "%lu", &(opts->num_warm_up))) {
+                std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
+                parse_err = true;
+            } else {
+                num_warm_up_specified = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kNumLoops):
+            if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) {
+                std::cerr << "Invalid num_loops: " << optarg << std::endl;
+                parse_err = true;
+            } else {
+                num_loops_specified = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kEnableCheckData):
+            opts->check_data = true;
+            break;
+        default:
+            parse_err = true;
+        }
+        if (parse_err) {
+            break;
+        }
+    }
+
+    if (parse_err) {
+        PrintUsage();
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * @brief Benchmark the memory copy performance between two NUMA nodes.
+ *
+ * This function measures the performance of copying memory from a source NUMA node to a destination NUMA node.
+ *
+ * @param src_node The source NUMA node from which memory will be copied.
+ * @param dst_node The destination NUMA node to which memory will be copied.
+ * @param opts A reference to an Opts structure containing various options and configurations for the benchmark.
+ * @return The performance metric of the memory copy operation, typically in terms of bandwidth or latency.
+ */
+double BenchmarkNUMACopy(int src_node, int dst_node, Opts &opts) {
+    int ret = 0;
+
+    // Set CPU affinity to the source NUMA node
+    ret = numa_run_on_node(src_node);
+    if (ret != 0) {
+        std::cerr << "Failed to set CPU affinity to NUMA node " << src_node << std::endl;
+        return 0;
+    }
+
+    // Allocate memory on the source and destination NUMA nodes
+    char *src = (char *)numa_alloc_onnode(opts.size, src_node);
+    if (!src) {
+        std::cerr << "Memory allocation failed on node" << src_node << std::endl;
+        return 0;
+    }
+
+    char *dst = (char *)numa_alloc_onnode(opts.size, dst_node);
+    if (!dst) {
+        std::cerr << "Memory allocation failed on node" << dst_node << std::endl;
+        return 0;
+    }
+
+    // Initialize the source memory with some data
+    memset(src, 1, opts.size);
+
+    // Measure the time taken for memcpy between nodes
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // Perform the memory copy
+    memcpy(dst, src, opts.size);
+
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;
+
+    // Calculate the latency (nanoseconds per byte)
+    double total_time_ns = diff.count() * 1e9; // Convert seconds to nanoseconds
+
+    // Free the allocated memory
+    numa_free(src, opts.size);
+    numa_free(dst, opts.size);
+
+    if (opts.check_data) {
+        // Check the data integrity after the copy
+        if (memcmp(src, dst, opts.size) != 0) {
+            std::cerr << "Data integrity check failed!" << dst_node << std::endl;
+
+            return -1;
+        }
+    }
+
+    return total_time_ns;
+}
+
+/**
+ * @brief Runs the CPU copy benchmark between all pairs of NUMA nodes.
+ *
+ * This function runs the CPU copy benchmark between all pairs of NUMA nodes in the system.
+ * It calculates the average bandwidth and latency for each pair of nodes and outputs the results.
+ *
+ * @param src_node The source NUMA node from which data will be copied.
+ * @param dst_node The destination NUMA node to which data will be copied.
+ * @param opts A reference to an Opts object containing various options and configurations for the benchmark.
+ */
+double RunCPUCopyBenchmark(int src_node, int dst_node, Opts &opts) {
+    double max_time_ns = 0;
+
+    // Run warm up rounds
+    for (int i = 0; i < opts.num_warm_up; i++) {
+        BenchmarkNUMACopy(src_node, dst_node, opts);
+    }
+
+    for (int i = 0; i < opts.num_loops; i++) {
+        double time_used_ns = BenchmarkNUMACopy(src_node, dst_node, opts);
+        max_time_ns = std::max(max_time_ns, time_used_ns);
+    }
+
+    return max_time_ns;
+}
+
+int main(int argc, char **argv) {
+    Opts opts;
+    int ret = -1;
+    ret = ParseOpts(argc, argv, &opts);
+    if (0 != ret) {
+        return ret;
+    }
+
+    // Check if the system has multiple NUMA nodes
+    if (-1 == numa_available()) {
+        std::cerr << "NUMA is not available on this system!" << std::endl;
+        return 1;
+    }
+
+    int num_of_numa_nodes = numa_num_configured_nodes();
+
+    if (num_of_numa_nodes < 2) {
+        std::cerr << "System has less than 2 NUMA nodes. Benchmark is not applicable." << std::endl;
+        return 1;
+    }
+
+    // Run the benchmark
+    for (int src_node = 0; src_node < num_of_numa_nodes; src_node++) {
+        if (!HasCPUsForNumaNode(src_node)) {
+            // Skip the NUMA node if there are no CPUs available
+            continue;
+        }
+
+        for (int dst_node = 0; dst_node < num_of_numa_nodes; dst_node++) {
+            if (src_node == dst_node) {
+                // Skip the same NUMA node
+                continue;
+            }
+
+            if (!HasCPUsForNumaNode(dst_node)) {
+                // Skip the NUMA node if there are no CPUs available
+                continue;
+            }
+
+            double time_used_ns = RunCPUCopyBenchmark(src_node, dst_node, opts);
+            double bw = opts.size / (time_used_ns / 1e9) / 1e6; // MB/s
+            double latency = time_used_ns / opts.size;          // ns/byte
+
+            // Output the result
+            std::cout << "cpu_copy_bw/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9) << bw
+                      << std::endl;
+            std::cout << "cpu_copy_latency/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9)
+                      << latency << std::endl;
+        }
+    }
+
+    return 0;
+}

From 4c9546ca4562a5914c3f9a62d78e0bb6edbf5877 Mon Sep 17 00:00:00 2001
From: hongtaozhang <hongtaozhang@microsoft.com>
Date: Wed, 30 Oct 2024 11:43:48 -0700
Subject: [PATCH 2/6] Revert "Init cpu copy."

This reverts commit 3459eacad4ef4bc0bc193bbe51d05d87a8fbf2d7.
---
 .../micro_benchmarks/cpu_copy_performance.py  | 113 -------
 .../cpu_copy_performance/CMakeLists.txt       |  44 ---
 .../cpu_copy_performance/cpu_copy.cu          | 289 ------------------
 3 files changed, 446 deletions(-)
 delete mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
 delete mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
 delete mode 100644 superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu

diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
deleted file mode 100644
index 3b4d52c6d..000000000
--- a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-"""Module of the GPU Copy Bandwidth Performance benchmark."""
-
-import os
-
-from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, ReturnCode
-from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
-
-
-class CpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
-    """The CPU copy bandwidth performance benchmark class."""
-    def __init__(self, name, parameters=''):
-        """Constructor.
-
-        Args:
-            name (str): benchmark name.
-            parameters (str): benchmark parameters.
-        """
-        super().__init__(name, parameters)
-
-        self._bin_name = 'cpu_copy'
-
-    def add_parser_arguments(self):
-        """Add the specified arguments."""
-        super().add_parser_arguments()
-
-        self._parser.add_argument(
-            '--size',
-            type=int,
-            default=256 * 1024**2,
-            required=False,
-            help='Size of data buffer in bytes.',
-        )
-
-        self._parser.add_argument(
-            '--num_warm_up',
-            type=int,
-            default=20,
-            required=False,
-            help='Number of warm up rounds',
-        )
-
-        self._parser.add_argument(
-            '--num_loops',
-            type=int,
-            default=100,
-            required=False,
-            help='Number of data buffer copies performed.',
-        )
-
-        self._parser.add_argument(
-            '--check_data',
-            action='store_true',
-            help='Enable data checking',
-        )
-
-    def _preprocess(self):
-        """Preprocess/preparation operations before the benchmarking.
-
-        Return:
-            True if _preprocess() succeed.
-        """
-        if not super()._preprocess():
-            return False
-
-        # TODO: enable hugepages?
-
-        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
-
-        args = '--size %d --num_warm_up %d --num_loops %d' % (
-            self._args.size, self._args.num_warm_up, self._args.num_loops
-        )
-
-        if self._args.check_data:
-            args += ' --check_data'
-
-        self._commands = ['%s %s' % (self.__bin_path, args)]
-
-        return True
-
-    def _process_raw_result(self, cmd_idx, raw_output):
-        """Function to parse raw results and save the summarized results.
-
-          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
-
-        Args:
-            cmd_idx (int): the index of command corresponding with the raw_output.
-            raw_output (str): raw output string of the micro-benchmark.
-
-        Return:
-            True if the raw output string is valid and result can be extracted.
-        """
-        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
-
-        try:
-            for output_line in raw_output.strip().splitlines():
-                self._result.add_result(output_line.strip())
-        except BaseException as e:
-            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
-            logger.error(
-                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
-                    self._curr_run_index, self._name, raw_output, str(e)
-                )
-            )
-            return False
-
-        return True
-
-
-BenchmarkRegistry.register_benchmark('cpu-copy-bw', CpuCopyBwBenchmark)
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
deleted file mode 100644
index 2929afa50..000000000
--- a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-cmake_minimum_required(VERSION 3.18)
-
-project(cpu_copy LANGUAGES CXX)
-
-find_package(CUDAToolkit QUIET)
-
-# Cuda environment
-if(CUDAToolkit_FOUND)
-    message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
-
-    include(../cuda_common.cmake)
-    add_executable(cpu_copy cpu_copy.cu)
-    set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
-    target_link_libraries(cpu_copy numa)
-else()
-    # ROCm environment
-    include(../rocm_common.cmake)
-    find_package(hip QUIET)
-    if(hip_FOUND)
-        message(STATUS "Found ROCm: " ${HIP_VERSION})
-
-        # Convert cuda code to hip code in cpp
-        execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
-
-        # link hip device lib
-        add_executable(cpu_copy cpu_copy.cpp)
-
-        include(CheckSymbolExists)
-        check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
-        if(${HIP_UNCACHED_MEMORY})
-            target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
-        endif()
-
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
-        target_link_libraries(cpu_copy numa hip::device)
-    else()
-        message(FATAL_ERROR "No CUDA or ROCm environment found.")
-    endif()
-endif()
-
-install(TARGETS cpu_copy RUNTIME DESTINATION bin)
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
deleted file mode 100644
index 0c205a3d5..000000000
--- a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cu
+++ /dev/null
@@ -1,289 +0,0 @@
-#include <chrono>
-#include <cstring> // for memcpy
-#include <getopt.h>
-#include <iomanip> // for setting precision
-#include <iostream>
-#include <numa.h>
-#include <numeric>
-#include <vector>
-
-// Options accepted by this program.
-struct Opts {
-    // Data buffer size for copy benchmark.
-    uint64_t size = 0;
-
-    // Number of warm up rounds to run.
-    uint64_t num_warm_up = 0;
-
-    // Number of loops to run.
-    uint64_t num_loops = 0;
-
-    // Whether check data after copy.
-    bool check_data = false;
-};
-
-/**
- * @brief Print the usage instructions for this program.
- *
- * This function outputs the correct way to execute the program,
- * including any necessary command-line arguments and their descriptions.
- */
-void PrintUsage() {
-    std::cout << "Usage: gpu_copy "
-              << "--size <size> "
-              << "--num_warm_up <num_warm_up> "
-              << "--num_loops <num_loops> "
-              << "[--check_data]" << std::endl;
-}
-
-/**
- * @brief Checks if the system has CPUs available for a given NUMA node.
- *
- * This function determines whether there are CPUs available for the specified
- * NUMA (Non-Uniform Memory Access) node. NUMA nodes are used in systems with
- * multiple processors to optimize memory access times.
- *
- * @param node_id The identifier of the NUMA node to check.
- * @return true if there are CPUs available for the specified NUMA node, false otherwise.
- */
-bool HasCPUsForNumaNode(int node) {
-    struct bitmask *bm = numa_allocate_nodemask();
-    std::vector<int> cpus;
-    if (numa_node_to_cpus(node, bm) < 0) {
-        perror("numa_node_to_cpus");
-        numa_bitmask_free(bm);
-        return false; // On error
-    }
-
-    for (int i = 0; i < numa_bitmask_weight(bm); i++) {
-        if (numa_bitmask_isbitset(bm, i)) {
-            numa_bitmask_free(bm);
-            return true;
-        }
-    }
-    return false;
-}
-
-/**
- * @brief Parses command-line options for the CPU copy performance benchmark.
- *
- * This function processes the command-line arguments provided to the benchmark
- * and sets the appropriate configuration options based on the input.
- *
- * @param argc The number of command-line arguments.
- * @param argv The array of command-line arguments.
- * @return An integer indicating the success or failure of the option parsing.
- *         Returns 0 on success, and a non-zero value on failure.
- */
-/**/
-int ParseOpts(int argc, char **argv, Opts *opts) {
-    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
-    const struct option options[] = {{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
-                                     {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
-                                     {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
-                                     {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)}};
-    int getopt_ret = 0;
-    int opt_idx = 0;
-    bool size_specified = false;
-    bool num_warm_up_specified = false;
-    bool num_loops_specified = false;
-    bool parse_err = false;
-
-    while (true) {
-        getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
-        if (getopt_ret == -1) {
-            if (!size_specified || !num_warm_up_specified || !num_loops_specified) {
-                parse_err = true;
-            }
-            break;
-        } else if (getopt_ret == '?') {
-            parse_err = true;
-            break;
-        }
-        switch (opt_idx) {
-        case static_cast<int>(OptIdx::kSize):
-            if (1 != sscanf(optarg, "%lu", &(opts->size))) {
-                std::cerr << "Invalid size: " << optarg << std::endl;
-                parse_err = true;
-            } else {
-                size_specified = true;
-            }
-            break;
-        case static_cast<int>(OptIdx::kNumWarmUp):
-            if (1 != sscanf(optarg, "%lu", &(opts->num_warm_up))) {
-                std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
-                parse_err = true;
-            } else {
-                num_warm_up_specified = true;
-            }
-            break;
-        case static_cast<int>(OptIdx::kNumLoops):
-            if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) {
-                std::cerr << "Invalid num_loops: " << optarg << std::endl;
-                parse_err = true;
-            } else {
-                num_loops_specified = true;
-            }
-            break;
-        case static_cast<int>(OptIdx::kEnableCheckData):
-            opts->check_data = true;
-            break;
-        default:
-            parse_err = true;
-        }
-        if (parse_err) {
-            break;
-        }
-    }
-
-    if (parse_err) {
-        PrintUsage();
-        return -1;
-    }
-
-    return 0;
-}
-
-/**
- * @brief Benchmark the memory copy performance between two NUMA nodes.
- *
- * This function measures the performance of copying memory from a source NUMA node to a destination NUMA node.
- *
- * @param src_node The source NUMA node from which memory will be copied.
- * @param dst_node The destination NUMA node to which memory will be copied.
- * @param opts A reference to an Opts structure containing various options and configurations for the benchmark.
- * @return The performance metric of the memory copy operation, typically in terms of bandwidth or latency.
- */
-double BenchmarkNUMACopy(int src_node, int dst_node, Opts &opts) {
-    int ret = 0;
-
-    // Set CPU affinity to the source NUMA node
-    ret = numa_run_on_node(src_node);
-    if (ret != 0) {
-        std::cerr << "Failed to set CPU affinity to NUMA node " << src_node << std::endl;
-        return 0;
-    }
-
-    // Allocate memory on the source and destination NUMA nodes
-    char *src = (char *)numa_alloc_onnode(opts.size, src_node);
-    if (!src) {
-        std::cerr << "Memory allocation failed on node" << src_node << std::endl;
-        return 0;
-    }
-
-    char *dst = (char *)numa_alloc_onnode(opts.size, dst_node);
-    if (!dst) {
-        std::cerr << "Memory allocation failed on node" << dst_node << std::endl;
-        return 0;
-    }
-
-    // Initialize the source memory with some data
-    memset(src, 1, opts.size);
-
-    // Measure the time taken for memcpy between nodes
-    auto start = std::chrono::high_resolution_clock::now();
-
-    // Perform the memory copy
-    memcpy(dst, src, opts.size);
-
-    auto end = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> diff = end - start;
-
-    // Calculate the latency (nanoseconds per byte)
-    double total_time_ns = diff.count() * 1e9; // Convert seconds to nanoseconds
-
-    // Free the allocated memory
-    numa_free(src, opts.size);
-    numa_free(dst, opts.size);
-
-    if (opts.check_data) {
-        // Check the data integrity after the copy
-        if (memcmp(src, dst, opts.size) != 0) {
-            std::cerr << "Data integrity check failed!" << dst_node << std::endl;
-
-            return -1;
-        }
-    }
-
-    return total_time_ns;
-}
-
-/**
- * @brief Runs the CPU copy benchmark between all pairs of NUMA nodes.
- *
- * This function runs the CPU copy benchmark between all pairs of NUMA nodes in the system.
- * It calculates the average bandwidth and latency for each pair of nodes and outputs the results.
- *
- * @param src_node The source NUMA node from which data will be copied.
- * @param dst_node The destination NUMA node to which data will be copied.
- * @param opts A reference to an Opts object containing various options and configurations for the benchmark.
- */
-double RunCPUCopyBenchmark(int src_node, int dst_node, Opts &opts) {
-    double max_time_ns = 0;
-
-    // Run warm up rounds
-    for (int i = 0; i < opts.num_warm_up; i++) {
-        BenchmarkNUMACopy(src_node, dst_node, opts);
-    }
-
-    for (int i = 0; i < opts.num_loops; i++) {
-        double time_used_ns = BenchmarkNUMACopy(src_node, dst_node, opts);
-        max_time_ns = std::max(max_time_ns, time_used_ns);
-    }
-
-    return max_time_ns;
-}
-
-int main(int argc, char **argv) {
-    Opts opts;
-    int ret = -1;
-    ret = ParseOpts(argc, argv, &opts);
-    if (0 != ret) {
-        return ret;
-    }
-
-    // Check if the system has multiple NUMA nodes
-    if (-1 == numa_available()) {
-        std::cerr << "NUMA is not available on this system!" << std::endl;
-        return 1;
-    }
-
-    int num_of_numa_nodes = numa_num_configured_nodes();
-
-    if (num_of_numa_nodes < 2) {
-        std::cerr << "System has less than 2 NUMA nodes. Benchmark is not applicable." << std::endl;
-        return 1;
-    }
-
-    // Run the benchmark
-    for (int src_node = 0; src_node < num_of_numa_nodes; src_node++) {
-        if (!HasCPUsForNumaNode(src_node)) {
-            // Skip the NUMA node if there are no CPUs available
-            continue;
-        }
-
-        for (int dst_node = 0; dst_node < num_of_numa_nodes; dst_node++) {
-            if (src_node == dst_node) {
-                // Skip the same NUMA node
-                continue;
-            }
-
-            if (!HasCPUsForNumaNode(dst_node)) {
-                // Skip the NUMA node if there are no CPUs available
-                continue;
-            }
-
-            double time_used_ns = RunCPUCopyBenchmark(src_node, dst_node, opts);
-            double bw = opts.size / (time_used_ns / 1e9) / 1e6; // MB/s
-            double latency = time_used_ns / opts.size;          // ns/byte
-
-            // Output the result
-            std::cout << "cpu_copy_bw/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9) << bw
-                      << std::endl;
-            std::cout << "cpu_copy_latency/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9)
-                      << latency << std::endl;
-        }
-    }
-
-    return 0;
-}

From 820907ba902e2a4e95b5c2db3a7899632232b963 Mon Sep 17 00:00:00 2001
From: hongtaozhang <hongtaozhang@microsoft.com>
Date: Sat, 16 Nov 2024 00:22:13 -0800
Subject: [PATCH 3/6] Add nvbandwidth build.

---
 .gitmodules             |  3 +++
 third_party/Makefile    | 17 +++++++++++++++--
 third_party/nvbandwidth |  1 +
 3 files changed, 19 insertions(+), 2 deletions(-)
 create mode 160000 third_party/nvbandwidth

diff --git a/.gitmodules b/.gitmodules
index 339520d19..9be41b59b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -30,3 +30,6 @@
 [submodule "third_party/Megatron/Megatron-DeepSpeed"]
 	path = third_party/Megatron/Megatron-DeepSpeed
 	url = https://github.com/microsoft/Megatron-DeepSpeed.git
+[submodule "third_party/nvbandwidth"]
+	path = third_party/nvbandwidth
+	url = https://github.com/NVIDIA/nvbandwidth.git
diff --git a/third_party/Makefile b/third_party/Makefile
index 63ca48f36..39333b1b3 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -16,12 +16,12 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
 
 NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
 
-.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
+.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth
 
 # Build targets.
 all: cuda rocm
 cuda_with_msccl: cuda cuda_msccl
-cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
+cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
 cpu: common cpu_perftest cpu_stream
 common: fio
@@ -239,3 +239,16 @@ ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
 	mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
 	cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
 endif
+
+# Build nvbandwidth.
+REQUIRED_CMAKE_VERSION := 3.2.0
+CMAKE_VERSION := $(shell cmake --version 2>/dev/null | head -n 1 | sed 's/[^0-9.]*\([0-9.]*\).*/\1/')
+nvbandwidth: sb_micro_path
+	@if [ "$(CMAKE_VERSION)" = "" ]; then \
+        echo "Skip nvbandwidth for CMake is not installed."; \
+    elif [ "$(shell printf '%s\n' "$(REQUIRED_CMAKE_VERSION)" "$(CMAKE_VERSION)" | sort -V | head -n 1)" = "$(REQUIRED_CMAKE_VERSION)" ]; then \
+        cd ./nvbandwidth && cmake . && make && cd ..; \
+        cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin; \
+    else \
+        echo "Skip nvbandwidth for CMake version $(CMAKE_VERSION) is too old. Required version is $(REQUIRED_CMAKE_VERSION) or higher."; \
+    fi
diff --git a/third_party/nvbandwidth b/third_party/nvbandwidth
new file mode 160000
index 000000000..445d8aef7
--- /dev/null
+++ b/third_party/nvbandwidth
@@ -0,0 +1 @@
+Subproject commit 445d8aef742e8a48a69779a939996f9e8863df9d

From 3a51ad5434260671f4e6367964b729f85da25a08 Mon Sep 17 00:00:00 2001
From: hongtaozhang <hongtaozhang@microsoft.com>
Date: Sat, 16 Nov 2024 20:01:53 -0800
Subject: [PATCH 4/6] Fix required cmake version.

---
 third_party/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index 39333b1b3..645138c9a 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -241,7 +241,7 @@ ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
 endif
 
 # Build nvbandwidth.
-REQUIRED_CMAKE_VERSION := 3.2.0
+REQUIRED_CMAKE_VERSION := 3.20.0
 CMAKE_VERSION := $(shell cmake --version 2>/dev/null | head -n 1 | sed 's/[^0-9.]*\([0-9.]*\).*/\1/')
 nvbandwidth: sb_micro_path
 	@if [ "$(CMAKE_VERSION)" = "" ]; then \

From 1f7c8e40f552fbe3f605e70c0eaf2cf37434f621 Mon Sep 17 00:00:00 2001
From: hongtaozhang <hongtaozhang@microsoft.com>
Date: Tue, 19 Nov 2024 22:56:01 -0800
Subject: [PATCH 5/6] Remove cmake version verification.

---
 dockerfile/cuda12.4.dockerfile |  2 +-
 third_party/Makefile           | 15 ++++-----------
 2 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile
index 560f0908a..3cd70da13 100644
--- a/dockerfile/cuda12.4.dockerfile
+++ b/dockerfile/cuda12.4.dockerfile
@@ -156,7 +156,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl
+RUN make -C third_party cuda_with_msccl_and_nvbandwidth
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==65.7 && \
diff --git a/third_party/Makefile b/third_party/Makefile
index 645138c9a..7f76b293c 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -20,8 +20,9 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
 
 # Build targets.
 all: cuda rocm
+cuda_with_msccl_and_nvbandwidth: cuda cuda_msccl nvbandwidth
 cuda_with_msccl: cuda cuda_msccl
-cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
+cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
 cpu: common cpu_perftest cpu_stream
 common: fio
@@ -241,14 +242,6 @@ ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
 endif
 
 # Build nvbandwidth.
-REQUIRED_CMAKE_VERSION := 3.20.0
-CMAKE_VERSION := $(shell cmake --version 2>/dev/null | head -n 1 | sed 's/[^0-9.]*\([0-9.]*\).*/\1/')
 nvbandwidth: sb_micro_path
-	@if [ "$(CMAKE_VERSION)" = "" ]; then \
-        echo "Skip nvbandwidth for CMake is not installed."; \
-    elif [ "$(shell printf '%s\n' "$(REQUIRED_CMAKE_VERSION)" "$(CMAKE_VERSION)" | sort -V | head -n 1)" = "$(REQUIRED_CMAKE_VERSION)" ]; then \
-        cd ./nvbandwidth && cmake . && make && cd ..; \
-        cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin; \
-    else \
-        echo "Skip nvbandwidth for CMake version $(CMAKE_VERSION) is too old. Required version is $(REQUIRED_CMAKE_VERSION) or higher."; \
-    fi
+	cd ./nvbandwidth && cmake . && make && cd ..
+	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin

From cf84e5d4aebef62f1cdf704e4b3254f4336619f7 Mon Sep 17 00:00:00 2001
From: hongtaozhang <hongtaozhang@microsoft.com>
Date: Wed, 20 Nov 2024 11:53:51 -0800
Subject: [PATCH 6/6] Exclude nvbandwidth with -o.

---
 dockerfile/cuda11.1.1.dockerfile | 2 +-
 dockerfile/cuda12.4.dockerfile   | 2 +-
 third_party/Makefile             | 3 +--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
index 54577fdad..69044c440 100644
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -149,7 +149,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda
+RUN make -C third_party cuda -o nvbandwidth
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==65.7 && \
diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile
index 3cd70da13..560f0908a 100644
--- a/dockerfile/cuda12.4.dockerfile
+++ b/dockerfile/cuda12.4.dockerfile
@@ -156,7 +156,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl_and_nvbandwidth
+RUN make -C third_party cuda_with_msccl
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==65.7 && \
diff --git a/third_party/Makefile b/third_party/Makefile
index 7f76b293c..8117fc10b 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -20,9 +20,8 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
 
 # Build targets.
 all: cuda rocm
-cuda_with_msccl_and_nvbandwidth: cuda cuda_msccl nvbandwidth
 cuda_with_msccl: cuda cuda_msccl
-cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
+cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
 cpu: common cpu_perftest cpu_stream
 common: fio