diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
new file mode 100644
index 000000000..ab05185c1
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/CMakeLists.txt
@@ -0,0 +1,49 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.18)
+
+project(cpu_copy LANGUAGES CXX)
+
+# Pick the shared toolchain settings (CUDA or ROCm) for the environment being built.
+find_package(CUDAToolkit QUIET)
+
+# Cuda environment
+if(CUDAToolkit_FOUND)
+    message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
+
+    include(../cuda_common.cmake)
+    add_executable(cpu_copy cpu_copy.cpp)
+    set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
+    target_link_libraries(cpu_copy numa)
+else()
+    # ROCm environment
+    include(../rocm_common.cmake)
+    find_package(hip QUIET)
+    if(hip_FOUND)
+        message(STATUS "Found ROCm: " ${HIP_VERSION})
+
+        # Convert cuda code to hip code in cpp. Only run the conversion when a CUDA source is
+        # actually present: the output path is the checked-in cpu_copy.cpp, which must not be
+        # rewritten by a conversion that has no input.
+        if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/cpu_copy.cu")
+            execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
+        endif()
+
+        # link hip device lib
+        add_executable(cpu_copy cpu_copy.cpp)
+
+        include(CheckSymbolExists)
+        check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
+        if(HIP_UNCACHED_MEMORY)
+            target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
+        endif()
+
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+        target_link_libraries(cpu_copy numa hip::device)
+    else()
+        message(FATAL_ERROR "No CUDA or ROCm environment found.")
+    endif()
+endif()
+
+install(TARGETS cpu_copy RUNTIME DESTINATION bin)
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cpp b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cpp
new file mode 100644
index 000000000..dbfeadd7a
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cpu_copy_performance/cpu_copy.cpp
@@ -0,0 +1,319 @@
+#include <chrono>
+#include <algorithm> // for std::max
+#include <cerrno>
+#include <cstdint>
+#include <cstdio> // for sscanf
+#include <cstring> // for memcpy
+#include <getopt.h>
+#include <iomanip> // for setting precision
+#include <iostream>
+#include <numa.h>
+
+// Options accepted by this program.
+struct Opts {
+    // Size in bytes of each of the two buffers (source and destination) used by the copy.
+    uint64_t size = 0;
+
+    // Number of untimed warm up rounds to run before measuring.
+    uint64_t num_warm_up = 0;
+
+    // Number of measured copy rounds to run.
+    uint64_t num_loops = 0;
+
+    // Whether to compare source and destination buffers after each copy.
+    bool check_data = false;
+};
+
+/**
+ * @brief Print the usage instructions for this program.
+ *
+ * Writes a one-line summary of the accepted command-line options to standard output.
+ * The three value options are mandatory; --check_data is an optional flag.
+ */
+void PrintUsage() {
+    std::cout << "Usage: cpu_copy "
+              << "--size <bytes> "
+              << "--num_warm_up <rounds> "
+              << "--num_loops <rounds> "
+              << "[--check_data]" << std::endl;
+}
+
+/**
+ * @brief Checks whether a NUMA node has any memory attached to it.
+ *
+ * The decision is based on the node's total memory size as reported by numa_node_size64(),
+ * not on how much of that memory is currently free.
+ *
+ * Empty NUMA nodes are reserved for GPUs that may be connected in the future.
+ *
+ * @param node The identifier of the NUMA node to check.
+ * @return true if the node reports a non-zero memory size; false for memory-less nodes or when
+ *         the size cannot be queried.
+ */
+bool HasMemForNumaNode(int node) {
+    // numa_node_size64() is a C API: it signals failure by returning -1 instead of throwing, and it
+    // returns a long long, so the result is compared directly rather than narrowed to long.
+    return numa_node_size64(node, nullptr) > 0;
+}
+
+/**
+ * @brief Checks if the system has CPUs available for a specific NUMA node.
+ *
+ * This function determines whether there are CPUs available on the specified
+ * NUMA (Non-Uniform Memory Access) node. It is useful for ensuring that CPU
+ * affinity can be set to the desired NUMA node, which can help optimize memory
+ * access patterns and performance in NUMA-aware applications.
+ *
+ * Memory-only or Empty NUMA nodes are not considered to have CPUs available.
+ *
+ * @param node The identifier of the NUMA node to check.
+ * @return true if at least one CPU belongs to the node; false for CPU-less nodes or when the
+ *         CPU mask of the node cannot be read.
+ */
+bool HasCPUsForNumaNode(int node) {
+    struct bitmask *cpu_mask = numa_allocate_cpumask();
+    bool node_has_cpus = false;
+
+    if (numa_node_to_cpus(node, cpu_mask) == 0) {
+        // An empty mask means the node is memory-only (or completely empty)
+        node_has_cpus = numa_bitmask_weight(cpu_mask) > 0;
+    } else {
+        std::cerr << "Failed to get CPU mask for NUMA node " << node << ". ERROR: " << strerror(errno) << std::endl;
+    }
+
+    // The mask is released on both the success and the error path
+    numa_bitmask_free(cpu_mask);
+    return node_has_cpus;
+}
+
+/**
+ * @brief Parses command-line options for the CPU copy performance benchmark.
+ *
+ * Walks the long options given on the command line and stores the parsed values
+ * in the option structure supplied by the caller.
+ *
+ * @param argc The number of command-line arguments.
+ * @param argv The array of command-line arguments.
+ * @return An integer indicating the success or failure of the option parsing.
+ *         Returns 0 on success, and a non-zero value on failure.
+ *
+ * @param opts Output structure that receives the parsed values; must not be null.
+ * @note --size, --num_warm_up and --num_loops are mandatory, and --size and --num_loops must be
+ *       greater than zero. On any parsing error the usage text is printed and -1 is returned.
+ */
+int ParseOpts(int argc, char **argv, Opts *opts) {
+    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
+    // getopt_long() requires the option table to end with an all-zero entry; without it an
+    // unrecognized option makes it read past the end of the array.
+    const struct option options[] = {{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
+                                     {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
+                                     {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
+                                     {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)},
+                                     {nullptr, 0, nullptr, 0}};
+
+    // Parses a complete, non-negative decimal number. The value is scanned into an
+    // unsigned long long so the conversion specifier matches on every platform, and the
+    // extra %c conversion detects trailing characters such as "12abc".
+    auto parse_u64 = [](const char *text, uint64_t *out) {
+        unsigned long long value = 0;
+        char trailing = '\0';
+        // A leading '-' is rejected explicitly: the unsigned conversion would silently wrap it.
+        if (text == nullptr || text[0] == '-' || sscanf(text, "%llu%c", &value, &trailing) != 1) {
+            return false;
+        }
+        *out = static_cast<uint64_t>(value);
+        return true;
+    };
+
+    bool size_specified = false;
+    bool num_warm_up_specified = false;
+    bool num_loops_specified = false;
+    bool parse_err = false;
+
+    while (!parse_err) {
+        int opt_idx = 0;
+        const int getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
+        if (getopt_ret == -1) {
+            // All arguments consumed
+            break;
+        }
+        if (getopt_ret == '?') {
+            // Unknown option or missing option argument
+            parse_err = true;
+            break;
+        }
+
+        switch (opt_idx) {
+        case static_cast<int>(OptIdx::kSize):
+            // A zero-sized buffer cannot be benchmarked (bandwidth and latency would divide by zero)
+            size_specified = parse_u64(optarg, &(opts->size)) && opts->size > 0;
+            if (!size_specified) {
+                std::cerr << "Invalid size: " << optarg << std::endl;
+                parse_err = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kNumWarmUp):
+            num_warm_up_specified = parse_u64(optarg, &(opts->num_warm_up));
+            if (!num_warm_up_specified) {
+                std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
+                parse_err = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kNumLoops):
+            // At least one measured round is needed to produce a result
+            num_loops_specified = parse_u64(optarg, &(opts->num_loops)) && opts->num_loops > 0;
+            if (!num_loops_specified) {
+                std::cerr << "Invalid num_loops: " << optarg << std::endl;
+                parse_err = true;
+            }
+            break;
+        case static_cast<int>(OptIdx::kEnableCheckData):
+            opts->check_data = true;
+            break;
+        default:
+            parse_err = true;
+        }
+    }
+
+    // Every mandatory option must have been seen
+    if (!parse_err && !(size_specified && num_warm_up_specified && num_loops_specified)) {
+        parse_err = true;
+    }
+
+    if (parse_err) {
+        PrintUsage();
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * @brief Benchmark the memory copy performance between two NUMA nodes.
+ *
+ * This function times a single memcpy of opts.size bytes from a buffer allocated on the source
+ * NUMA node to a buffer allocated on the destination NUMA node. The calling thread is bound to the
+ * source node when it has CPUs, otherwise to the destination node.
+ *
+ * @param src_node The source NUMA node from which memory will be copied.
+ * @param dst_node The destination NUMA node to which memory will be copied.
+ * @param opts A reference to an Opts structure containing various options and configurations for the benchmark.
+ * @return The elapsed time of the copy in nanoseconds, or a negative value when the affinity could
+ *         not be set, an allocation failed, or the optional data check found a mismatch.
+ */
+double BenchmarkNUMACopy(int src_node, int dst_node, Opts &opts) {
+    // Set CPU affinity to the NUMA node with CPU cores associated
+    const int affinity_node = HasCPUsForNumaNode(src_node) ? src_node : dst_node;
+    if (numa_run_on_node(affinity_node) != 0) {
+        std::cerr << "Failed to set CPU affinity to NUMA node " << affinity_node << std::endl;
+        return -1;
+    }
+
+    // Allocate memory on the source and destination NUMA nodes
+    char *src = static_cast<char *>(numa_alloc_onnode(opts.size, src_node));
+    if (!src) {
+        std::cerr << "Memory allocation failed on node " << src_node << std::endl;
+        return -1;
+    }
+
+    char *dst = static_cast<char *>(numa_alloc_onnode(opts.size, dst_node));
+    if (!dst) {
+        std::cerr << "Memory allocation failed on node " << dst_node << std::endl;
+        // Do not leak the source buffer when only the destination allocation fails
+        numa_free(src, opts.size);
+        return -1;
+    }
+
+    // Initialize the source memory with some data
+    memset(src, 1, opts.size);
+    // Touch the destination as well so its pages are faulted in before the timed region;
+    // otherwise the measurement would include page-fault cost on top of the copy itself.
+    memset(dst, 0, opts.size);
+
+    // Measure the time taken for memcpy between nodes
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // Perform the memory copy
+    memcpy(dst, src, opts.size);
+
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;
+
+    // Total time of the copy
+    double total_time_ns = diff.count() * 1e9; // Convert seconds to nanoseconds
+
+    // Check the data integrity while both buffers are still allocated
+    bool data_ok = true;
+    if (opts.check_data && memcmp(src, dst, opts.size) != 0) {
+        std::cerr << "Data integrity check failed! src node: " << src_node << ", dst node: " << dst_node << std::endl;
+        data_ok = false;
+    }
+
+    // Free the allocated memory
+    numa_free(src, opts.size);
+    numa_free(dst, opts.size);
+
+    return data_ok ? total_time_ns : -1;
+}
+
+/**
+ * @brief Runs the CPU copy benchmark for one pair of NUMA nodes.
+ *
+ * This function first runs opts.num_warm_up untimed rounds and then opts.num_loops measured
+ * rounds of BenchmarkNUMACopy() for the given pair, keeping the slowest measured round.
+ *
+ * @param src_node The source NUMA node from which data will be copied.
+ * @param dst_node The destination NUMA node to which data will be copied.
+ * @param opts A reference to an Opts object containing various options and configurations for the benchmark.
+ * @return The largest copy time in nanoseconds over the measured rounds, or a negative value as
+ *         soon as any round (warm up or measured) fails.
+ */
+double RunCPUCopyBenchmark(int src_node, int dst_node, Opts &opts) {
+    double max_time_ns = 0;
+
+    // Run warm up rounds
+    for (uint64_t i = 0; i < opts.num_warm_up; i++) {
+        if (BenchmarkNUMACopy(src_node, dst_node, opts) < 0) {
+            return -1;
+        }
+    }
+
+    for (uint64_t i = 0; i < opts.num_loops; i++) {
+        const double time_used_ns = BenchmarkNUMACopy(src_node, dst_node, opts);
+        if (time_used_ns < 0) {
+            // Propagate the failure instead of letting std::max() hide it
+            return -1;
+        }
+        max_time_ns = std::max(max_time_ns, time_used_ns);
+    }
+
+    return max_time_ns;
+}
+
+/**
+ * @brief Program entry point: benchmarks the copy between every ordered pair of usable NUMA nodes.
+ *
+ * @return 0 when every benchmarked pair produced a result; non-zero on invalid options, when NUMA
+ *         is unavailable, when fewer than 2 nodes are configured, or when any pair failed.
+ */
+int main(int argc, char **argv) {
+    Opts opts;
+    int ret = ParseOpts(argc, argv, &opts);
+    if (0 != ret) {
+        return ret;
+    }
+
+    // Check if NUMA is supported on this system
+    if (-1 == numa_available()) {
+        std::cerr << "NUMA is not available on this system!" << std::endl;
+        return 1;
+    }
+
+    // Check if the system has multiple NUMA nodes
+    int num_of_numa_nodes = numa_num_configured_nodes();
+
+    if (num_of_numa_nodes < 2) {
+        std::cerr << "System has less than 2 NUMA nodes. Benchmark is not applicable." << std::endl;
+        return 1;
+    }
+
+    // Run the benchmark
+    // NOTE(review): node ids are assumed to be contiguous in [0, num_of_numa_nodes) - confirm for
+    // systems with sparse node numbering.
+    for (int src_node = 0; src_node < num_of_numa_nodes; src_node++) {
+        if (!HasMemForNumaNode(src_node)) {
+            // Skip the NUMA node if it has no memory to copy from
+            continue;
+        }
+
+        for (int dst_node = 0; dst_node < num_of_numa_nodes; dst_node++) {
+            if (src_node == dst_node) {
+                // Skip the same NUMA node
+                continue;
+            }
+
+            if (!HasMemForNumaNode(dst_node)) {
+                // Skip the NUMA node if it has no memory to copy to
+                continue;
+            }
+
+            if (!HasCPUsForNumaNode(src_node) && !HasCPUsForNumaNode(dst_node)) {
+                // Skip the pair if there are no CPUs available on both NUMA nodes
+                continue;
+            }
+
+            double time_used_ns = RunCPUCopyBenchmark(src_node, dst_node, opts);
+            if (time_used_ns <= 0) {
+                // A failed run (negative) or an unmeasurably short one (zero) cannot be turned into
+                // bandwidth/latency figures; report it and make the process exit code reflect it.
+                std::cerr << "Benchmark failed for NUMA node " << src_node << " to NUMA node " << dst_node
+                          << std::endl;
+                ret = 1;
+                continue;
+            }
+
+            double bw = opts.size / (time_used_ns / 1e9) / 1e6; // MB/s
+            double latency = time_used_ns / opts.size;          // ns/byte
+
+            // Output the result
+            // NOTE(review): the two metric names use different node naming ("<src>_<dst>" vs
+            // "<src>_to_node<dst>"); kept as-is because consumers parse these names.
+            std::cout << "mem_bandwidth_matrix_numa_" << src_node << "_" << dst_node << "_bw: " << std::setprecision(9)
+                      << bw << std::endl;
+            std::cout << "mem_bandwidth_matrix_numa_" << src_node << "_to_node" << dst_node
+                      << "_lat: " << std::setprecision(9) << latency << std::endl;
+        }
+    }
+
+    return ret;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/cpu_memory_bw_latency_performance.py b/superbench/benchmarks/micro_benchmarks/cpu_memory_bw_latency_performance.py
index 85559cb65..79e369ec9 100644
--- a/superbench/benchmarks/micro_benchmarks/cpu_memory_bw_latency_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cpu_memory_bw_latency_performance.py
@@ -1,9 +1,10 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
-"""Module for running the Intel MLC tool to measure memory bandwidth and latency.""" +"""Module to measure memory bandwidth and latency.""" import os +import platform from superbench.common.utils import logger from superbench.benchmarks import BenchmarkRegistry, ReturnCode @@ -21,13 +22,14 @@ def __init__(self, name, parameters=''): """ super().__init__(name, parameters) - self._bin_name = 'mlc' + self._bin_name = 'mlc' if 'x86_64' in platform.machine() else 'cpu_copy' self.__support_mlc_commands = ['bandwidth_matrix', 'latency_matrix', 'max_bandwidth'] def add_parser_arguments(self): """Add the specified arguments.""" super().add_parser_arguments() + # Add arguments for the Intel MLC tool. self._parser.add_argument( '--tests', type=str, @@ -37,15 +39,39 @@ def add_parser_arguments(self): help='The modes to run mlc with. Possible values are {}.'.format(' '.join(self.__support_mlc_commands)) ) - def _preprocess(self): - """Preprocess/preparation operations before the benchmarking. + # Add arguments for the general CPU copy benchmark. + self._parser.add_argument( + '--size', + type=int, + default=256 * 1024**2, + required=False, + help='Size of data buffer in bytes for non mlc benchmark. Default is 256MB.', + ) - Return: - True if _preprocess() succeed. - """ - if not super()._preprocess(): - return False + self._parser.add_argument( + '--num_warm_up', + type=int, + default=20, + required=False, + help='Number of warm up rounds for non mlc benchmark. Default is 20.', + ) + + self._parser.add_argument( + '--num_loops', + type=int, + default=100, + required=False, + help='Number of data buffer copies performed for non mlc benchmark. Default is 100.', + ) + self._parser.add_argument( + '--check_data', + action='store_true', + help='Enable data checking for non mlc benchmark. 
Default is False.', + ) + + def _preprocess_mlc(self): + """ Preprocess/preparation operations for the Intel MLC tool.""" mlc_path = os.path.join(self._args.bin_dir, self._bin_name) ret_val = os.access(mlc_path, os.X_OK | os.F_OK) if not ret_val: @@ -66,18 +92,36 @@ def _preprocess(self): self._commands.append(mlc_wrapper % command) return True - def _process_raw_result(self, cmd_idx, raw_output): - """Function to parse raw results and save the summarized results. + def _preprocess_general(self): + """Preprocess/preparation operations for the general CPU copy benchmark.""" + # TODO: enable hugepages? - self._result.add_raw_data() and self._result.add_result() need to be called to save the results. + self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name) - Args: - cmd_idx (int): the index of command corresponding with the raw_output. - raw_output (str): raw output string of the micro-benchmark. + args = '--size %d --num_warm_up %d --num_loops %d' % ( + self._args.size, self._args.num_warm_up, self._args.num_loops + ) + + if self._args.check_data: + args += ' --check_data' + + self._commands = ['%s %s' % (self.__bin_path, args)] + + return True + + def _preprocess(self): + """Preprocess/preparation operations before the benchmarking. Return: - True if the raw output string is valid and result can be extracted. + True if _preprocess() succeed. 
""" + if not super()._preprocess(): + return False + + return self._preprocess_mlc() if 'x86_64' in platform.machine() else self._preprocess_general() + + def _process_raw_result_mlc(self, cmd_idx, raw_output): + """Function to parse raw results for the Intel MLC tool and save the summarized results.""" self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data) # parse the command to see which command this output belongs to @@ -116,8 +160,47 @@ def _process_raw_result(self, cmd_idx, raw_output): else: metric = 'mem_{}_{}_{}_{}'.format(mlc_test, key, str(index), measure).lower() self._result.add_result(metric, float(out_table[key][index])) + return True + def _process_raw_result_genneral(self, cmd_idx, raw_output): + """Function to parse raw results for the general CPU copy benchmark and save the summarized results.""" + self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data) + + try: + for output_line in raw_output.strip().splitlines(): + name, value = output_line.split(':') + self._result.add_result(name.strip(), float(value.strip())) + except BaseException as e: + self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) + logger.error( + 'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format( + self._curr_run_index, self._name, raw_output, str(e) + ) + ) + + return False + + return True + + def _process_raw_result(self, cmd_idx, raw_output): + """Function to parse raw results and save the summarized results. + + self._result.add_raw_data() and self._result.add_result() need to be called to save the results. + + Args: + cmd_idx (int): the index of command corresponding with the raw_output. + raw_output (str): raw output string of the micro-benchmark. + + Return: + True if the raw output string is valid and result can be extracted. 
+ """ + return ( + self._process_raw_result_mlc(cmd_idx, raw_output) + if 'x86_64' in platform.machine() + else self._process_raw_result_genneral(cmd_idx, raw_output) + ) + def _parse_bw_latency(self, raw_output): out_table = dict() for line in raw_output.splitlines(): @@ -146,5 +229,4 @@ def _parse_max_bw(self, raw_output): out_table[key] = [vals[-1]] return out_table - BenchmarkRegistry.register_benchmark('cpu-memory-bw-latency', CpuMemBwLatencyBenchmark)