diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu index 74710c3c8..46c83c028 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu +++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu @@ -3,9 +3,9 @@ // GPU copy benchmark tests dtoh/htod/dtod data transfer bandwidth by GPU SM/DMA. +#include // errno #include #include -#include #include #include @@ -313,6 +313,25 @@ int SetGpu(int gpu_id) { return 0; } +// Check if its NUMA node has CPUs. +bool HasCPUsForNumaNode(int node) { + struct bitmask *bm = numa_allocate_cpumask(); + + int numa_err = numa_node_to_cpus(node, bm); + if (numa_err != 0) { + fprintf(stderr, "HasCPUsForNumaNode::numa_node_to_cpus error on node: %d, code: %d, message: %s\n", node, errno, + strerror(errno)); + + numa_bitmask_free(bm); + return false; // On error + } + + // Check if any CPU is assigned to the NUMA node, has_cpus is false for mem only numa nodes + bool has_cpus = (numa_bitmask_weight(bm) > 0); + numa_free_cpumask(bm); + return has_cpus; +} + #if defined(__HIP_PLATFORM_AMD__) bool UseFineGrained(const SubBenchArgs &args) { return args.is_src_dev_gpu && args.is_dst_dev_gpu && args.src_gpu_id != args.dst_gpu_id; @@ -1134,6 +1153,12 @@ int main(int argc, char **argv) { // Scan all NUMA nodes for (int i = 0; i < numa_count; i++) { args.numa_id = i; + + // Avoid numa nodes without CPUS(eg. Nvidia Grace systems have memory only numa node) + if (!HasCPUsForNumaNode(args.numa_id)) { + continue; + } + // Scan all GPUs for (int j = 0; j < gpu_count; j++) { // Host-to-device benchmark