From 82cd7c186ca13c9e8024a7f76f969002a188525c Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Wed, 30 Oct 2024 14:17:13 -0700 Subject: [PATCH 1/5] skip mem only numa nodes on grace systems --- .../gpu_copy_performance/gpu_copy.cu | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu index 74710c3c8..83c3a8af7 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu +++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu @@ -313,6 +313,21 @@ int SetGpu(int gpu_id) { return 0; } +bool HasCPUsForNumaNode(int node) { + struct bitmask *bm = numa_allocate_nodemask(); + + if (numa_node_to_cpus(node, bm) < 0) { + fprintf(stderr, "numa_node_to_cpus error on node: %d\n", node); + numa_bitmask_free(bm); + return false; // On error + } + + // Check if any CPU is assigned to the NUMA node, has_cpus is false for mem only numa nodes + bool has_cpus = (numa_bitmask_weight(bm) > 0); + numa_bitmask_free(bm); + return has_cpus; +} + #if defined(__HIP_PLATFORM_AMD__) bool UseFineGrained(const SubBenchArgs &args) { return args.is_src_dev_gpu && args.is_dst_dev_gpu && args.src_gpu_id != args.dst_gpu_id; @@ -1134,6 +1149,12 @@ int main(int argc, char **argv) { // Scan all NUMA nodes for (int i = 0; i < numa_count; i++) { args.numa_id = i; + + // Avoid numa nodes without CPUS(eg. Nvidia Grace Hopper memory only numa node) + if (!HasCPUsForNumaNode(args.numa_id)) { + continue; + } + // Scan all GPUs for (int j = 0; j < gpu_count; j++) { // Host-to-device benchmark From 68b262bf60824c4277f3abeb59ba57f2ecfecc28 Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Wed, 30 Oct 2024 14:19:44 -0700 Subject: [PATCH 2/5] comment cleanup --- .../micro_benchmarks/gpu_copy_performance/gpu_copy.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu index 83c3a8af7..6ec8625f5 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu +++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu @@ -1150,7 +1150,7 @@ int main(int argc, char **argv) { for (int i = 0; i < numa_count; i++) { args.numa_id = i; - // Avoid numa nodes without CPUS(eg. Nvidia Grace Hopper memory only numa node) + // Avoid numa nodes without CPUS(eg. Nvidia Grace systems have memory only numa node) if (!HasCPUsForNumaNode(args.numa_id)) { continue; } From a28c6d9c5550a525b6804c52dae3260fe8f76141 Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Wed, 30 Oct 2024 19:10:38 -0700 Subject: [PATCH 3/5] address PR, add error code in logs --- .../micro_benchmarks/gpu_copy_performance/gpu_copy.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu index 6ec8625f5..d2d137e91 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu +++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu @@ -313,11 +313,13 @@ int SetGpu(int gpu_id) { return 0; } +// Check if its NUMA node has CPUs. bool HasCPUsForNumaNode(int node) { struct bitmask *bm = numa_allocate_nodemask(); - if (numa_node_to_cpus(node, bm) < 0) { - fprintf(stderr, "numa_node_to_cpus error on node: %d\n", node); + int numa_err = numa_node_to_cpus(node, bm); + if (numa_err != 0) { + fprintf(stderr, "numa_node_to_cpus error on node: %d, error code: %d\n", node, numa_err); numa_bitmask_free(bm); return false; // On error } From 41492d6da92b70e490d16898e4ac320ad59feaae Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Wed, 30 Oct 2024 20:18:14 -0700 Subject: [PATCH 4/5] address PR, fix error code and msg --- .../micro_benchmarks/gpu_copy_performance/gpu_copy.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu index d2d137e91..ce5210f46 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu +++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu @@ -3,9 +3,9 @@ // GPU copy benchmark tests dtoh/htod/dtod data transfer bandwidth by GPU SM/DMA. +#include // errno #include #include -#include #include #include @@ -319,7 +319,9 @@ bool HasCPUsForNumaNode(int node) { int numa_err = numa_node_to_cpus(node, bm); if (numa_err != 0) { - fprintf(stderr, "numa_node_to_cpus error on node: %d, error code: %d\n", node, numa_err); + fprintf(stderr, "HasCPUsForNumaNode::numa_node_to_cpus error on node: %d, code: %d, message: %s\n", node, errno, + strerror(errno)); + numa_bitmask_free(bm); return false; // On error } From 49db162ba947d38b30ec3327255be350c43bec8c Mon Sep 17 00:00:00 2001 From: dilip patlolla Date: Mon, 4 Nov 2024 16:44:24 -0800 Subject: [PATCH 5/5] fix wrong bitmask usage --- .../micro_benchmarks/gpu_copy_performance/gpu_copy.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu index ce5210f46..46c83c028 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu +++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu @@ -315,7 +315,7 @@ int SetGpu(int gpu_id) { // Check if its NUMA node has CPUs. bool HasCPUsForNumaNode(int node) { - struct bitmask *bm = numa_allocate_nodemask(); + struct bitmask *bm = numa_allocate_cpumask(); int numa_err = numa_node_to_cpus(node, bm); if (numa_err != 0) { @@ -328,7 +328,7 @@ bool HasCPUsForNumaNode(int node) { // Check if any CPU is assigned to the NUMA node, has_cpus is false for mem only numa nodes bool has_cpus = (numa_bitmask_weight(bm) > 0); - numa_bitmask_free(bm); + numa_free_cpumask(bm); return has_cpus; }