From 82cd7c186ca13c9e8024a7f76f969002a188525c Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Wed, 30 Oct 2024 14:17:13 -0700
Subject: [PATCH 1/5] skip mem only numa nodes on grace systems

---
 .../gpu_copy_performance/gpu_copy.cu          | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
index 74710c3c8..83c3a8af7 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
@@ -313,6 +313,21 @@ int SetGpu(int gpu_id) {
     return 0;
 }
 
+bool HasCPUsForNumaNode(int node) {
+    struct bitmask *bm = numa_allocate_nodemask();
+
+    if (numa_node_to_cpus(node, bm) < 0) {
+        fprintf(stderr, "numa_node_to_cpus error on node: %d\n", node);
+        numa_bitmask_free(bm);
+        return false; // On error
+    }
+
+    // Check if any CPU is assigned to the NUMA node, has_cpus is false for mem only numa nodes
+    bool has_cpus = (numa_bitmask_weight(bm) > 0);
+    numa_bitmask_free(bm);
+    return has_cpus;
+}
+
 #if defined(__HIP_PLATFORM_AMD__)
 bool UseFineGrained(const SubBenchArgs &args) {
     return args.is_src_dev_gpu && args.is_dst_dev_gpu && args.src_gpu_id != args.dst_gpu_id;
@@ -1134,6 +1149,12 @@ int main(int argc, char **argv) {
     // Scan all NUMA nodes
     for (int i = 0; i < numa_count; i++) {
         args.numa_id = i;
+
+        // Avoid numa nodes without CPUS(eg. Nvidia Grace Hopper memory only numa node)
+        if (!HasCPUsForNumaNode(args.numa_id)) {
+            continue;
+        }
+
         // Scan all GPUs
         for (int j = 0; j < gpu_count; j++) {
             // Host-to-device benchmark

From 68b262bf60824c4277f3abeb59ba57f2ecfecc28 Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Wed, 30 Oct 2024 14:19:44 -0700
Subject: [PATCH 2/5] comment cleanup

---
 .../micro_benchmarks/gpu_copy_performance/gpu_copy.cu           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
index 83c3a8af7..6ec8625f5 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
@@ -1150,7 +1150,7 @@ int main(int argc, char **argv) {
     for (int i = 0; i < numa_count; i++) {
         args.numa_id = i;
 
-        // Avoid numa nodes without CPUS(eg. Nvidia Grace Hopper memory only numa node)
+        // Skip NUMA nodes without CPUs (e.g., NVIDIA Grace systems expose memory-only NUMA nodes)
         if (!HasCPUsForNumaNode(args.numa_id)) {
             continue;
         }

From a28c6d9c5550a525b6804c52dae3260fe8f76141 Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Wed, 30 Oct 2024 19:10:38 -0700
Subject: [PATCH 3/5] address PR, add error code in logs

---
 .../micro_benchmarks/gpu_copy_performance/gpu_copy.cu       | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
index 6ec8625f5..d2d137e91 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
@@ -313,11 +313,13 @@ int SetGpu(int gpu_id) {
     return 0;
 }
 
+// Check if its NUMA node has CPUs.
 bool HasCPUsForNumaNode(int node) {
     struct bitmask *bm = numa_allocate_nodemask();
 
-    if (numa_node_to_cpus(node, bm) < 0) {
-        fprintf(stderr, "numa_node_to_cpus error on node: %d\n", node);
+    int numa_err = numa_node_to_cpus(node, bm);
+    if (numa_err != 0) {
+        fprintf(stderr, "numa_node_to_cpus error on node: %d, error code: %d\n", node, numa_err);
         numa_bitmask_free(bm);
         return false; // On error
     }

From 41492d6da92b70e490d16898e4ac320ad59feaae Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Wed, 30 Oct 2024 20:18:14 -0700
Subject: [PATCH 4/5] address PR, fix error code and msg

---
 .../micro_benchmarks/gpu_copy_performance/gpu_copy.cu       | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
index d2d137e91..ce5210f46 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
@@ -3,9 +3,9 @@
 
 // GPU copy benchmark tests dtoh/htod/dtod data transfer bandwidth by GPU SM/DMA.
 
+#include <cerrno> // errno
 #include <cstdio>
 #include <cstring>
-#include <string>
 #include <vector>
 
 #include <getopt.h>
@@ -319,7 +319,9 @@ bool HasCPUsForNumaNode(int node) {
 
     int numa_err = numa_node_to_cpus(node, bm);
     if (numa_err != 0) {
-        fprintf(stderr, "numa_node_to_cpus error on node: %d, error code: %d\n", node, numa_err);
+        fprintf(stderr, "HasCPUsForNumaNode::numa_node_to_cpus error on node: %d, code: %d, message: %s\n", node, errno,
+                strerror(errno));
+
         numa_bitmask_free(bm);
         return false; // On error
     }

From 49db162ba947d38b30ec3327255be350c43bec8c Mon Sep 17 00:00:00 2001
From: dilip patlolla <dilipreddi@gmail.com>
Date: Mon, 4 Nov 2024 16:44:24 -0800
Subject: [PATCH 5/5] fix wrong bitmask usage

---
 .../micro_benchmarks/gpu_copy_performance/gpu_copy.cu         | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
index ce5210f46..46c83c028 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
@@ -315,7 +315,7 @@ int SetGpu(int gpu_id) {
 
 // Check if its NUMA node has CPUs.
 bool HasCPUsForNumaNode(int node) {
-    struct bitmask *bm = numa_allocate_nodemask();
+    struct bitmask *bm = numa_allocate_cpumask();
 
     int numa_err = numa_node_to_cpus(node, bm);
     if (numa_err != 0) {
@@ -328,7 +328,7 @@ bool HasCPUsForNumaNode(int node) {
 
     // Check if any CPU is assigned to the NUMA node, has_cpus is false for mem only numa nodes
     bool has_cpus = (numa_bitmask_weight(bm) > 0);
-    numa_bitmask_free(bm);
+    numa_free_cpumask(bm);
     return has_cpus;
 }