From 1acf48b1c1bfa1c0f8cc90a443f0a511f7e7d0c7 Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Tue, 21 Nov 2023 18:41:37 +0800
Subject: [PATCH 1/2] Upgrade Docker image to CUDA 12.2

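Upgrade the Docker image to CUDA 12.2 for H100:
* upgrade the base image to nvcr.io/nvidia/pytorch:23.10-py3
* pin onnxruntime-gpu==1.10.0 only for Python < 3.10 and leave it unpinned for Python >= 3.10
* fix compilation errors in cublas_benchmark.h by adding explicit casts to T2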
---
 ...uda12.1.dockerfile => cuda12.2.dockerfile} | 33 ++++++++++---------
 setup.py                                      |  3 +-
 .../cublas_function/cublas_benchmark.h        |  6 ++--
 3 files changed, 22 insertions(+), 20 deletions(-)
 rename dockerfile/{cuda12.1.dockerfile => cuda12.2.dockerfile} (80%)

diff --git a/dockerfile/cuda12.1.dockerfile b/dockerfile/cuda12.2.dockerfile
similarity index 80%
rename from dockerfile/cuda12.1.dockerfile
rename to dockerfile/cuda12.2.dockerfile
index 2f9e430fa..a3cf01f33 100644
--- a/dockerfile/cuda12.1.dockerfile
+++ b/dockerfile/cuda12.2.dockerfile
@@ -1,16 +1,16 @@
-FROM nvcr.io/nvidia/pytorch:23.03-py3
+FROM nvcr.io/nvidia/pytorch:23.10-py3
 
 # OS:
-#   - Ubuntu: 20.04
-#   - OpenMPI: 4.1.5a1
+#   - Ubuntu: 22.04
+#   - OpenMPI: 4.1.5rc2
 #   - Docker Client: 20.10.8
 # NVIDIA:
-#   - CUDA: 12.1.0
-#   - cuDNN: 8.8.1.3
-#   - NCCL: v2.17.1-1
+#   - CUDA: 12.2.2
+#   - cuDNN: 8.9.5
+#   - NCCL: v2.19.3-1
 # Mellanox:
-#   - OFED: 5.2-2.2.3.0 # TODO
-#   - HPC-X: v2.14
+#   - OFED: 23.07-0.5.1.2
+#   - HPC-X: v2.16
 # Intel:
 #   - mlc: v3.10
 
@@ -74,20 +74,20 @@ RUN mkdir -p /root/.ssh && \
     echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
 
 # Install OFED
-ENV OFED_VERSION=5.2-2.2.3.0
+ENV OFED_VERSION=23.07-0.5.1.2
 RUN cd /tmp && \
-    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
-    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
-    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
+    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
+    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
     rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
 
 # Install HPC-X
-ENV HPCX_VERSION=v2.14
+ENV HPCX_VERSION=v2.16
 RUN cd /opt && \
     rm -rf hpcx && \
-    wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \
+    wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz -O hpcx.tbz && \
     tar xf hpcx.tbz && \
-    mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \
+    mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64 hpcx && \
     rm hpcx.tbz
 
 # Install Intel MLC
@@ -131,7 +131,8 @@ ADD third_party third_party
 RUN make -C third_party cuda
 
 ADD . .
-RUN python3 -m pip install --no-cache-dir .[nvworker] && \
+RUN python3 -m pip install --upgrade setuptools==65.7 && \
+    python3 -m pip install --no-cache-dir .[nvworker] && \
     make cppbuild && \
     make postinstall && \
     rm -rf .git
diff --git a/setup.py b/setup.py
index 23c796833..a05dcfda4 100644
--- a/setup.py
+++ b/setup.py
@@ -213,7 +213,8 @@ def run(self):
             ],
             'ort': [
                 'onnx>=1.10.2',
-                'onnxruntime-gpu==1.10.0',
+                'onnxruntime-gpu==1.10.0; python_version<"3.10"',
+                'onnxruntime-gpu; python_version>="3.10"',
             ],
             'nvidia': ['py3nvml>=0.2.6'],
         }
diff --git a/superbench/benchmarks/micro_benchmarks/cublas_function/cublas_benchmark.h b/superbench/benchmarks/micro_benchmarks/cublas_function/cublas_benchmark.h
index a432c6392..9092ad024 100644
--- a/superbench/benchmarks/micro_benchmarks/cublas_function/cublas_benchmark.h
+++ b/superbench/benchmarks/micro_benchmarks/cublas_function/cublas_benchmark.h
@@ -366,8 +366,8 @@ void CublasFunction::matrix_calculation_on_cpu_with_data(const T1 *Parameter_0_0
             for (int j = 0; j < n; j++) {
                 (*Result_cpu)[i + j * m + b * m * n] = beta * (T2)(Result_3_0_host[i + j * m + b * m * n]);
                 for (int p = 0; p < k; p++) {
-                    (*Result_cpu)[i + j * m + b * m * n] +=
-                        Parameter_0_0_host_op[p * m + i + b * m * k] * Parameter_1_0_host_op[j * k + p + b * k * n];
+                    (*Result_cpu)[i + j * m + b * m * n] += (T2)(Parameter_0_0_host_op[p * m + i + b * m * k] *
+                                                                 Parameter_1_0_host_op[j * k + p + b * k * n]);
                     (*Result_cpu)[i + j * m + b * m * n] *= alpha;
                 }
             }
@@ -444,7 +444,7 @@ int CublasFunction::check_result(int batch_count, T1 *Result_3_0, T2 *Result_cpu
     //     |<x, y>_cpu - <x,y>_gpu|/|<x, y>_cpu|/dot_length  < eps
     int error_count = 0;
     for (int i = 0; i < static_cast<int>(m * n) * batch_count; i++) {
-        double abs_err = fabs(Result_cpu[i] - Result_3_0_host[i]);
+        double abs_err = fabs(Result_cpu[i] - (T2)(Result_3_0_host[i]));
         double dot_length = k;
         double abs_val = fabs(Result_cpu[i]);
         double rel_err = abs_err / abs_val / dot_length;

From 797db2b736afa0e824a2888a3d78c15e6b43a2c8 Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Tue, 21 Nov 2023 19:23:50 +0800
Subject: [PATCH 2/2] Update github action

Update the build-image GitHub Actions workflow to build the cuda12.2 image instead of cuda12.1.
---
 .github/workflows/build-image.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index e2dad1a66..3a5fad8df 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -24,9 +24,9 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cuda12.1
-          dockerfile: cuda12.1
-          tags: superbench/main:cuda12.1
+        - name: cuda12.2
+          dockerfile: cuda12.2
+          tags: superbench/main:cuda12.2
         - name: cuda11.1.1
           dockerfile: cuda11.1.1
           tags: superbench/main:cuda11.1.1,superbench/superbench:latest