Skip to content

Commit

Permalink
Merge branch 'guzhao/amd_monitor' of github.com:microsoft/superbenchm…
Browse files Browse the repository at this point in the history
…ark into guzhao/amd_monitor
  • Loading branch information
guoshzhao committed Nov 23, 2023
2 parents f7a96de + b516b0e commit 81e060d
Show file tree
Hide file tree
Showing 16 changed files with 454 additions and 134 deletions.
2 changes: 1 addition & 1 deletion .azure-pipelines/cuda-unit-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ steps:
- script: |
SB_MICRO_PATH=$PWD python3 setup.py test
displayName: Run unit tests
timeoutInMinutes: 30
timeoutInMinutes: 60
- script: |
bash <(curl -s https://codecov.io/bash) -cF cuda-unit-test
displayName: Report coverage results
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ jobs:
strategy:
matrix:
include:
- name: cuda12.1
dockerfile: cuda12.1
tags: superbench/main:cuda12.1
- name: cuda12.2
dockerfile: cuda12.2
tags: superbench/main:cuda12.2
- name: cuda11.1.1
dockerfile: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest
Expand Down
33 changes: 17 additions & 16 deletions dockerfile/cuda12.1.dockerfile → dockerfile/cuda12.2.dockerfile
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
FROM nvcr.io/nvidia/pytorch:23.03-py3
FROM nvcr.io/nvidia/pytorch:23.10-py3

# OS:
# - Ubuntu: 20.04
# - OpenMPI: 4.1.5a1
# - Ubuntu: 22.04
# - OpenMPI: 4.1.5rc2
# - Docker Client: 20.10.8
# NVIDIA:
# - CUDA: 12.1.0
# - cuDNN: 8.8.1.3
# - NCCL: v2.17.1-1
# - CUDA: 12.2.2
# - cuDNN: 8.9.5
# - NCCL: v2.19.3-1
# Mellanox:
# - OFED: 5.2-2.2.3.0 # TODO
# - HPC-X: v2.14
# - OFED: 23.07-0.5.1.2
# - HPC-X: v2.16
# Intel:
# - mlc: v3.10

Expand Down Expand Up @@ -74,20 +74,20 @@ RUN mkdir -p /root/.ssh && \
echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf

# Install OFED
ENV OFED_VERSION=5.2-2.2.3.0
ENV OFED_VERSION=23.07-0.5.1.2
RUN cd /tmp && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

# Install HPC-X
ENV HPCX_VERSION=v2.14
ENV HPCX_VERSION=v2.16
RUN cd /opt && \
rm -rf hpcx && \
wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \
wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz -O hpcx.tbz && \
tar xf hpcx.tbz && \
mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \
mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64 hpcx && \
rm hpcx.tbz

# Install Intel MLC
Expand Down Expand Up @@ -131,7 +131,8 @@ ADD third_party third_party
RUN make -C third_party cuda

ADD . .
RUN python3 -m pip install --no-cache-dir .[nvworker] && \
RUN python3 -m pip install --upgrade setuptools==65.7 && \
python3 -m pip install --no-cache-dir .[nvworker] && \
make cppbuild && \
make postinstall && \
rm -rf .git
2 changes: 1 addition & 1 deletion dockerfile/rocm5.0.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ RUN echo PATH="$PATH" > /etc/environment && \
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party rocm
RUN make -C third_party rocm -o rocm_hipblaslt

ADD . .
RUN python3 -m pip install --upgrade setuptools==65.7 && \
Expand Down
2 changes: 1 addition & 1 deletion dockerfile/rocm5.1.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ RUN echo PATH="$PATH" > /etc/environment && \
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm -o rocm_hipblaslt

ADD . .
RUN python3 -m pip install --no-cache-dir .[amdworker] && \
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,8 @@ def run(self):
],
'ort': [
'onnx>=1.10.2',
'onnxruntime-gpu==1.10.0',
'onnxruntime-gpu==1.10.0; python_version<"3.10"',
'onnxruntime-gpu; python_version>="3.10"',
],
'nvidia': ['py3nvml>=0.2.6'],
'amd': ['pyrsmi>=1.0.1'],
Expand Down
4 changes: 4 additions & 0 deletions superbench/benchmarks/micro_benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

from superbench.benchmarks.micro_benchmarks.computation_communication_overlap import ComputationCommunicationOverlap
from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
from superbench.benchmarks.micro_benchmarks.blaslt_function_base import BlasLtBaseBenchmark
from superbench.benchmarks.micro_benchmarks.cublaslt_function import CublasLtBenchmark
from superbench.benchmarks.micro_benchmarks.hipblaslt_function import HipBlasLtBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance import CudaGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
Expand Down Expand Up @@ -37,6 +39,7 @@
from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops

__all__ = [
'BlasLtBaseBenchmark',
'ComputationCommunicationOverlap',
'CpuMemBwLatencyBenchmark',
'CpuHplBenchmark',
Expand All @@ -49,6 +52,7 @@
'CudnnBenchmark',
'DiskBenchmark',
'DistInference',
'HipBlasLtBenchmark',
'GPCNetBenchmark',
'GemmFlopsBenchmark',
'GpuBurnBenchmark',
Expand Down
141 changes: 141 additions & 0 deletions superbench/benchmarks/micro_benchmarks/blaslt_function_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module of the BLASLt GEMM Base Class."""
import itertools

from superbench.common.utils import logger
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


def mrange(start, stop=-1, multiplication_factor=2, symbol='x'):
    """Range constructor with a multiplicative or additive step.

    The first value (``start``) is always yielded, even when it is already
    greater than ``stop``. Subsequent values are produced by repeatedly
    multiplying (``symbol='x'``) or adding (``symbol='+'``) the factor until
    the running value exceeds ``stop``.

    Args:
        start (int): Start number, always yielded first.
        stop (int, optional): Inclusive upper bound. Defaults to -1.
        multiplication_factor (int, optional): Factor to multiply by ('x') or
            step to add ('+'). Defaults to 2.
        symbol (str, optional): 'x' for multiplication, '+' for addition.
            Defaults to 'x'.

    Yields:
        int: number in the range.

    Raises:
        ValueError: If symbol is neither 'x' nor '+'. Raised when iteration
            starts, because this is a generator function.
    """
    if symbol == 'x':
        while True:
            yield start
            start *= multiplication_factor
            # A non-positive value can never grow past `stop` by multiplication
            # (zero stays zero, negatives move further away), and a factor
            # below 2 never grows either, so stop to avoid an endless loop.
            if start > stop or start <= 0 or multiplication_factor < 2:
                break
    elif symbol == '+':
        while True:
            yield start
            start += multiplication_factor
            # Only a step below 1 makes no progress; passing through zero is a
            # legitimate intermediate value for an additive range.
            if start > stop or multiplication_factor < 1:
                break
    else:
        raise ValueError(f'Invalid symbol {symbol}.')


def validate_mrange(string):
    """Validate mrange string in format start[[:stop]:multiplication_factor].

    Each field must consist of decimal digits only. The third field
    (multiplication factor) may additionally carry a leading '+' or 'x'
    prefix.

    Args:
        string (str): mrange string.

    Returns:
        bool: whether the mrange is expected.
    """
    nums = string.split(':')
    if len(nums) > 3:
        return False

    # str.isdecimal() is used instead of str.isdigit(): isdigit() also accepts
    # characters such as superscript digits that int() cannot parse, which
    # would let a string pass validation and then fail during conversion.
    if len(nums) < 3:
        return all(x.isdecimal() for x in nums)

    factor = nums[2]
    return (
        nums[0].isdecimal() and nums[1].isdecimal()
        and (factor.lstrip('+').isdecimal() or factor.lstrip('x').isdecimal())
    )


class BlasLtBaseBenchmark(MicroBenchmarkWithInvoke):
    """The BLASLt GEMM Base class.

    Registers the shape/batch/step command line arguments and expands them
    into ``self._shapes_to_run``, a list of ``(m, n, k, batch, in_type)``
    tuples.

    NOTE(review): ``self._in_types`` and the ``in_types`` argument are read in
    ``_preprocess`` but are not defined in this class; presumably subclasses
    provide them -- confirm against the subclasses.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--shapes',
            type=str,
            nargs='+',
            default=[f'{x},{x},{x}' for x in [2048, 4096, 8192]],
            help='Shapes in m,n,k format. Support format start:stop:multiplication_factor, e.g., 16:128:2.',
        )
        self._parser.add_argument(
            '--batch',
            type=str,
            default='0',
            required=False,
            help=(
                'Batch size for strided batch GEMM, set 0 to disable.'
                ' Support format start:stop:multiplication_factor, e.g., 16:128:2.'
            ),
        )
        self._parser.add_argument(
            '--num_warmup',
            type=int,
            default=20,
            required=False,
            help='Number of warm up steps.',
        )
        self._parser.add_argument(
            '--num_steps',
            type=int,
            default=50,
            required=False,
            help='Number of steps to measure.',
        )

    @staticmethod
    def _expand_mrange(spec):
        """Build the mrange generator for an already validated mrange string.

        Args:
            spec (str): string in format start[[:stop]:[+|x]factor] that has
                passed validate_mrange().

        Returns:
            generator: mrange over the parsed numbers, additive when the
                factor is prefixed with '+', multiplicative otherwise.
        """
        parts = spec.split(':')
        symbol = 'x'
        # Only the third field may carry an operator prefix; strip it before
        # the integer conversion and remember which operation it selects.
        if len(parts) == 3 and parts[2][:1] in ('+', 'x'):
            symbol = parts[2][0]
            parts[2] = parts[2].lstrip(symbol)
        return mrange(*map(int, parts), symbol=symbol)

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Validates the batch, input type and shape arguments, then fills
        ``self._shapes_to_run`` with one ``(m, n, k, batch, in_type)`` tuple
        per combination, ordered by input type, then batch, then shape.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if not validate_mrange(self._args.batch):
            logger.error(f'Invalid batch size {self._args.batch}.')
            return False

        for _in_type in self._args.in_types:
            if _in_type not in self._in_types:
                logger.error(f'Invalid input type {_in_type}.')
                return False

        self._shapes_to_run = []

        # Shapes do not depend on the input type or the batch size, so they
        # are validated and expanded once instead of once per combination.
        mnk_list = []
        for shape in self._args.shapes:
            shape_list = shape.replace(',', ' ').split()
            if len(shape_list) != 3 or not all(validate_mrange(x) for x in shape_list):
                logger.error(f'Invalid shape {shape}.')
                return False
            mnk_list.extend(itertools.product(*map(self._expand_mrange, shape_list)))

        # The batch string is parsed with the same helper as the shapes so
        # that a factor prefix accepted by validate_mrange() ('x' or '+') is
        # honoured instead of failing in int() or being silently ignored.
        batches = list(self._expand_mrange(self._args.batch))

        for _in_type in self._args.in_types:
            for _b in batches:
                for _m, _n, _k in mnk_list:
                    self._shapes_to_run.append((_m, _n, _k, _b, _in_type))

        return True
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,8 @@ void CublasFunction::matrix_calculation_on_cpu_with_data(const T1 *Parameter_0_0
for (int j = 0; j < n; j++) {
(*Result_cpu)[i + j * m + b * m * n] = beta * (T2)(Result_3_0_host[i + j * m + b * m * n]);
for (int p = 0; p < k; p++) {
(*Result_cpu)[i + j * m + b * m * n] +=
Parameter_0_0_host_op[p * m + i + b * m * k] * Parameter_1_0_host_op[j * k + p + b * k * n];
(*Result_cpu)[i + j * m + b * m * n] += (T2)(Parameter_0_0_host_op[p * m + i + b * m * k] *
Parameter_1_0_host_op[j * k + p + b * k * n]);
(*Result_cpu)[i + j * m + b * m * n] *= alpha;
}
}
Expand Down Expand Up @@ -444,7 +444,7 @@ int CublasFunction::check_result(int batch_count, T1 *Result_3_0, T2 *Result_cpu
// |<x, y>_cpu - <x,y>_gpu|/|<x, y>_cpu|/dot_length < eps
int error_count = 0;
for (int i = 0; i < static_cast<int>(m * n) * batch_count; i++) {
double abs_err = fabs(Result_cpu[i] - Result_3_0_host[i]);
double abs_err = fabs(Result_cpu[i] - (T2)(Result_3_0_host[i]));
double dot_length = k;
double abs_val = fabs(Result_cpu[i]);
double rel_err = abs_err / abs_val / dot_length;
Expand Down
Loading

0 comments on commit 81e060d

Please sign in to comment.