Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmarks - Add configurations for NDv5 and AMD MI300 #652

Merged
merged 4 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ jobs:
else
echo "No Docker images found with the specified references."
fi
sudo docker ps -q | grep build | xargs -r sudo docker stop
echo y | sudo docker system prune -a --volumes
df -h
- name: Prepare metadata
id: metadata
Expand Down
5 changes: 5 additions & 0 deletions dockerfile/rocm6.0.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,11 @@ RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASL
RUN cd third_party/Megatron/Megatron-DeepSpeed && \
git apply ../megatron_deepspeed_rocm6.patch

# Install AMD SMI Python Library
RUN apt install amd-smi-lib -y && \
cd /opt/rocm/share/amd_smi && \
python3 -m pip install .

ADD . .
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def run(self):
'onnxruntime-gpu; python_version>="3.10"',
],
'nvidia': ['py3nvml>=0.2.6'],
'amd': ['pyrsmi>=1.0.1'],
'amd': ['amdsmi'],
}
),
include_package_data=True,
Expand Down
232 changes: 232 additions & 0 deletions superbench/config/amd_mi300.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
# SuperBench Config
version: v0.11
superbench:
enable: null
var:
default_local_mode: &default_local_mode
enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank}
parallel: yes
default_pytorch_mode: &default_pytorch_mode
enable: true
modes:
- name: torch.distributed
proc_num: 8
node_num: 1
frameworks:
- pytorch
common_model_config: &common_model_config
model_ddp_parameter: &model_ddp_param
duration: 0
num_warmup: 128
num_steps: 512
sample_count: 8192
batch_size: 128
precision: [float32, float16]
model_action: [train]
pin_memory: yes
num_workers: 0
benchmarks:
kernel-launch:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
parameters:
m: 7680
n: 8192
k: 8192
hipblaslt-gemm:
enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank}
parallel: yes
parameters:
in_types: ["fp32", "fp16", "bf16", 'fp8']
tolerant_fail: yes
num_warmup: 100
num_steps: 1000
shapes:
- 4096,4096,4096
- 8192,8192,8192
- 16384,16384,16384
rccl-bw:
enable: true
modes:
- name: mpi
proc_num: 8
node_num: 1
mca:
pml: ob1
btl: ^openib
btl_tcp_if_exclude: lo,docker0
coll_hcoll_enable: 0
parameters:
maxbytes: 16G
ngpus: 1
operation: allreduce
cpu-memory-bw-latency:
enable: false
modes:
- name: local
proc_num: 1
parallel: no
parameters:
tests:
- bandwidth_matrix
- latency_matrix
- max_bandwidth
mem-bw:
enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
parallel: no
ib-loopback:
enable: true
modes:
- name: local
proc_num: 16
prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
parallel: no
parameters:
msg_size: 8388608
disk-benchmark:
enable: false
modes:
- name: local
proc_num: 1
parallel: no
parameters:
block_devices: []
gpu-copy-bw:correctness:
enable: true
modes:
- name: local
parallel: no
parameters:
mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
copy_type: [sm, dma]
size: 4096
num_warm_up: 0
num_loops: 1
check_data: true
gpu-copy-bw:perf:
enable: true
modes:
- name: local
parallel: no
parameters:
mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
copy_type: [sm, dma]
ib-traffic:
enable: false
modes:
- name: mpi
proc_num: 1
mca:
btl: tcp,self
pml: ob1
btl_tcp_if_include: ens17f0
gpcnet-network-test:
enable: false
modes:
- name: mpi
proc_num: 1
mca:
pml: ucx
btl: ^uct
btl_tcp_if_include: ens17f0
tcp-connectivity:
enable: false
modes:
- name: local
parallel: no
parameters:
port: 22
dist-inference:
modes:
- name: mpi
proc_num: 8
node_num: 1
mca:
pml: ob1
btl: ^openib
btl_tcp_if_exclude: lo,docker0
coll_hcoll_enable: 0
frameworks:
- pytorch
parameters:
num_layers: 50
num_warmup: 20
num_steps: 100
use_cuda_graph: true
precision: float16
hidden_size: 128
input_size: 128
batch_size: 1024
model-benchmarks:gpt:
enable: true
<<: *default_pytorch_mode
models:
- gpt2-small
- gpt2-large
parameters:
<<: *model_ddp_param
precision: [float32, float16, fp8_hybrid]
batch_size: 32
seq_len: 224
model-benchmarks:bert:
enable: true
<<: *default_pytorch_mode
models:
- bert-base
- bert-large
parameters:
<<: *model_ddp_param
precision: [float32, float16, fp8_hybrid]
seq_len: 224
model-benchmarks:lstm:
enable: true
<<: *default_pytorch_mode
models:
- lstm
parameters:
<<: *model_ddp_param
batch_size: 1024
input_size: 224
hidden_size: 1000
seq_len: 32
model-benchmarks:resnet:
enable: true
<<: *default_pytorch_mode
models:
- resnet50
- resnet101
- resnet152
parameters:
<<: *model_ddp_param
batch_size: 384
model-benchmarks:densenet:
enable: true
<<: *default_pytorch_mode
models:
- densenet169
- densenet201
parameters:
<<: *model_ddp_param
model-benchmarks:vgg:
enable: true
<<: *default_pytorch_mode
models:
- vgg11
- vgg13
- vgg16
- vgg19
parameters:
<<: *model_ddp_param
Loading
Loading