From 711b9ef33682a0f09fc7f10c3e7a71bcfec6cb81 Mon Sep 17 00:00:00 2001
From: Yuting Jiang
Date: Thu, 26 Sep 2024 00:19:51 +0800
Subject: [PATCH] add configurations for NDv5 and AMD MI300

---
 superbench/config/amd_mi300.yaml  | 235 +++++++++++++++++++++++
 superbench/config/azure_ndv5.yaml | 312 ++++++++++++++++++++++++++++++
 2 files changed, 547 insertions(+)
 create mode 100644 superbench/config/amd_mi300.yaml
 create mode 100644 superbench/config/azure_ndv5.yaml

diff --git a/superbench/config/amd_mi300.yaml b/superbench/config/amd_mi300.yaml
new file mode 100644
index 000000000..b7aefba63
--- /dev/null
+++ b/superbench/config/amd_mi300.yaml
@@ -0,0 +1,235 @@
+# SuperBench Config
+# Benchmark settings for AMD MI300 nodes. The default local mode launches 8
+# processes, each prefixed with HIP_VISIBLE_DEVICES={proc_rank}.
+version: v0.11
+superbench:
+  enable: null
+  var:
+    # Reusable templates referenced by the benchmarks below through YAML aliases.
+    default_local_mode: &default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank}
+          parallel: yes
+    default_pytorch_mode: &default_pytorch_mode
+      enable: true
+      modes:
+        - name: torch.distributed
+          proc_num: 8
+          node_num: 1
+      frameworks:
+        - pytorch
+    common_model_config: &common_model_config
+      model_ddp_parameter: &model_ddp_param
+        duration: 0
+        num_warmup: 128
+        num_steps: 512
+        sample_count: 8192
+        batch_size: 128
+        precision: [float32, float16]
+        model_action: [train]
+        pin_memory: yes
+        num_workers: 0
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+      parameters:
+        m: 7680
+        n: 8192
+        k: 8192
+    hipblaslt-gemm:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank}
+          parallel: yes
+      parameters:
+        in_types: ["fp32", "fp16", "bf16", 'fp8']
+        tolerant_fail: yes
+        num_warmup: 100
+        num_steps: 1000
+        shapes:
+          - 4096,4096,4096
+          - 8192,8192,8192
+          - 16384,16384,16384
+    rccl-bw:
+      enable: true
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+          mca:
+            pml: ob1
+            btl: ^openib
+            btl_tcp_if_exclude: lo,docker0
+            coll_hcoll_enable: 0
+      parameters:
+        maxbytes: 16G
+        ngpus: 1
+        operation: allreduce
+    cpu-memory-bw-latency:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        tests:
+          - bandwidth_matrix
+          - latency_matrix
+          - max_bandwidth
+    mem-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
+          parallel: no
+    ib-loopback:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 16
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
+          parallel: no
+      parameters:
+        msg_size: 8388608
+    disk-benchmark:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        block_devices: []
+    gpu-copy-bw:correctness:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
+        copy_type: [sm, dma]
+        size: 4096
+        num_warm_up: 0
+        num_loops: 1
+        check_data: true
+    gpu-copy-bw:perf:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
+        copy_type: [sm, dma]
+    ib-traffic:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            btl: tcp,self
+            pml: ob1
+            btl_tcp_if_include: ens17f0
+    gpcnet-network-test:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            pml: ucx
+            btl: ^uct
+            btl_tcp_if_include: ens17f0
+    tcp-connectivity:
+      enable: false
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        port: 22
+    dist-inference:
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+          mca:
+            pml: ob1
+            btl: ^openib
+            btl_tcp_if_exclude: lo,docker0
+            coll_hcoll_enable: 0
+      frameworks:
+        - pytorch
+      parameters:
+        num_layers: 50
+        num_warmup: 20
+        num_steps: 100
+        use_cuda_graph: true
+        precision: float16
+        hidden_size: 128
+        input_size: 128
+        batch_size: 1024
+    model-benchmarks:gpt:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - gpt2-small
+        - gpt2-large
+      parameters:
+        <<: *model_ddp_param
+        precision: [float32, float16, fp8_hybrid]
+        batch_size: 32
+        seq_len: 224
+    model-benchmarks:bert:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - bert-base
+        - bert-large
+      parameters:
+        <<: *model_ddp_param
+        precision: [float32, float16, fp8_hybrid]
+        seq_len: 224
+    model-benchmarks:lstm:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - lstm
+      parameters:
+        <<: *model_ddp_param
+        batch_size: 1024
+        input_size: 224
+        hidden_size: 1000
+        seq_len: 32
+    model-benchmarks:resnet:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - resnet50
+        - resnet101
+        - resnet152
+      parameters:
+        <<: *model_ddp_param
+        batch_size: 384
+    model-benchmarks:densenet:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - densenet169
+        - densenet201
+      parameters:
+        <<: *model_ddp_param
+    model-benchmarks:vgg:
+      enable: true
+      <<: *default_pytorch_mode
+      models:
+        - vgg11
+        - vgg13
+        - vgg16
+        - vgg19
+      parameters:
+        <<: *model_ddp_param
diff --git a/superbench/config/azure_ndv5.yaml b/superbench/config/azure_ndv5.yaml
new file mode 100644
index 000000000..d4e030b4c
--- /dev/null
+++ b/superbench/config/azure_ndv5.yaml
@@ -0,0 +1,312 @@
+# SuperBench Config
+# Benchmark settings for Azure NDv5 nodes. The default local mode launches 8
+# processes, each prefixed with CUDA_VISIBLE_DEVICES={proc_rank}.
+version: v0.11
+superbench:
+  enable: null
+  monitor:
+    enable: true
+    sample_duration: 1
+    sample_interval: 10
+  var:
+    # Reusable values and templates referenced by the benchmarks below.
+    # default_timeout is the anchor for the *default_timeout alias used by
+    # ib-loopback; an alias without an anchor fails to load. 600 mirrors the
+    # explicit dist-inference timeout in this file -- TODO confirm the value.
+    default_timeout: &default_timeout 600
+    default_local_mode: &default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
+          parallel: yes
+    default_pytorch_mode: &default_pytorch_mode
+      enable: true
+      modes:
+        - name: torch.distributed
+          proc_num: 8
+          node_num: 1
+      frameworks:
+        - pytorch
+    common_model_config: &common_model_config
+      duration: 0
+      num_warmup: 128
+      num_steps: 512
+      sample_count: 8192
+      batch_size: 128
+      precision: [float32, float16]
+      model_action: [train]
+      pin_memory: yes
+      num_workers: 0
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+      parameters:
+        precision: ["fp64", "fp32", "fp16", "fp64_tc","tf32_tc", "bf16_tc", "fp16_tc", "int8_tc"]
+    cublaslt-gemm:
+      <<: *default_local_mode
+      parameters:
+        in_types: ['fp8e4m3', 'fp8e5m2', 'fp64', 'fp32', 'fp16', 'bf16', 'int8']
+        shapes:
+          - 4096,4096,4096
+          - 8192,8192,8192
+          - 16384,16384,16384
+    gpu-burn:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        time: 900
+        doubles: true
+        tensor_core: true
+    nccl-bw:default:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        ngpus: 8
+    nccl-bw:gdr-only:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+          env:
+            NCCL_IB_PCI_RELAXED_ORDERING: '1'
+            NCCL_NET_GDR_LEVEL: '5'
+            NCCL_P2P_DISABLE: '1'
+            NCCL_SHM_DISABLE: '1'
+            NCCL_MIN_NCHANNELS: '16'
+            NCCL_IB_DISABLE: '0'
+      parameters:
+        ngpus: 8
+    nccl-lat:default:
+      enable: true
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+      parameters:
+        maxbytes: 16M
+        warmup_iters: 20
+        iters: 1000
+        graph_iters: 1
+    ib-loopback:
+      timeout: *default_timeout
+      modes:
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=0,0,1,1
+          parallel: yes
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=0,0,1,1
+          parallel: yes
+    cpu-memory-bw-latency:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        tests:
+          - bandwidth_matrix
+          - latency_matrix
+          - max_bandwidth
+    mem-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
+          parallel: no
+    disk-benchmark:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        block_devices:
+          - /dev/nvme0n1
+          - /dev/nvme1n1
+          - /dev/nvme2n1
+          - /dev/nvme3n1
+          - /dev/nvme4n1
+          - /dev/nvme5n1
+          - /dev/nvme6n1
+          - /dev/nvme7n1
+        seq_read_runtime: 60
+        seq_write_runtime: 60
+        seq_readwrite_runtime: 60
+        rand_read_runtime: 60
+        rand_write_runtime: 60
+        rand_readwrite_runtime: 60
+    gpu-copy-bw:correctness:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
+        copy_type: [sm, dma]
+        size: 4096
+        num_warm_up: 0
+        num_loops: 1
+        check_data: true
+    gpu-copy-bw:perf:
+      enable: true
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
+        copy_type: [sm, dma]
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    dist-inference:
+      enable: true
+      timeout: 600
+      modes:
+        - name: mpi
+          proc_num: 8
+          node_num: 1
+          env:
+            NCCL_TOPO_FILE: '/opt/microsoft/ndv5-topo.xml'
+      frameworks:
+        - pytorch
+      parameters:
+        num_layers: 50
+        num_warmup: 20
+        num_steps: 100
+        use_cuda_graph: true
+        precision: float16
+        hidden_size: 128
+        input_size: 128
+        batch_size: 1024
+    ib-traffic:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 8
+      parameters:
+        msg_size: 8388608
+        ib_dev: mlx5_$LOCAL_RANK
+        gpu_dev: $LOCAL_RANK
+        numa_dev: $((LOCAL_RANK/2))
+    gpcnet-network-test:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            pml: ucx
+            btl: ^uct
+            btl_tcp_if_include: eth0
+    gpcnet-network-load-test:
+      enable: false
+      modes:
+        - name: mpi
+          proc_num: 1
+          mca:
+            pml: ucx
+            btl: ^uct
+            btl_tcp_if_include: eth0
+    tcp-connectivity:
+      enable: false
+      modes:
+        - name: local
+          parallel: no
+      parameters:
+        port: 22
+    ort-inference:
+      <<: *default_local_mode
+    tensorrt-inference:
+      <<: *default_local_mode
+      parameters:
+        pytorch_models:
+          - resnet50
+          - resnet101
+          - resnet152
+          - densenet169
+          - densenet201
+          - bert-base
+          - bert-large
+        seq_length: 224
+        batch_size: 32
+        precision: int8
+    model-benchmarks:gpt:
+      <<: *default_pytorch_mode
+      models:
+        - gpt2-small
+        - gpt2-large
+      parameters:
+        <<: *common_model_config
+        precision: [float32, float16, fp8_hybrid]
+        batch_size: 32
+        seq_len: 224
+    model-benchmarks:bert:
+      <<: *default_pytorch_mode
+      models:
+        - bert-base
+        - bert-large
+      parameters:
+        <<: *common_model_config
+        precision: [float32, float16, fp8_hybrid]
+        seq_len: 224
+    model-benchmarks:lstm:
+      <<: *default_pytorch_mode
+      models:
+        - lstm
+      parameters:
+        <<: *common_model_config
+        batch_size: 1024
+        input_size: 224
+        hidden_size: 1000
+        seq_len: 32
+        pin_memory: no
+    model-benchmarks:resnet:
+      <<: *default_pytorch_mode
+      models:
+        - resnet50
+        - resnet101
+        - resnet152
+      parameters:
+        <<: *common_model_config
+        batch_size: 384
+        num_steps: 512
+    model-benchmarks:densenet:
+      <<: *default_pytorch_mode
+      models:
+        - densenet169
+        - densenet201
+      parameters:
+        <<: *common_model_config
+        pin_memory: no
+    model-benchmarks:vgg:
+      <<: *default_pytorch_mode
+      models:
+        - vgg11
+        - vgg13
+        - vgg16
+        - vgg19
+      parameters:
+        <<: *common_model_config
+        pin_memory: no