From b0d1da04dc98245dc30052042fb8560dacc5b2a3 Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 20 Sep 2024 06:22:42 +0000 Subject: [PATCH 1/7] add release node for 0.11 --- README.md | 2 +- dockerfile/rocm5.7.x.dockerfile | 3 +- dockerfile/rocm6.0.x.dockerfile | 5 +++ docs/getting-started/installation.mdx | 2 +- docs/getting-started/run-superbench.md | 2 +- docs/superbench-config.mdx | 2 +- docs/user-tutorial/container-images.mdx | 6 +-- docs/user-tutorial/data-diagnosis.md | 2 +- docs/user-tutorial/result-summary.md | 2 +- superbench/__init__.py | 2 +- superbench/config/amd_mi100_hpe.yaml | 2 +- superbench/config/amd_mi100_z53.yaml | 2 +- .../inference/standard_nc64as_t4_v3.yaml | 2 +- .../inference/standard_nc96ads_a100_v4.yaml | 2 +- .../inference/standard_nv18ads_a10_v5.yaml | 2 +- superbench/config/azure_ndmv4.yaml | 2 +- superbench/config/azure_ndv4.yaml | 2 +- superbench/config/default.yaml | 2 +- website/blog/2024-09-20-release-0-11.md | 40 +++++++++++++++++++ website/docusaurus.config.js | 2 +- website/package-lock.json | 8 ++-- website/package.json | 4 +- 22 files changed, 72 insertions(+), 26 deletions(-) create mode 100644 website/blog/2024-09-20-release-0-11.md diff --git a/README.md b/README.md index 7706fc472..8d4e0a8f1 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ __SuperBench__ is a validation and profiling tool for AI infrastructure. -📢 [v0.10.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.10.0) has been released! +📢 [v0.11.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.11.0) has been released! 
## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._ diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile index 85ba1919e..2dac98fc7 100644 --- a/dockerfile/rocm5.7.x.dockerfile +++ b/dockerfile/rocm5.7.x.dockerfile @@ -143,7 +143,8 @@ RUN cd /opt/ && \ make -j${NUM_MAKE_JOBS} # Install AMD SMI Python Library -RUN cd /opt/rocm/share/amd_smi && \ +RUN apt install amd-smi-lib && \ + cd /opt/rocm/share/amd_smi && \ python3 -m pip install --user . ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile index bd33e289a..20a5e0a0a 100644 --- a/dockerfile/rocm6.0.x.dockerfile +++ b/dockerfile/rocm6.0.x.dockerfile @@ -148,6 +148,11 @@ RUN cd /opt/ && \ .. && \ make -j${NUM_MAKE_JOBS} +# Install AMD SMI Python Library +RUN apt install amd-smi-lib && \ + cd /opt/rocm/share/amd_smi && \ + python3 -m pip install --user . + ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx index cf48c4caa..30fdee829 100644 --- a/docs/getting-started/installation.mdx +++ b/docs/getting-started/installation.mdx @@ -61,7 +61,7 @@ You can clone the source from GitHub and build it. 
:::note Note You should checkout corresponding tag to use release version, for example, -`git clone -b v0.10.0 https://github.com/microsoft/superbenchmark` +`git clone -b v0.11.0 https://github.com/microsoft/superbenchmark` ::: ```bash diff --git a/docs/getting-started/run-superbench.md b/docs/getting-started/run-superbench.md index a6bb3bc1e..22c24c5e6 100644 --- a/docs/getting-started/run-superbench.md +++ b/docs/getting-started/run-superbench.md @@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password] :::note Note You should deploy corresponding Docker image to use release version, for example, -`sb deploy -f local.ini -i superbench/superbench:v0.10.0-cuda12.2` +`sb deploy -f local.ini -i superbench/superbench:v0.11.0-cuda12.2` You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone. diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx index b8ad058fa..102b8d69f 100644 --- a/docs/superbench-config.mdx +++ b/docs/superbench-config.mdx @@ -70,7 +70,7 @@ superbench: ```yaml -version: v0.10 +version: v0.11 superbench: enable: benchmark_1 monitor: diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx index ffca22796..b3c692fd8 100644 --- a/docs/user-tutorial/container-images.mdx +++ b/docs/user-tutorial/container-images.mdx @@ -30,8 +30,8 @@ available tags are listed below for all stable versions. 
| Tag | Description | |--------------------|-------------------------------------| -| v0.10.0-cuda12.2 | SuperBench v0.10.0 with CUDA 12.2 | -| v0.10.0-cuda11.1.1 | SuperBench v0.10.0 with CUDA 11.1.1 | +| v0.11.0-cuda12.2 | SuperBench v0.11.0 with CUDA 12.2 | +| v0.11.0-cuda11.1.1 | SuperBench v0.11.0 with CUDA 11.1.1 | | v0.9.0-cuda12.1 | SuperBench v0.9.0 with CUDA 12.1 | | v0.9.0-cuda11.1.1 | SuperBench v0.9.0 with CUDA 11.1.1 | | v0.8.0-cuda12.1 | SuperBench v0.8.0 with CUDA 12.1 | @@ -50,7 +50,7 @@ available tags are listed below for all stable versions. | Tag | Description | |-------------------------------|--------------------------------------------------| -| v0.10.0-rocm5.7 | SuperBench v0.10.0 with ROCm 5.7 | +| v0.11.0-rocm5.7 | SuperBench v0.11.0 with ROCm 5.7 | | v0.9.0-rocm5.1.3 | SuperBench v0.9.0 with ROCm 5.1.3 | | v0.9.0-rocm5.1.1 | SuperBench v0.9.0 with ROCm 5.1.1 | | v0.9.0-rocm5.0.1 | SuperBench v0.9.0 with ROCm 5.0.1 | diff --git a/docs/user-tutorial/data-diagnosis.md b/docs/user-tutorial/data-diagnosis.md index c2f0e3369..0c1c11878 100644 --- a/docs/user-tutorial/data-diagnosis.md +++ b/docs/user-tutorial/data-diagnosis.md @@ -65,7 +65,7 @@ superbench: example: ```yaml # SuperBench rules -version: v0.10 +version: v0.11 superbench: rules: failure-rule: diff --git a/docs/user-tutorial/result-summary.md b/docs/user-tutorial/result-summary.md index dffee2514..475beffbe 100644 --- a/docs/user-tutorial/result-summary.md +++ b/docs/user-tutorial/result-summary.md @@ -58,7 +58,7 @@ superbench: ```yaml title="Example" # SuperBench rules -version: v0.10 +version: v0.11 superbench: rules: kernel_launch: diff --git a/superbench/__init__.py b/superbench/__init__.py index e1f4234fd..cbd47c45e 100644 --- a/superbench/__init__.py +++ b/superbench/__init__.py @@ -6,5 +6,5 @@ Provide hardware and software benchmarks for AI systems. 
""" -__version__ = '0.10.0' +__version__ = '0.11.0' __author__ = 'Microsoft' diff --git a/superbench/config/amd_mi100_hpe.yaml b/superbench/config/amd_mi100_hpe.yaml index 9aec785f5..0388fcda8 100644 --- a/superbench/config/amd_mi100_hpe.yaml +++ b/superbench/config/amd_mi100_hpe.yaml @@ -3,7 +3,7 @@ # Server: # - Product: HPE Apollo 6500 -version: v0.10 +version: v0.11 superbench: enable: null var: diff --git a/superbench/config/amd_mi100_z53.yaml b/superbench/config/amd_mi100_z53.yaml index 7e56d1a33..4ed8addf6 100644 --- a/superbench/config/amd_mi100_z53.yaml +++ b/superbench/config/amd_mi100_z53.yaml @@ -4,7 +4,7 @@ # - Product: G482-Z53 # - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html -version: v0.10 +version: v0.11 superbench: enable: null var: diff --git a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml index 7624a86de..b9d57bfbc 100644 --- a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml +++ b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml @@ -1,4 +1,4 @@ -version: v0.10 +version: v0.11 superbench: enable: null monitor: diff --git a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml index befcd1783..224508e0d 100644 --- a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml +++ b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml @@ -1,4 +1,4 @@ -version: v0.10 +version: v0.11 superbench: enable: null monitor: diff --git a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml index af19e0a22..e44510b27 100644 --- a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml +++ b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml @@ -1,4 +1,4 @@ -version: v0.10 +version: v0.11 superbench: enable: null monitor: diff --git 
a/superbench/config/azure_ndmv4.yaml b/superbench/config/azure_ndmv4.yaml index 3ef0c399a..7d7a8f185 100644 --- a/superbench/config/azure_ndmv4.yaml +++ b/superbench/config/azure_ndmv4.yaml @@ -3,7 +3,7 @@ # Azure NDm A100 v4 # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series -version: v0.10 +version: v0.11 superbench: enable: null monitor: diff --git a/superbench/config/azure_ndv4.yaml b/superbench/config/azure_ndv4.yaml index 921a446b8..b095d5c23 100644 --- a/superbench/config/azure_ndv4.yaml +++ b/superbench/config/azure_ndv4.yaml @@ -1,5 +1,5 @@ # SuperBench Config -version: v0.10 +version: v0.11 superbench: enable: null monitor: diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml index 9533806cd..601136e9f 100644 --- a/superbench/config/default.yaml +++ b/superbench/config/default.yaml @@ -1,5 +1,5 @@ # SuperBench Config -version: v0.10 +version: v0.11 superbench: enable: null monitor: diff --git a/website/blog/2024-09-20-release-0-11.md b/website/blog/2024-09-20-release-0-11.md new file mode 100644 index 000000000..d0fa02a4b --- /dev/null +++ b/website/blog/2024-09-20-release-0-11.md @@ -0,0 +1,40 @@ +--- +slug: release-sb-v0.11 +title: Releasing SuperBench v0.11 +author: Peng Cheng +author_title: SuperBench Team +author_url: https://github.com/cp5555 +author_image_url: https://github.com/cp5555.png +tags: [superbench, announcement, release] +--- + +We are very happy to announce that **SuperBench 0.11.0 version** is officially released today! + +You can install and try superbench by following [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation). + +## SuperBench 0.11.0 Release Notes + +### SuperBench Improvements + +- Add CUDA 12.4 dockerfile. +- Add ROCm6.2 dockerfile. +- Update hpcx link in cuda11.1 dockerfile to fix docker build failure. +- Improve document (fix typos and add BibTeX in README and repo). +- Limit protobuf version to be 3.20.x. 
+- Update omegaconf version to 2.3.0. +- Fix MSCCL build error in CUDA12.4 docker build pipeline. +- Update Docker Exec Command for Persistent HPCX Environment. +- Use types-setuptools to replace types-pkg_resources. + +### Micro-benchmark Improvements + +- Add hipblasLt tuning to dist-inference cpp implementation. +- Add support for NVIDIA L4/L40/L40s GPUs in gemm-flops. +- Upgrade mlc to v3.11. + +## Model-benchmark Improvements + +- Support FP8 transformer model training in ROCm6.2 dockerfile. + +### Result Analysis +- Fix bug of failure test and warning of pandas in data diagnosis diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index a533084e3..41ed52665 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -101,7 +101,7 @@ module.exports = { announcementBar: { id: 'supportus', content: - '📢 v0.10.0 has been released! ' + + '📢 v0.11.0 has been released! ' + '⭐️ If you like SuperBench, give it a star on GitHub! ⭐️', }, algolia: { diff --git a/website/package-lock.json b/website/package-lock.json index b4652de12..01d913769 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -1,6 +1,6 @@ { "name": "superbench-website", - "version": "0.10.0", + "version": "0.11.0", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -7467,8 +7467,8 @@ } }, "node-forge": { - "version": "0.10.0", - "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.10.0.tgz", + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.11.0.tgz", "integrity": "sha512-PPmu8eEeG9saEUvI97fm4OYxXVB6bFvyNTyiUOBichBpFG8A1Ljw3bY62+5oOjDEMHRnd0Y7HQ+x7uzxOzC6JA==" }, "node-releases": { @@ -9428,7 +9428,7 @@ "resolved": "https://registry.npmjs.org/selfsigned/-/selfsigned-1.10.11.tgz", "integrity": "sha512-aVmbPOfViZqOZPgRBT0+3u4yZFHpmnIghLMlAcb5/xhp5ZtB/RVnKhz5vl2M32CLXAqR4kha9zfhNg0Lf/sxKA==", "requires": { - "node-forge": "^0.10.0" + "node-forge": "^0.11.0" } }, "semver": { diff 
--git a/website/package.json b/website/package.json index f2bb9ed76..8aa4c78b8 100644 --- a/website/package.json +++ b/website/package.json @@ -1,6 +1,6 @@ { "name": "superbench-website", - "version": "0.10.0", + "version": "0.11.0", "private": true, "scripts": { "docusaurus": "docusaurus", @@ -38,4 +38,4 @@ "last 1 safari version" ] } -} \ No newline at end of file +} From 6f764c0506f3c88b1df6007f605d375a9feff157 Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 20 Sep 2024 06:29:01 +0000 Subject: [PATCH 2/7] recover --- dockerfile/rocm5.7.x.dockerfile | 3 +-- dockerfile/rocm6.0.x.dockerfile | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile index 2dac98fc7..85ba1919e 100644 --- a/dockerfile/rocm5.7.x.dockerfile +++ b/dockerfile/rocm5.7.x.dockerfile @@ -143,8 +143,7 @@ RUN cd /opt/ && \ make -j${NUM_MAKE_JOBS} # Install AMD SMI Python Library -RUN apt install amd-smi-lib && \ - cd /opt/rocm/share/amd_smi && \ +RUN cd /opt/rocm/share/amd_smi && \ python3 -m pip install --user . ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile index 20a5e0a0a..bd33e289a 100644 --- a/dockerfile/rocm6.0.x.dockerfile +++ b/dockerfile/rocm6.0.x.dockerfile @@ -148,11 +148,6 @@ RUN cd /opt/ && \ .. && \ make -j${NUM_MAKE_JOBS} -# Install AMD SMI Python Library -RUN apt install amd-smi-lib && \ - cd /opt/rocm/share/amd_smi && \ - python3 -m pip install --user . 
- ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ From 6ea7a651ebdaccde59338d6360870dbfde1c098e Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Sat, 21 Sep 2024 09:29:05 +0800 Subject: [PATCH 3/7] Update package-lock.json --- website/package-lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/package-lock.json b/website/package-lock.json index 01d913769..ba0c41478 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -7467,8 +7467,8 @@ } }, "node-forge": { - "version": "0.11.0", - "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.11.0.tgz", + "version": "0.10.0", + "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.10.0.tgz", "integrity": "sha512-PPmu8eEeG9saEUvI97fm4OYxXVB6bFvyNTyiUOBichBpFG8A1Ljw3bY62+5oOjDEMHRnd0Y7HQ+x7uzxOzC6JA==" }, "node-releases": { @@ -9428,7 +9428,7 @@ "resolved": "https://registry.npmjs.org/selfsigned/-/selfsigned-1.10.11.tgz", "integrity": "sha512-aVmbPOfViZqOZPgRBT0+3u4yZFHpmnIghLMlAcb5/xhp5ZtB/RVnKhz5vl2M32CLXAqR4kha9zfhNg0Lf/sxKA==", "requires": { - "node-forge": "^0.11.0" + "node-forge": "^0.10.0" } }, "semver": { From 14d3d219eb033e09c0a9fdf77b71b19186e72443 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Sun, 22 Sep 2024 13:43:36 +0800 Subject: [PATCH 4/7] Update docker image version --- docs/user-tutorial/container-images.mdx | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx index b3c692fd8..58a8079d7 100644 --- a/docs/user-tutorial/container-images.mdx +++ b/docs/user-tutorial/container-images.mdx @@ -30,8 +30,11 @@ available tags are listed below for all stable versions. 
| Tag | Description | |--------------------|-------------------------------------| +| v0.11.0-cuda12.4 | SuperBench v0.11.0 with CUDA 12.4 | | v0.11.0-cuda12.2 | SuperBench v0.11.0 with CUDA 12.2 | | v0.11.0-cuda11.1.1 | SuperBench v0.11.0 with CUDA 11.1.1 | +| v0.10.0-cuda12.2 | SuperBench v0.10.0 with CUDA 12.2 | +| v0.10.0-cuda11.1.1 | SuperBench v0.10.0 with CUDA 11.1.1 | | v0.9.0-cuda12.1 | SuperBench v0.9.0 with CUDA 12.1 | | v0.9.0-cuda11.1.1 | SuperBench v0.9.0 with CUDA 11.1.1 | | v0.8.0-cuda12.1 | SuperBench v0.8.0 with CUDA 12.1 | @@ -50,7 +53,10 @@ available tags are listed below for all stable versions. | Tag | Description | |-------------------------------|--------------------------------------------------| -| v0.11.0-rocm5.7 | SuperBench v0.11.0 with ROCm 5.7 | +| v0.11.0-rocm6.2 | SuperBench v0.11.0 with ROCm 6.2 | +| v0.11.0-rocm6.0 | SuperBench v0.11.0 with ROCm 6.0 | +| v0.10.0-rocm6.0 | SuperBench v0.10.0 with ROCm 6.0 | +| v0.10.0-rocm5.7 | SuperBench v0.10.0 with ROCm 5.7 | | v0.9.0-rocm5.1.3 | SuperBench v0.9.0 with ROCm 5.1.3 | | v0.9.0-rocm5.1.1 | SuperBench v0.9.0 with ROCm 5.1.1 | | v0.9.0-rocm5.0.1 | SuperBench v0.9.0 with ROCm 5.0.1 | From ad864281f30c5fc9b8deb7dbef7c37df5198d891 Mon Sep 17 00:00:00 2001 From: yukirora Date: Mon, 23 Sep 2024 03:26:00 +0000 Subject: [PATCH 5/7] add ndv5 and mi300 example configs --- superbench/config/amd_mi300.yaml | 232 +++++++++++++++++++++++ superbench/config/azure_ndv5.yaml | 305 ++++++++++++++++++++++++++++++ 2 files changed, 537 insertions(+) create mode 100644 superbench/config/amd_mi300.yaml create mode 100644 superbench/config/azure_ndv5.yaml diff --git a/superbench/config/amd_mi300.yaml b/superbench/config/amd_mi300.yaml new file mode 100644 index 000000000..b7aefba63 --- /dev/null +++ b/superbench/config/amd_mi300.yaml @@ -0,0 +1,232 @@ +# SuperBench Config +version: v0.11 +superbench: + enable: null + var: + default_local_mode: &default_local_mode + enable: true + modes: + - name: local + 
proc_num: 8 + prefix: HIP_VISIBLE_DEVICES={proc_rank} + parallel: yes + default_pytorch_mode: &default_pytorch_mode + enable: true + modes: + - name: torch.distributed + proc_num: 8 + node_num: 1 + frameworks: + - pytorch + common_model_config: &common_model_config + model_ddp_parameter: &model_ddp_param + duration: 0 + num_warmup: 128 + num_steps: 512 + sample_count: 8192 + batch_size: 128 + precision: [float32, float16] + model_action: [train] + pin_memory: yes + num_workers: 0 + benchmarks: + kernel-launch: + <<: *default_local_mode + gemm-flops: + <<: *default_local_mode + parameters: + m: 7680 + n: 8192 + k: 8192 + hipblaslt-gemm: + enable: true + modes: + - name: local + proc_num: 8 + prefix: HIP_VISIBLE_DEVICES={proc_rank} + parallel: yes + parameters: + in_types: ["fp32", "fp16", "bf16", 'fp8'] + tolerant_fail: yes + num_warmup: 100 + num_steps: 1000 + shapes: + - 4096,4096,4096 + - 8192,8192,8192 + - 16384,16384,16384 + rccl-bw: + enable: true + modes: + - name: mpi + proc_num: 8 + node_num: 1 + mca: + pml: ob1 + btl: ^openib + btl_tcp_if_exclude: lo,docker0 + coll_hcoll_enable: 0 + parameters: + maxbytes: 16G + ngpus: 1 + operation: allreduce + cpu-memory-bw-latency: + enable: false + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + tests: + - bandwidth_matrix + - latency_matrix + - max_bandwidth + mem-bw: + enable: true + modes: + - name: local + proc_num: 8 + prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4)) + parallel: no + ib-loopback: + enable: true + modes: + - name: local + proc_num: 16 + prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8)) + parallel: no + parameters: + msg_size: 8388608 + disk-benchmark: + enable: false + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + block_devices: [] + gpu-copy-bw:correctness: + enable: true + modes: + - name: local + parallel: no + parameters: + mem_type: [htod, dtoh, dtod, 
one_to_all, all_to_one, all_to_all] + copy_type: [sm, dma] + size: 4096 + num_warm_up: 0 + num_loops: 1 + check_data: true + gpu-copy-bw:perf: + enable: true + modes: + - name: local + parallel: no + parameters: + mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all] + copy_type: [sm, dma] + ib-traffic: + enable: false + modes: + - name: mpi + proc_num: 1 + mca: + btl: tcp,self + pml: ob1 + btl_tcp_if_include: ens17f0 + gpcnet-network-test: + enable: false + modes: + - name: mpi + proc_num: 1 + mca: + pml: ucx + btl: ^uct + btl_tcp_if_include: ens17f0 + tcp-connectivity: + enable: false + modes: + - name: local + parallel: no + parameters: + port: 22 + dist-inference: + modes: + - name: mpi + proc_num: 8 + node_num: 1 + mca: + pml: ob1 + btl: ^openib + btl_tcp_if_exclude: lo,docker0 + coll_hcoll_enable: 0 + frameworks: + - pytorch + parameters: + num_layers: 50 + num_warmup: 20 + num_steps: 100 + use_cuda_graph: true + precision: float16 + hidden_size: 128 + input_size: 128 + batch_size: 1024 + model-benchmarks:gpt: + enable: true + <<: *default_pytorch_mode + models: + - gpt2-small + - gpt2-large + parameters: + <<: *model_ddp_param + precision: [float32, float16, fp8_hybrid] + batch_size: 32 + seq_len: 224 + model-benchmarks:bert: + enable: true + <<: *default_pytorch_mode + models: + - bert-base + - bert-large + parameters: + <<: *model_ddp_param + precision: [float32, float16, fp8_hybrid] + seq_len: 224 + model-benchmarks:lstm: + enable: true + <<: *default_pytorch_mode + models: + - lstm + parameters: + <<: *model_ddp_param + batch_size: 1024 + input_size: 224 + hidden_size: 1000 + seq_len: 32 + model-benchmarks:resnet: + enable: true + <<: *default_pytorch_mode + models: + - resnet50 + - resnet101 + - resnet152 + parameters: + <<: *model_ddp_param + batch_size: 384 + model-benchmarks:densenet: + enable: true + <<: *default_pytorch_mode + models: + - densenet169 + - densenet201 + parameters: + <<: *model_ddp_param + model-benchmarks:vgg: + enable: 
true + <<: *default_pytorch_mode + models: + - vgg11 + - vgg13 + - vgg16 + - vgg19 + parameters: + <<: *model_ddp_param diff --git a/superbench/config/azure_ndv5.yaml b/superbench/config/azure_ndv5.yaml new file mode 100644 index 000000000..d4e030b4c --- /dev/null +++ b/superbench/config/azure_ndv5.yaml @@ -0,0 +1,305 @@ +# SuperBench Config +version: v0.11 +superbench: + enable: + monitor: + enable: true + sample_duration: 1 + sample_interval: 10 + var: + default_local_mode: &default_local_mode + enable: true + modes: + - name: local + proc_num: 8 + prefix: CUDA_VISIBLE_DEVICES={proc_rank} + parallel: yes + default_pytorch_mode: &default_pytorch_mode + enable: true + modes: + - name: torch.distributed + proc_num: 8 + node_num: 1 + frameworks: + - pytorch + common_model_config: &common_model_config + duration: 0 + num_warmup: 128 + num_steps: 512 + sample_count: 8192 + batch_size: 128 + precision: [float32, float16] + model_action: [train] + pin_memory: yes + num_workers: 0 + benchmarks: + kernel-launch: + <<: *default_local_mode + gemm-flops: + <<: *default_local_mode + parameters: + precision: ["fp64", "fp32", "fp16", "fp64_tc","tf32_tc", "bf16_tc", "fp16_tc", "int8_tc"] + cublaslt-gemm: + <<: *default_local_mode + parameters: + in_types: ['fp8e4m3', 'fp8e5m2', 'fp64', 'fp32', 'fp16', 'bf16', 'int8'] + shapes: + - 4096,4096,4096 + - 8192,8192,8192 + - 16384,16384,16384 + gpu-burn: + enable: false + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + time: 900 + doubles: true + tensor_core: true + nccl-bw:default: + enable: true + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + ngpus: 8 + nccl-bw:gdr-only: + enable: true + modes: + - name: local + proc_num: 1 + parallel: no + env: + NCCL_IB_PCI_RELAXED_ORDERING: '1' + NCCL_NET_GDR_LEVEL: '5' + NCCL_P2P_DISABLE: '1' + NCCL_SHM_DISABLE: '1' + NCCL_MIN_NCHANNELS: '16' + NCCL_IB_DISABLE: '0' + parameters: + ngpus: 8 + nccl-lat:default: + enable: true + modes: + - name: mpi + 
proc_num: 8 + node_num: 1 + parameters: + maxbytes: 16M + warmup_iters: 20 + iters: 1000 + graph_iters: 1 + ib-loopback: + timeout: *default_timeout + modes: + - name: local + proc_num: 4 + prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=0,0,1,1 + parallel: yes + - name: local + proc_num: 4 + prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=0,0,1,1 + parallel: yes + cpu-memory-bw-latency: + enable: false + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + tests: + - bandwidth_matrix + - latency_matrix + - max_bandwidth + mem-bw: + enable: true + modes: + - name: local + proc_num: 8 + prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) + parallel: no + disk-benchmark: + enable: false + modes: + - name: local + proc_num: 1 + parallel: no + parameters: + block_devices: + - /dev/nvme0n1 + - /dev/nvme1n1 + - /dev/nvme2n1 + - /dev/nvme3n1 + - /dev/nvme4n1 + - /dev/nvme5n1 + - /dev/nvme6n1 + - /dev/nvme7n1 + seq_read_runtime: 60 + seq_write_runtime: 60 + seq_readwrite_runtime: 60 + rand_read_runtime: 60 + rand_write_runtime: 60 + rand_readwrite_runtime: 60 + gpu-copy-bw:correctness: + enable: true + modes: + - name: local + parallel: no + parameters: + mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all] + copy_type: [sm, dma] + size: 4096 + num_warm_up: 0 + num_loops: 1 + check_data: true + gpu-copy-bw:perf: + enable: true + modes: + - name: local + parallel: no + parameters: + mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all] + copy_type: [sm, dma] + cudnn-function: + <<: *default_local_mode + cublas-function: + <<: *default_local_mode + matmul: + <<: *default_local_mode + frameworks: + - pytorch + sharding-matmul: + <<: *default_pytorch_mode + computation-communication-overlap: + <<: *default_pytorch_mode + dist-inference: + enable: true + timeout: 600 + modes: + - name: mpi + proc_num: 8 + node_num: 1 + env: + NCCL_TOPO_FILE: '/opt/microsoft/ndv5-topo.xml' + frameworks: + - pytorch 
+ parameters: + num_layers: 50 + num_warmup: 20 + num_steps: 100 + use_cuda_graph: true + precision: float16 + hidden_size: 128 + input_size: 128 + batch_size: 1024 + ib-traffic: + enable: false + modes: + - name: mpi + proc_num: 8 + parameters: + msg_size: 8388608 + ib_dev: mlx5_$LOCAL_RANK + gpu_dev: $LOCAL_RANK + numa_dev: $((LOCAL_RANK/2)) + gpcnet-network-test: + enable: false + modes: + - name: mpi + proc_num: 1 + mca: + pml: ucx + btl: ^uct + btl_tcp_if_include: eth0 + gpcnet-network-load-test: + enable: false + modes: + - name: mpi + proc_num: 1 + mca: + pml: ucx + btl: ^uct + btl_tcp_if_include: eth0 + tcp-connectivity: + enable: false + modes: + - name: local + parallel: no + parameters: + port: 22 + ort-inference: + <<: *default_local_mode + tensorrt-inference: + <<: *default_local_mode + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 32 + precision: int8 + model-benchmarks:gpt: + <<: *default_pytorch_mode + models: + - gpt2-small + - gpt2-large + parameters: + <<: *common_model_config + precision: [float32, float16, fp8_hybrid] + batch_size: 32 + seq_len: 224 + model-benchmarks:bert: + <<: *default_pytorch_mode + models: + - bert-base + - bert-large + parameters: + <<: *common_model_config + precision: [float32, float16, fp8_hybrid] + seq_len: 224 + model-benchmarks:lstm: + <<: *default_pytorch_mode + models: + - lstm + parameters: + <<: *common_model_config + batch_size: 1024 + input_size: 224 + hidden_size: 1000 + seq_len: 32 + pin_memory: no + model-benchmarks:resnet: + <<: *default_pytorch_mode + models: + - resnet50 + - resnet101 + - resnet152 + parameters: + <<: *common_model_config + batch_size: 384 + num_steps: 512 + model-benchmarks:densenet: + <<: *default_pytorch_mode + models: + - densenet169 + - densenet201 + parameters: + <<: *common_model_config + pin_memory: no + model-benchmarks:vgg: + <<: *default_pytorch_mode + models: 
+ - vgg11 + - vgg13 + - vgg16 + - vgg19 + parameters: + <<: *common_model_config + pin_memory: no From 83a94f4dc92f0c344b6cb5e7d5f70a705ec2e8d6 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 26 Sep 2024 00:19:55 +0800 Subject: [PATCH 6/7] update according to comments --- superbench/config/amd_mi300.yaml | 232 ------------------ superbench/config/azure_ndv5.yaml | 305 ------------------------ website/blog/2024-09-20-release-0-11.md | 12 +- 3 files changed, 8 insertions(+), 541 deletions(-) delete mode 100644 superbench/config/amd_mi300.yaml delete mode 100644 superbench/config/azure_ndv5.yaml diff --git a/superbench/config/amd_mi300.yaml b/superbench/config/amd_mi300.yaml deleted file mode 100644 index b7aefba63..000000000 --- a/superbench/config/amd_mi300.yaml +++ /dev/null @@ -1,232 +0,0 @@ -# SuperBench Config -version: v0.11 -superbench: - enable: null - var: - default_local_mode: &default_local_mode - enable: true - modes: - - name: local - proc_num: 8 - prefix: HIP_VISIBLE_DEVICES={proc_rank} - parallel: yes - default_pytorch_mode: &default_pytorch_mode - enable: true - modes: - - name: torch.distributed - proc_num: 8 - node_num: 1 - frameworks: - - pytorch - common_model_config: &common_model_config - model_ddp_parameter: &model_ddp_param - duration: 0 - num_warmup: 128 - num_steps: 512 - sample_count: 8192 - batch_size: 128 - precision: [float32, float16] - model_action: [train] - pin_memory: yes - num_workers: 0 - benchmarks: - kernel-launch: - <<: *default_local_mode - gemm-flops: - <<: *default_local_mode - parameters: - m: 7680 - n: 8192 - k: 8192 - hipblaslt-gemm: - enable: true - modes: - - name: local - proc_num: 8 - prefix: HIP_VISIBLE_DEVICES={proc_rank} - parallel: yes - parameters: - in_types: ["fp32", "fp16", "bf16", 'fp8'] - tolerant_fail: yes - num_warmup: 100 - num_steps: 1000 - shapes: - - 4096,4096,4096 - - 8192,8192,8192 - - 16384,16384,16384 - rccl-bw: - enable: true - modes: - - name: mpi - proc_num: 8 - node_num: 1 - mca: - pml: 
ob1 - btl: ^openib - btl_tcp_if_exclude: lo,docker0 - coll_hcoll_enable: 0 - parameters: - maxbytes: 16G - ngpus: 1 - operation: allreduce - cpu-memory-bw-latency: - enable: false - modes: - - name: local - proc_num: 1 - parallel: no - parameters: - tests: - - bandwidth_matrix - - latency_matrix - - max_bandwidth - mem-bw: - enable: true - modes: - - name: local - proc_num: 8 - prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4)) - parallel: no - ib-loopback: - enable: true - modes: - - name: local - proc_num: 16 - prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8)) - parallel: no - parameters: - msg_size: 8388608 - disk-benchmark: - enable: false - modes: - - name: local - proc_num: 1 - parallel: no - parameters: - block_devices: [] - gpu-copy-bw:correctness: - enable: true - modes: - - name: local - parallel: no - parameters: - mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all] - copy_type: [sm, dma] - size: 4096 - num_warm_up: 0 - num_loops: 1 - check_data: true - gpu-copy-bw:perf: - enable: true - modes: - - name: local - parallel: no - parameters: - mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all] - copy_type: [sm, dma] - ib-traffic: - enable: false - modes: - - name: mpi - proc_num: 1 - mca: - btl: tcp,self - pml: ob1 - btl_tcp_if_include: ens17f0 - gpcnet-network-test: - enable: false - modes: - - name: mpi - proc_num: 1 - mca: - pml: ucx - btl: ^uct - btl_tcp_if_include: ens17f0 - tcp-connectivity: - enable: false - modes: - - name: local - parallel: no - parameters: - port: 22 - dist-inference: - modes: - - name: mpi - proc_num: 8 - node_num: 1 - mca: - pml: ob1 - btl: ^openib - btl_tcp_if_exclude: lo,docker0 - coll_hcoll_enable: 0 - frameworks: - - pytorch - parameters: - num_layers: 50 - num_warmup: 20 - num_steps: 100 - use_cuda_graph: true - precision: float16 - hidden_size: 128 - input_size: 128 - batch_size: 1024 - 
model-benchmarks:gpt: - enable: true - <<: *default_pytorch_mode - models: - - gpt2-small - - gpt2-large - parameters: - <<: *model_ddp_param - precision: [float32, float16, fp8_hybrid] - batch_size: 32 - seq_len: 224 - model-benchmarks:bert: - enable: true - <<: *default_pytorch_mode - models: - - bert-base - - bert-large - parameters: - <<: *model_ddp_param - precision: [float32, float16, fp8_hybrid] - seq_len: 224 - model-benchmarks:lstm: - enable: true - <<: *default_pytorch_mode - models: - - lstm - parameters: - <<: *model_ddp_param - batch_size: 1024 - input_size: 224 - hidden_size: 1000 - seq_len: 32 - model-benchmarks:resnet: - enable: true - <<: *default_pytorch_mode - models: - - resnet50 - - resnet101 - - resnet152 - parameters: - <<: *model_ddp_param - batch_size: 384 - model-benchmarks:densenet: - enable: true - <<: *default_pytorch_mode - models: - - densenet169 - - densenet201 - parameters: - <<: *model_ddp_param - model-benchmarks:vgg: - enable: true - <<: *default_pytorch_mode - models: - - vgg11 - - vgg13 - - vgg16 - - vgg19 - parameters: - <<: *model_ddp_param diff --git a/superbench/config/azure_ndv5.yaml b/superbench/config/azure_ndv5.yaml deleted file mode 100644 index d4e030b4c..000000000 --- a/superbench/config/azure_ndv5.yaml +++ /dev/null @@ -1,305 +0,0 @@ -# SuperBench Config -version: v0.11 -superbench: - enable: - monitor: - enable: true - sample_duration: 1 - sample_interval: 10 - var: - default_local_mode: &default_local_mode - enable: true - modes: - - name: local - proc_num: 8 - prefix: CUDA_VISIBLE_DEVICES={proc_rank} - parallel: yes - default_pytorch_mode: &default_pytorch_mode - enable: true - modes: - - name: torch.distributed - proc_num: 8 - node_num: 1 - frameworks: - - pytorch - common_model_config: &common_model_config - duration: 0 - num_warmup: 128 - num_steps: 512 - sample_count: 8192 - batch_size: 128 - precision: [float32, float16] - model_action: [train] - pin_memory: yes - num_workers: 0 - benchmarks: - 
kernel-launch: - <<: *default_local_mode - gemm-flops: - <<: *default_local_mode - parameters: - precision: ["fp64", "fp32", "fp16", "fp64_tc","tf32_tc", "bf16_tc", "fp16_tc", "int8_tc"] - cublaslt-gemm: - <<: *default_local_mode - parameters: - in_types: ['fp8e4m3', 'fp8e5m2', 'fp64', 'fp32', 'fp16', 'bf16', 'int8'] - shapes: - - 4096,4096,4096 - - 8192,8192,8192 - - 16384,16384,16384 - gpu-burn: - enable: false - modes: - - name: local - proc_num: 1 - parallel: no - parameters: - time: 900 - doubles: true - tensor_core: true - nccl-bw:default: - enable: true - modes: - - name: local - proc_num: 1 - parallel: no - parameters: - ngpus: 8 - nccl-bw:gdr-only: - enable: true - modes: - - name: local - proc_num: 1 - parallel: no - env: - NCCL_IB_PCI_RELAXED_ORDERING: '1' - NCCL_NET_GDR_LEVEL: '5' - NCCL_P2P_DISABLE: '1' - NCCL_SHM_DISABLE: '1' - NCCL_MIN_NCHANNELS: '16' - NCCL_IB_DISABLE: '0' - parameters: - ngpus: 8 - nccl-lat:default: - enable: true - modes: - - name: mpi - proc_num: 8 - node_num: 1 - parameters: - maxbytes: 16M - warmup_iters: 20 - iters: 1000 - graph_iters: 1 - ib-loopback: - timeout: *default_timeout - modes: - - name: local - proc_num: 4 - prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=0,0,1,1 - parallel: yes - - name: local - proc_num: 4 - prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=0,0,1,1 - parallel: yes - cpu-memory-bw-latency: - enable: false - modes: - - name: local - proc_num: 1 - parallel: no - parameters: - tests: - - bandwidth_matrix - - latency_matrix - - max_bandwidth - mem-bw: - enable: true - modes: - - name: local - proc_num: 8 - prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) - parallel: no - disk-benchmark: - enable: false - modes: - - name: local - proc_num: 1 - parallel: no - parameters: - block_devices: - - /dev/nvme0n1 - - /dev/nvme1n1 - - /dev/nvme2n1 - - /dev/nvme3n1 - - /dev/nvme4n1 - - /dev/nvme5n1 - - /dev/nvme6n1 - - /dev/nvme7n1 - seq_read_runtime: 60 - 
seq_write_runtime: 60 - seq_readwrite_runtime: 60 - rand_read_runtime: 60 - rand_write_runtime: 60 - rand_readwrite_runtime: 60 - gpu-copy-bw:correctness: - enable: true - modes: - - name: local - parallel: no - parameters: - mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all] - copy_type: [sm, dma] - size: 4096 - num_warm_up: 0 - num_loops: 1 - check_data: true - gpu-copy-bw:perf: - enable: true - modes: - - name: local - parallel: no - parameters: - mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all] - copy_type: [sm, dma] - cudnn-function: - <<: *default_local_mode - cublas-function: - <<: *default_local_mode - matmul: - <<: *default_local_mode - frameworks: - - pytorch - sharding-matmul: - <<: *default_pytorch_mode - computation-communication-overlap: - <<: *default_pytorch_mode - dist-inference: - enable: true - timeout: 600 - modes: - - name: mpi - proc_num: 8 - node_num: 1 - env: - NCCL_TOPO_FILE: '/opt/microsoft/ndv5-topo.xml' - frameworks: - - pytorch - parameters: - num_layers: 50 - num_warmup: 20 - num_steps: 100 - use_cuda_graph: true - precision: float16 - hidden_size: 128 - input_size: 128 - batch_size: 1024 - ib-traffic: - enable: false - modes: - - name: mpi - proc_num: 8 - parameters: - msg_size: 8388608 - ib_dev: mlx5_$LOCAL_RANK - gpu_dev: $LOCAL_RANK - numa_dev: $((LOCAL_RANK/2)) - gpcnet-network-test: - enable: false - modes: - - name: mpi - proc_num: 1 - mca: - pml: ucx - btl: ^uct - btl_tcp_if_include: eth0 - gpcnet-network-load-test: - enable: false - modes: - - name: mpi - proc_num: 1 - mca: - pml: ucx - btl: ^uct - btl_tcp_if_include: eth0 - tcp-connectivity: - enable: false - modes: - - name: local - parallel: no - parameters: - port: 22 - ort-inference: - <<: *default_local_mode - tensorrt-inference: - <<: *default_local_mode - parameters: - pytorch_models: - - resnet50 - - resnet101 - - resnet152 - - densenet169 - - densenet201 - - bert-base - - bert-large - seq_length: 224 - batch_size: 32 - precision: int8 - 
model-benchmarks:gpt: - <<: *default_pytorch_mode - models: - - gpt2-small - - gpt2-large - parameters: - <<: *common_model_config - precision: [float32, float16, fp8_hybrid] - batch_size: 32 - seq_len: 224 - model-benchmarks:bert: - <<: *default_pytorch_mode - models: - - bert-base - - bert-large - parameters: - <<: *common_model_config - precision: [float32, float16, fp8_hybrid] - seq_len: 224 - model-benchmarks:lstm: - <<: *default_pytorch_mode - models: - - lstm - parameters: - <<: *common_model_config - batch_size: 1024 - input_size: 224 - hidden_size: 1000 - seq_len: 32 - pin_memory: no - model-benchmarks:resnet: - <<: *default_pytorch_mode - models: - - resnet50 - - resnet101 - - resnet152 - parameters: - <<: *common_model_config - batch_size: 384 - num_steps: 512 - model-benchmarks:densenet: - <<: *default_pytorch_mode - models: - - densenet169 - - densenet201 - parameters: - <<: *common_model_config - pin_memory: no - model-benchmarks:vgg: - <<: *default_pytorch_mode - models: - - vgg11 - - vgg13 - - vgg16 - - vgg19 - parameters: - <<: *common_model_config - pin_memory: no diff --git a/website/blog/2024-09-20-release-0-11.md b/website/blog/2024-09-20-release-0-11.md index d0fa02a4b..308e78dc1 100644 --- a/website/blog/2024-09-20-release-0-11.md +++ b/website/blog/2024-09-20-release-0-11.md @@ -17,14 +17,18 @@ You can install and try superbench by following [Getting Started Tutorial](https ### SuperBench Improvements - Add CUDA 12.4 dockerfile. +- Upgrade nccl version to v2.23.4 and install ucx v1.16.0 in cuda 12.4 dockerfile. +- Fix MSCCL build error in CUDA12.4 docker build pipeline. - Add ROCm6.2 dockerfile. - Update hpcx link in cuda11.1 dockerfile to fix docker build failure. -- Improve document (fix typos and add BibTeX in README and repo). -- Limit protobuf version to be 3.20.x. -- Update omegaconf version to 2.3.0. +- Improve document (Fix metrics name and typos in user tutorial, add BibTeX in README and repo).
+- Limit protobuf version to be 3.20.x to fix onnxruntime dependency error. +- Update omegaconf version to 2.3.0 and fix issues caused by omegaconf version update. - Fix MSCCL build error in CUDA12.4 docker build pipeline. - Update Docker Exec Command for Persistent HPCX Environment. +- Fix cuda 12.2 dockerfile LD_LIBRARY_PATH issue. - Use types-setuptools to replace types-pkg_resources. +- Add configuration for NDv5 H100 and AMD MI300x. ### Micro-benchmark Improvements @@ -37,4 +41,4 @@ You can install and try superbench by following [Getting Started Tutorial](https - Support FP8 transformer model training in ROCm6.2 dockerfile. ### Result Analysis -- Fix bug of failure test and warning of pandas in data diagnosis +- Fix bug of failure test and warning of pandas in data diagnosis. From 7080dec6208de20047270b30516edd91dbe509d2 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 26 Sep 2024 22:40:44 +0800 Subject: [PATCH 7/7] Update run-superbench.md --- docs/getting-started/run-superbench.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting-started/run-superbench.md b/docs/getting-started/run-superbench.md index 22c24c5e6..73f00c9c0 100644 --- a/docs/getting-started/run-superbench.md +++ b/docs/getting-started/run-superbench.md @@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password] :::note Note You should deploy corresponding Docker image to use release version, for example, -`sb deploy -f local.ini -i superbench/superbench:v0.11.0-cuda12.2` +`sb deploy -f local.ini -i superbench/superbench:v0.11.0-cuda12.4` You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone.