From 031eab4ed331553c12766a3937664b1462164078 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Thu, 2 May 2024 15:08:06 -0700 Subject: [PATCH 01/14] commit change --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index fd5b8e40ab..504c1c9d03 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,6 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install and uninstall foundry to cache foundry requirements +RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}" RUN pip uninstall -y llm-foundry From 406f7eb176f7dcc17a69025a0205519c5f4101ba Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Thu, 2 May 2024 15:29:58 -0700 Subject: [PATCH 02/14] commit change --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 504c1c9d03..1a60d82758 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install and uninstall foundry to cache foundry requirements -RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable +RUN pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@stable RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}" RUN pip uninstall -y llm-foundry From f454ffb0bdd2fa6abba71f6fd6a4648627ada2fd Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Thu, 2 May 2024 16:04:10 -0700 Subject: [PATCH 03/14] commit change --- .github/workflows/docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 1b02b4fd9b..6ca10fcd47 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -12,7 +12,7 @@ on: workflow_dispatch: {} jobs: docker-build: - runs-on: mosaic-4wide + runs-on: mosaic-8wide if: github.repository_owner == 'mosaicml' strategy: matrix: From 63c5e8bd98fefa7d93952a562e1efd4ed4b1c951 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Fri, 3 May 2024 10:46:49 -0700 Subject: [PATCH 04/14] commit change --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 1a60d82758..6cf59ed201 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install and uninstall foundry to cache foundry requirements -RUN pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@stable +RUN MAX_JOBS=1 pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@stable RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}" RUN pip uninstall -y llm-foundry From 55f5e2a0ff2ba617d44bd5246ddb1a8aee65eefb Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Fri, 3 May 2024 11:39:09 -0700 Subject: [PATCH 05/14] commit change --- .github/workflows/docker.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 6ca10fcd47..a3d87bb137 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -19,10 +19,10 @@ jobs: include: - name: "2.3.0_cu121_flash2" base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 - dep_groups: "[gpu-flash2]" + dep_groups: "" - name: "2.3.0_cu121_flash2_aws" base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws - dep_groups: "[gpu-flash2]" + dep_groups: "" steps: - name: Maximize Build Space on Worker uses: easimon/maximize-build-space@v4 From fb8c97fbfee1bb2540a2d35e5a31e9ff54aae6db Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Fri, 3 May 2024 11:40:40 -0700 Subject: [PATCH 06/14] commit change --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6cf59ed201..ca9312f24b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install and uninstall foundry to cache foundry requirements -RUN MAX_JOBS=1 pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@stable +RUN MAX_JOBS=1 pip install --verbose --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@stable RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}" RUN pip uninstall -y llm-foundry From 297f6cc989a4525665763aaf58f66b87346011ac Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Fri, 3 May 2024 14:47:16 -0700 Subject: [PATCH 07/14] commit change --- .github/workflows/docker.yaml | 4 ++-- Dockerfile | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index a3d87bb137..0173780dbb 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -19,10 +19,10 @@ jobs: include: - name: "2.3.0_cu121_flash2" base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 - dep_groups: "" + dep_groups: "gpu-flash2" - name: "2.3.0_cu121_flash2_aws" base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws - dep_groups: "" + dep_groups: "gpu-flash2" steps: - name: Maximize Build Space on Worker uses: easimon/maximize-build-space@v4 diff --git a/Dockerfile b/Dockerfile index ca9312f24b..041bf27164 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,8 +13,8 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install and uninstall foundry to cache foundry requirements -RUN MAX_JOBS=1 pip install --verbose --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@stable +RUN MAX_JOBS=1 pip install --verbose --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@stable RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git -RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}" +RUN MAX_JOBS=1 pip install --verbose --no-cache-dir "./llm-foundry${DEP_GROUPS}" RUN pip uninstall -y llm-foundry RUN rm -rf llm-foundry From 08c7bcc1ebebe2cc50561e4c86b85ecaea53262a Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Fri, 3 May 2024 14:49:51 -0700 Subject: [PATCH 08/14] commit change --- .github/workflows/docker.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 0173780dbb..6ca10fcd47 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -19,10 +19,10 @@ jobs: include: - name: "2.3.0_cu121_flash2" base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 - dep_groups: "gpu-flash2" + dep_groups: "[gpu-flash2]" - name: "2.3.0_cu121_flash2_aws" base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws - dep_groups: "gpu-flash2" + dep_groups: "[gpu-flash2]" steps: - name: Maximize Build Space on Worker uses: easimon/maximize-build-space@v4 From de34458b37a375f569a5f9438756132cadfcb217 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Fri, 3 May 2024 22:04:38 -0700 Subject: [PATCH 09/14] commit change --- .github/workflows/docker.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 6ca10fcd47..3f25b155d6 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -17,11 +17,11 @@ jobs: strategy: matrix: include: - - name: "2.3.0_cu121_flash2" - base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 + - name: "2.1.0_cu121_flash2" + base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 dep_groups: "[gpu-flash2]" - - name: "2.3.0_cu121_flash2_aws" - base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws + - name: "2.1.0_cu121_flash2_aws" + base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws dep_groups: "[gpu-flash2]" steps: - name: Maximize Build Space on Worker From f170eee973bfa59e38bb47de62f6961f1a5df27b Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Thu, 9 May 2024 12:19:14 -0700 Subject: [PATCH 10/14] commit change --- Dockerfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 041bf27164..185a6fb274 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,8 +13,10 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install and uninstall foundry to cache foundry requirements -RUN MAX_JOBS=1 pip install --verbose --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@stable -RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git RUN MAX_JOBS=1 pip install --verbose --no-cache-dir "./llm-foundry${DEP_GROUPS}" RUN pip uninstall -y llm-foundry RUN rm -rf llm-foundry +RUN git clone --branch stable --recursive https://github.com/NVIDIA/TransformerEngine.git +RUN cd TransformerEngine && git submodule update --init --recursive +RUN export NVTE_FRAMEWORK=pytorch && pip install . +RUN cd .. From 179acba7b541c55169affe80f7fab5c587383134 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Thu, 9 May 2024 12:19:52 -0700 Subject: [PATCH 11/14] commit change --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 185a6fb274..69eefcd4a9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install and uninstall foundry to cache foundry requirements -RUN MAX_JOBS=1 pip install --verbose --no-cache-dir "./llm-foundry${DEP_GROUPS}" +RUN pip install --verbose --no-cache-dir "./llm-foundry${DEP_GROUPS}" RUN pip uninstall -y llm-foundry RUN rm -rf llm-foundry RUN git clone --branch stable --recursive https://github.com/NVIDIA/TransformerEngine.git From 4c0959cf98a07bd7440de614a643dead47bdc0ae Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Thu, 9 May 2024 12:38:39 -0700 Subject: [PATCH 12/14] commit change --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 69eefcd4a9..0fff8d35a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,5 +18,5 @@ RUN pip uninstall -y llm-foundry RUN rm -rf llm-foundry RUN git clone --branch stable --recursive https://github.com/NVIDIA/TransformerEngine.git RUN cd TransformerEngine && git submodule update --init --recursive -RUN export NVTE_FRAMEWORK=pytorch && pip install . +RUN export NVTE_FRAMEWORK=pytorch && MAX_JOBS=1 pip install . RUN cd .. From 315446b8c5b0f9a2da0b91823e366f8098dea1fd Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Mon, 13 May 2024 19:03:42 -0700 Subject: [PATCH 13/14] commit change --- .github/workflows/docker.yaml | 8 ++++---- Dockerfile | 6 ++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 3f25b155d6..6ca10fcd47 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -17,11 +17,11 @@ jobs: strategy: matrix: include: - - name: "2.1.0_cu121_flash2" - base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 + - name: "2.3.0_cu121_flash2" + base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 dep_groups: "[gpu-flash2]" - - name: "2.1.0_cu121_flash2_aws" - base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws + - name: "2.3.0_cu121_flash2_aws" + base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws dep_groups: "[gpu-flash2]" steps: - name: Maximize Build Space on Worker diff --git a/Dockerfile b/Dockerfile index 0fff8d35a8..be16614dfc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,9 +14,7 @@ RUN rm setup.py # Install and uninstall foundry to cache foundry requirements RUN pip install --verbose --no-cache-dir "./llm-foundry${DEP_GROUPS}" +RUN pip install --verbose --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@main RUN pip uninstall -y llm-foundry +RUN pip uninstall -y transformer-engine RUN rm -rf llm-foundry -RUN git clone --branch stable --recursive https://github.com/NVIDIA/TransformerEngine.git -RUN cd TransformerEngine && git submodule update --init --recursive -RUN export NVTE_FRAMEWORK=pytorch && MAX_JOBS=1 pip install . -RUN cd .. From 0d2a0e4cad95f26f10b82bf93c462c30b58ce2b3 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Mon, 13 May 2024 19:04:33 -0700 Subject: [PATCH 14/14] commit change --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index be16614dfc..db68fc9d2b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ RUN rm setup.py # Install and uninstall foundry to cache foundry requirements RUN pip install --verbose --no-cache-dir "./llm-foundry${DEP_GROUPS}" -RUN pip install --verbose --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@main +RUN NVTE_FRAMEWORK=pytorch pip install --verbose --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@main RUN pip uninstall -y llm-foundry RUN pip uninstall -y transformer-engine RUN rm -rf llm-foundry