Select the TransformerEngine commit per Docker image.

Each build-matrix entry carries a te_commit value, the workflow forwards it as
the TE_COMMIT build arg, and the Dockerfile installs TransformerEngine at that
commit instead of the previously hard-coded 901e5d2.

The te_commit values are quoted so YAML always reads them as strings, matching
the quoted name and dep_groups values next to them.

NOTE(review): this patch was recovered from a whitespace-collapsed copy. The
indentation of the YAML context lines and the position of blank context lines
are reconstructed; confirm them against the target files before applying. The
index hashes are those of the original patch.

diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index 6f4d730e39..730ef3ad9e 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -20,15 +20,19 @@ jobs:
         - name: "2.3.1_cu121"
           base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
           dep_groups: "[all]"
+          te_commit: "b5a7c9f"
         - name: "2.3.1_cu121_aws"
           base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
           dep_groups: "[all]"
+          te_commit: "b5a7c9f"
         - name: "2.4.0_cu124"
           base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
           dep_groups: "[all]"
+          te_commit: "901e5d2"
         - name: "2.4.0_cu124_aws"
           base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
           dep_groups: "[all]"
+          te_commit: "901e5d2"
     steps:
 
     - name: Checkout
@@ -95,3 +99,4 @@ jobs:
           BRANCH_NAME=${{ github.head_ref || github.ref_name }}
           BASE_IMAGE=${{ matrix.base_image }}
           DEP_GROUPS=${{ matrix.dep_groups }}
+          TE_COMMIT=${{ matrix.te_commit }}
diff --git a/Dockerfile b/Dockerfile
index 0a996ff72a..ca52532395 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,6 +6,7 @@ FROM $BASE_IMAGE
 
 ARG BRANCH_NAME
 ARG DEP_GROUPS
+ARG TE_COMMIT
 
 ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0"
 
@@ -15,7 +16,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py
 RUN rm setup.py
 
 # Install TransformerEngine
-RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@901e5d2
+RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@$TE_COMMIT
 
 # Install and uninstall foundry to cache foundry requirements
 RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git