diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 02a154bd..d1053fee 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -5,20 +5,22 @@ concurrency:
   cancel-in-progress: true
 
 on:
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'Makefile'
-      - 'pyproject.toml'
-      - 'src/olmo_core/version.py'
-      - 'src/Dockerfile'
-      - '.github/workflows/docker.yml'
-  push:
-    branches:
-      - main
-    tags:
-      - 'v*.*.*'
+  workflow_dispatch:
+  # TODO: disabled for now because it takes too long in CI
+  # pull_request:
+  #   branches:
+  #     - main
+  #   paths:
+  #     - 'Makefile'
+  #     - 'pyproject.toml'
+  #     - 'src/olmo_core/version.py'
+  #     - 'src/Dockerfile'
+  #     - '.github/workflows/docker.yml'
+  # push:
+  #   branches:
+  #     - main
+  #   tags:
+  #     - 'v*.*.*'
 
 jobs:
   beaker:
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 58c41f78..ec884c9a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -103,7 +103,7 @@ jobs:
   gpu_checks:
     name: ${{ matrix.task.name }}
     runs-on: ubuntu-latest
-    timeout-minutes: 8
+    timeout-minutes: 15
     strategy:
       fail-fast: false
       matrix:
@@ -118,14 +118,14 @@ jobs:
              src/test/
 
          - name: Test checkpoint (GPU)
-           image: olmo-core-nightly
+           image: olmo-core
            gpus: 2
            run: |
              pytest -v --color=yes --durations=3 -m gpu \
                src/test/distributed/checkpoint*
 
          - name: Test MoE (GPU)
-           image: olmo-core-nightly
+           image: olmo-core
            gpus: 1
            run: |
              pytest -v --color=yes --durations=3 -m gpu \
@@ -174,17 +174,17 @@ jobs:
             image:
               beaker: ${{ env.BEAKER_IMAGE }}
             context:
-              priority: low
+              priority: normal
               preemptible: true
             resources:
               gpuCount: ${{ matrix.task.gpus }}
             constraints:
               cluster:
+                - ai2/jupiter-cirrascale-2
+                - ai2/pluto-cirrascale
                 # - ai2/allennlp-cirrascale
                 # - ai2/allennlp-elanding-a100-40g
-                - ai2/pluto-cirrascale
-                - ai2/jupiter-cirrascale-2
-                # - ai2/saturn-cirrascale
+                - ai2/saturn-cirrascale
             envVars:
               - name: CUBLAS_WORKSPACE_CONFIG
                 value: ":16:8"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 329b11a2..e7186cff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,10 +11,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added `DownstreamEvaluatorCallbackConfig` class for running in-loop downstream eval via [OLMo-in-loop-evals](https://github.com/allenai/OLMo-in-loop-evals).
 
-### Removed
-
-- Removed `flash-attn` from the Beaker images since `flash-attn` currently can't be built for torch 2.5.1. We are waiting on updates from the `flash-attn` maintainers. See https://github.com/Dao-AILab/flash-attention/issues/1302.
-
 ### Fixed
 
 - Made GCS client more robust by automatically retrying timeout errors for most operations.
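Reviewer note: with the pull_request and push triggers commented out above, the Docker workflow only runs when dispatched by hand. A minimal sketch of how one might trigger it, assuming the GitHub CLI (`gh`) is installed and authenticated against this repository:

# Manually trigger the now dispatch-only Docker workflow on main.
gh workflow run docker.yml --ref main

# List recent runs of that workflow to watch the one just started.
gh run list --workflow=docker.yml
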
diff --git a/Makefile b/Makefile
index cba52330..0f226fb9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,15 @@
 # NOTE: make sure CUDA versions match across these variables
-BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29
-CUDA_TOOLKIT_VERSION = 12.1.0
-TORCH_CUDA_VERSION = 121
+CUDA_VERSION = 12.1
+TORCH_CUDA_VERSION = $(shell echo $(CUDA_VERSION) | tr -d .)
+BASE_BUILD_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-devel
+BASE_RUNTIME_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-runtime
 
 # NOTE: when upgrading the nightly version you also need to upgrade the torch version specification
 # in 'pyproject.toml' to include that nightly version.
-NIGHTLY_VERSION = "2.6.0.dev20241009+cu121"
-TORCHAO_VERSION = "torchao==0.5.0"
+NIGHTLY_VERSION = "2.6.0.dev20241009+cu$(TORCH_CUDA_VERSION)"
+TORCHAO_VERSION = "0.5.0"
 MEGABLOCKS_VERSION = "megablocks[gg] @ git+https://git@github.com/epwalsh/megablocks.git@epwalsh/deps"
+FLASH_ATTN_VERSION = "2.6.3"
 
 VERSION = $(shell python src/olmo_core/version.py)
 VERSION_SHORT = $(shell python src/olmo_core/version.py short)
@@ -49,9 +51,10 @@ build :
 stable-image :
 	docker build -f src/Dockerfile \
 		--build-arg BUILDKIT_INLINE_CACHE=1 \
-		--build-arg BASE=$(BASE_IMAGE) \
-		--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
+		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
+		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
+		--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
 		--target stable \
@@ -63,9 +66,10 @@ stable-image :
 nightly-image :
 	docker build -f src/Dockerfile \
 		--build-arg BUILDKIT_INLINE_CACHE=1 \
-		--build-arg BASE=$(BASE_IMAGE) \
-		--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
+		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
+		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
+		--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
 		--build-arg NIGHTLY_VERSION=$(NIGHTLY_VERSION) \
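Reviewer note: `TORCH_CUDA_VERSION` is now derived from `CUDA_VERSION` instead of being hard-coded, so a future CUDA bump only touches one variable. A quick shell sketch of what the `$(shell ...)` call expands to, and the shape of the resulting build invocation (all values taken from the variables defined above):

# The TORCH_CUDA_VERSION derivation just strips the dot:
echo 12.1 | tr -d .    # prints: 121

# So `make stable-image` ends up passing, among other build args:
#   --build-arg BASE_BUILD=pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel
#   --build-arg TORCH_CUDA_VERSION=121
#   --build-arg FLASH_ATTN_VERSION=2.6.3
make stable-image
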
RUN echo "Built wheels:" \ @@ -37,15 +43,38 @@ RUN echo "Built wheels:" \ # Stable image ######################################################################### -FROM ${BASE} as stable - -ARG TORCH_CUDA_VERSION +FROM ${BASE_RUNTIME} as stable + +# Install system dependencies. +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + curl \ + wget \ + libxml2-dev \ + git && \ + rm -rf /var/lib/apt/lists/* + +# Install MLNX OFED user-space drivers +# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile +ENV MOFED_VER 24.01-0.3.3.1 +ENV OS_VER ubuntu22.04 +ENV PLATFORM x86_64 +RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \ + rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \ + rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz + +# Install/upgrade Python build dependencies. +RUN pip install --upgrade --no-cache-dir pip wheel packaging # Install torchao. +ARG TORCH_CUDA_VERSION ARG TORCHAO_VERSION RUN pip install --no-cache-dir \ --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \ - ${TORCHAO_VERSION} + torchao==${TORCHAO_VERSION} # Copy and install wheels from build image. COPY --from=build /app/build /app/build @@ -68,7 +97,6 @@ WORKDIR /app/olmo-core FROM stable as nightly ARG TORCH_CUDA_VERSION - ARG NIGHTLY_VERSION RUN pip install --no-cache-dir --pre \ --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION} \