From 4fe3df8d25fc9705b12e2162df760c8c6484c1f9 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Wed, 30 Oct 2024 15:24:37 -0700
Subject: [PATCH 01/17] Make GCS client more robust

---
 CHANGELOG.md        |  4 ++++
 src/olmo_core/io.py | 24 +++++++++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fdfd5b51..329b11a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Removed `flash-attn` from the Beaker images since `flash-attn` currently can't be built for torch 2.5.1. We are waiting on updates from the `flash-attn` maintainers. See https://github.com/Dao-AILab/flash-attention/issues/1302.
 
+### Fixed
+
+- Made GCS client more robust by automatically retrying timeout errors for most operations.
+
 ## [v1.5.0](https://github.com/allenai/OLMo-core/releases/tag/v1.5.0) - 2024-10-23
 
 ### Added
diff --git a/src/olmo_core/io.py b/src/olmo_core/io.py
index c4f79f06..948af244 100644
--- a/src/olmo_core/io.py
+++ b/src/olmo_core/io.py
@@ -13,6 +13,7 @@
 except ImportError:
     from functools import lru_cache as cache
 
+import requests
 import torch
 from cached_path import cached_path
 from cached_path.schemes import S3Client, SchemeClient, add_scheme_client
@@ -409,6 +410,35 @@ def _get_gcs_client():
     return gcs.Client()
 
 
+def _gcs_is_retriable(exc: Exception) -> bool:
+    """
+    Retry predicate for GCS requests.
+
+    :param exc: The exception raised by a GCS call.
+
+    :returns: ``True`` if ``google.api_core`` classifies ``exc`` as a transient error or if
+        ``exc`` is a ``requests`` timeout.
+    """
+    from google.api_core.retry import if_transient_error
+
+    return if_transient_error(exc) or isinstance(exc, requests.exceptions.Timeout)
+
+
+def _get_gcs_retry():
+    """
+    Build the retry policy handed to GCS client calls via their ``retry`` argument.
+
+    :returns: A ``google.api_core.retry.Retry`` that uses :func:`_gcs_is_retriable` as its
+        predicate, with an initial delay of 1 second, a delay multiplier of 2, a maximum delay
+        of 10 seconds, and an overall retry timeout of 600 seconds.
+    """
+    from google.api_core.retry import Retry
+
+    return Retry(
+        predicate=_gcs_is_retriable, initial=1.0, maximum=10.0, multiplier=2.0, timeout=600.0
+    )
+
+
 def _gcs_file_size(bucket_name: str, key: str) -> int:
     from google.api_core.exceptions import NotFound
 
@@ -416,7 +431,7 @@ def _gcs_file_size(bucket_name: str, key: str) -> int:
     bucket = storage_client.bucket(bucket_name)
     blob = bucket.blob(key)
     try:
-        blob.reload()
+        blob.reload(retry=_get_gcs_retry())
     except NotFound:
         raise FileNotFoundError(f"gs://{bucket_name}/{key}")
     assert blob.size is not None
@@ -433,7 +448,9 @@ def _gcs_get_bytes_range(bucket_name: str, key: str, bytes_start: int, num_bytes
         blob.reload()
     except NotFound:
         raise FileNotFoundError(f"gs://{bucket_name}/{key}")
-    return blob.download_as_bytes(start=bytes_start, end=bytes_start + num_bytes - 1)
+    return blob.download_as_bytes(
+        start=bytes_start, end=bytes_start + num_bytes - 1, retry=_get_gcs_retry()
+    )
 
 
 def _gcs_upload(source: Path, bucket_name: str, key: str, save_overwrite: bool = False):
@@ -466,6 +483,7 @@ def _gcs_list_directory(bucket_name: str, prefix: str) -> Generator[str, None, N
                 prefix=prefix,
                 delimiter="/",
                 match_glob=match_glob,
+                retry=_get_gcs_retry(),
             )
         except NotFound:
             raise FileNotFoundError(f"gs://{bucket_name}/{prefix}")
@@ -488,7 +506,7 @@ def _gcs_clear_directory(bucket_name: str, prefix: str):
 
     try:
         bucket = storage_client.bucket(bucket_name)
-        blobs = bucket.list_blobs(prefix=prefix)
+        blobs = bucket.list_blobs(prefix=prefix, retry=_get_gcs_retry())
         for blob in blobs:
             bucket.delete_blob(blob.name)
     except NotFound:

From 17fc0c27f0e128b8b09750c73bd0a3ec9de9720b Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 10:12:37 -0700
Subject: [PATCH 02/17] try building flash again

---
 src/Dockerfile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/Dockerfile b/src/Dockerfile
index a5726d0b..bab1f1f8 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -23,6 +23,11 @@ RUN pip wheel --no-build-isolation --no-cache-dir \
     --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \
     "${MEGABLOCKS_VERSION}"
 
+# Build flash-attn.
+RUN pip install --no-cache-dir "setuptools<70.0.0" \
+    && pip uninstall -y flash-attn \
+    && pip wheel --no-build-isolation "flash-attn==2.6.3"
+
 # Flash-attn from pre-built wheel (can't get this to work at the moment)
 #RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
 

From a033391804e7d44efca8f9870954e4ffdfc1b5d4 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 11:04:44 -0700
Subject: [PATCH 03/17] update

---
 Makefile       | 16 +++++++++-------
 src/Dockerfile | 29 +++++++++--------------------
 2 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/Makefile b/Makefile
index cba52330..4a02e0b0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,13 @@
 # NOTE: make sure CUDA versions match across these variables
-BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29
-CUDA_TOOLKIT_VERSION = 12.1.0
+# BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29
+CUDA_VERSION = 12.1
 TORCH_CUDA_VERSION = 121
+BASE_BUILD_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-devel
+BASE_RUNTIME_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-runtime
 
 # NOTE: when upgrading the nightly version you also need to upgrade the torch version specification
 # in 'pyproject.toml' to include that nightly version.
-NIGHTLY_VERSION = "2.6.0.dev20241009+cu121"
+NIGHTLY_VERSION = "2.6.0.dev20241009+cu$(TORCH_CUDA_VERSION)"
 TORCHAO_VERSION = "torchao==0.5.0"
 MEGABLOCKS_VERSION = "megablocks[gg] @ git+https://git@github.com/epwalsh/megablocks.git@epwalsh/deps"
 
@@ -49,8 +51,8 @@ build :
 stable-image :
 	docker build -f src/Dockerfile \
 		--build-arg BUILDKIT_INLINE_CACHE=1 \
-		--build-arg BASE=$(BASE_IMAGE) \
-		--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
+		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
+		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
@@ -63,8 +65,8 @@ stable-image :
 nightly-image :
 	docker build -f src/Dockerfile \
 		--build-arg BUILDKIT_INLINE_CACHE=1 \
-		--build-arg BASE=$(BASE_IMAGE) \
-		--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
+		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
+		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
diff --git a/src/Dockerfile b/src/Dockerfile
index bab1f1f8..04b20995 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -1,32 +1,22 @@
-# Base image comes with PyTorch, numpy, flash-attn
-ARG BASE
-
 #########################################################################
 # Build image
 #########################################################################
 
-FROM ${BASE} as build
+ARG BASE_BUILD
+FROM ${BASE_BUILD} as build
 
 WORKDIR /app/build
 
-# Install CUDA toolkit.
-ARG CUDA_TOOLKIT_VERSION
-RUN conda install -y -c nvidia cuda-toolkit==${CUDA_TOOLKIT_VERSION}
-
-ARG TORCH_CUDA_VERSION
+RUN pip install --upgrade --no-cache-dir pip wheel packing "setuptools<70.0.0" ninja
 
-# Build megablocks and grouped-gemm.
+# Build megablocks, grouped-gemm, stanford-stk
 ENV TORCH_CUDA_ARCH_LIST="8.0 9.0"
 ENV GROUPED_GEMM_CUTLASS=1
 ARG MEGABLOCKS_VERSION
-RUN pip wheel --no-build-isolation --no-cache-dir \
-    --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \
-    "${MEGABLOCKS_VERSION}"
+RUN pip wheel --no-build-isolation --no-cache-dir "${MEGABLOCKS_VERSION}"
 
 # Build flash-attn.
-RUN pip install --no-cache-dir "setuptools<70.0.0" \
-    && pip uninstall -y flash-attn \
-    && pip wheel --no-build-isolation "flash-attn==2.6.3"
+RUN pip wheel --no-build-isolation --no-cache-dir "flash-attn==2.6.3"
 
 # Flash-attn from pre-built wheel (can't get this to work at the moment)
 #RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
@@ -42,11 +32,11 @@ RUN echo "Built wheels:" \
 # Stable image
 #########################################################################
 
-FROM ${BASE} as stable
-
-ARG TORCH_CUDA_VERSION
+ARG BASE_RUNTIME
+FROM ${BASE_RUNTIME} as stable
 
 # Install torchao.
+ARG TORCH_CUDA_VERSION
 ARG TORCHAO_VERSION
 RUN pip install --no-cache-dir \
     --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \
@@ -73,7 +63,6 @@ WORKDIR /app/olmo-core
 FROM stable as nightly
 
 ARG TORCH_CUDA_VERSION
-
 ARG NIGHTLY_VERSION
 RUN pip install --no-cache-dir --pre \
     --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION} \

From 077990d5fdd0dd8adbb11bc20b7cf4a5537c8cd6 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 11:50:19 -0700
Subject: [PATCH 04/17] fixes

---
 src/Dockerfile | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/Dockerfile b/src/Dockerfile
index 04b20995..a981800f 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -1,13 +1,27 @@
+ARG BASE_BUILD
+ARG BASE_RUNTIME
+
 #########################################################################
 # Build image
 #########################################################################
 
-ARG BASE_BUILD
 FROM ${BASE_BUILD} as build
 
 WORKDIR /app/build
 
-RUN pip install --upgrade --no-cache-dir pip wheel packing "setuptools<70.0.0" ninja
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        curl \
+        wget \
+        libxml2-dev \
+        git && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja
+
+# Build flash-attn.
+RUN pip wheel --no-build-isolation --no-cache-dir "flash-attn==2.6.3"
 
 # Build megablocks, grouped-gemm, stanford-stk
 ENV TORCH_CUDA_ARCH_LIST="8.0 9.0"
@@ -15,9 +29,6 @@ ENV GROUPED_GEMM_CUTLASS=1
 ARG MEGABLOCKS_VERSION
 RUN pip wheel --no-build-isolation --no-cache-dir "${MEGABLOCKS_VERSION}"
 
-# Build flash-attn.
-RUN pip wheel --no-build-isolation --no-cache-dir "flash-attn==2.6.3"
-
 # Flash-attn from pre-built wheel (can't get this to work at the moment)
 #RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
 
@@ -32,7 +43,6 @@ RUN echo "Built wheels:" \
 # Stable image
 #########################################################################
 
-ARG BASE_RUNTIME
 FROM ${BASE_RUNTIME} as stable
 
 # Install torchao.

From a2a63c9d493e4b24e6ebf3eaaeb2dd534d6b6ed7 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 12:00:22 -0700
Subject: [PATCH 05/17] clean up

---
 Makefile       |  8 +++++---
 src/Dockerfile | 27 +++++++++++++++++++++------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index 4a02e0b0..0f226fb9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,15 @@
 # NOTE: make sure CUDA versions match across these variables
-# BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29
 CUDA_VERSION = 12.1
-TORCH_CUDA_VERSION = 121
+TORCH_CUDA_VERSION = $(shell echo $(CUDA_VERSION) | tr -d .)
 BASE_BUILD_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-devel
 BASE_RUNTIME_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-runtime
 
 # NOTE: when upgrading the nightly version you also need to upgrade the torch version specification
 # in 'pyproject.toml' to include that nightly version.
 NIGHTLY_VERSION = "2.6.0.dev20241009+cu$(TORCH_CUDA_VERSION)"
-TORCHAO_VERSION = "torchao==0.5.0"
+TORCHAO_VERSION = "0.5.0"
 MEGABLOCKS_VERSION = "megablocks[gg] @ git+https://git@github.com/epwalsh/megablocks.git@epwalsh/deps"
+FLASH_ATTN_VERSION = "2.6.3"
 
 VERSION = $(shell python src/olmo_core/version.py)
 VERSION_SHORT = $(shell python src/olmo_core/version.py short)
@@ -54,6 +54,7 @@ stable-image :
 		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
 		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
+		--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
 		--target stable \
@@ -68,6 +69,7 @@ nightly-image :
 		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
 		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
+		--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
 		--build-arg NIGHTLY_VERSION=$(NIGHTLY_VERSION) \
diff --git a/src/Dockerfile b/src/Dockerfile
index a981800f..fa87ae48 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -9,7 +9,9 @@ FROM ${BASE_BUILD} as build
 
 WORKDIR /app/build
 
-RUN apt-get update && apt-get install -y --no-install-recommends \
+# Install system dependencies.
+RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
+    apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         ca-certificates \
         curl \
@@ -18,10 +20,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         git && \
     rm -rf /var/lib/apt/lists/*
 
+# Install/upgrade Python build dependencies.
 RUN pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja
 
 # Build flash-attn.
-RUN pip wheel --no-build-isolation --no-cache-dir "flash-attn==2.6.3"
+ARG FLASH_ATTN_VERSION
+RUN pip wheel --no-build-isolation --no-cache-dir flash-attn==${FLASH_ATTN_VERSION}
 
 # Build megablocks, grouped-gemm, stanford-stk
 ENV TORCH_CUDA_ARCH_LIST="8.0 9.0"
@@ -29,9 +33,6 @@ ENV GROUPED_GEMM_CUTLASS=1
 ARG MEGABLOCKS_VERSION
 RUN pip wheel --no-build-isolation --no-cache-dir "${MEGABLOCKS_VERSION}"
 
-# Flash-attn from pre-built wheel (can't get this to work at the moment)
-#RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
-
 # Only keep the target wheels and dependencies with CUDA extensions.
 RUN echo "Built wheels:" \
     && ls -lh . \
@@ -45,12 +46,26 @@ RUN echo "Built wheels:" \
 
 FROM ${BASE_RUNTIME} as stable
 
+# Install system dependencies.
+RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
+    apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        curl \
+        wget \
+        libxml2-dev \
+        git && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install/upgrade Python build dependencies.
+RUN pip install --upgrade --no-cache-dir pip wheel packaging
+
 # Install torchao.
 ARG TORCH_CUDA_VERSION
 ARG TORCHAO_VERSION
 RUN pip install --no-cache-dir \
     --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \
-    ${TORCHAO_VERSION}
+    torchao==${TORCHAO_VERSION}
 
 # Copy and install wheels from build image.
 COPY --from=build /app/build /app/build

From 7fa2c5d1f3904fa3356ed0d39125a86ba4646d30 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 12:10:08 -0700
Subject: [PATCH 06/17] fix

---
 src/Dockerfile | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/Dockerfile b/src/Dockerfile
index fa87ae48..9076efdf 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -10,8 +10,7 @@ FROM ${BASE_BUILD} as build
 WORKDIR /app/build
 
 # Install system dependencies.
-RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
-    apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         ca-certificates \
         curl \
@@ -47,8 +46,7 @@ RUN echo "Built wheels:" \
 FROM ${BASE_RUNTIME} as stable
 
 # Install system dependencies.
-RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
-    apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         ca-certificates \
         curl \

From def49b769aae3e498fa39022b8f367040b64c4b0 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 12:31:23 -0700
Subject: [PATCH 07/17] run with dev image

---
 .github/workflows/main.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 58c41f78..2e51f5c3 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -109,7 +109,7 @@ jobs:
       matrix:
         task:
           - name: Test (GPU)
-            image: olmo-core
+            image: olmo-core-dev
             gpus: 2
             run: |
               pytest -v --color=yes --durations=3 -m gpu \
@@ -118,14 +118,14 @@ jobs:
                 src/test/
 
           - name: Test checkpoint (GPU)
-            image: olmo-core-nightly
+            image: olmo-core-dev
             gpus: 2
             run: |
               pytest -v --color=yes --durations=3 -m gpu \
                 src/test/distributed/checkpoint*
 
           - name: Test MoE (GPU)
-            image: olmo-core-nightly
+            image: olmo-core-dev
             gpus: 1
             run: |
               pytest -v --color=yes --durations=3 -m gpu \

From 90f48bc4cf07e6418e8daeb76e9d792fd262b9f3 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 12:46:57 -0700
Subject: [PATCH 08/17] upgrade to normal priority

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2e51f5c3..32dc817e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -174,7 +174,7 @@ jobs:
                 image:
                   beaker: ${{ env.BEAKER_IMAGE }}
                 context:
-                  priority: low
+                  priority: normal
                   preemptible: true
                 resources:
                   gpuCount: ${{ matrix.task.gpus }}

From 2ed17b65eb357be5a8cc7485d91c86d0d3894162 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 12:52:42 -0700
Subject: [PATCH 09/17] add user-space drivers

---
 src/Dockerfile | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/Dockerfile b/src/Dockerfile
index 9076efdf..545ab2f8 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -55,6 +55,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         git && \
     rm -rf /var/lib/apt/lists/*
 
+# Install MLNX OFED user-space drivers
+# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
+ENV MOFED_VER 24.01-0.3.3.1
+ENV OS_VER ubuntu20.04
+ENV PLATFORM x86_64
+RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
+    tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
+    MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
+    rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
+    rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz
+
 # Install/upgrade Python build dependencies.
 RUN pip install --upgrade --no-cache-dir pip wheel packaging
 

From a2362959e21d2fd0178da8fd5aa5cd4f0db989c6 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 12:55:04 -0700
Subject: [PATCH 10/17] fix os ver

---
 src/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Dockerfile b/src/Dockerfile
index 545ab2f8..81215546 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -58,7 +58,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # Install MLNX OFED user-space drivers
 # See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
 ENV MOFED_VER 24.01-0.3.3.1
-ENV OS_VER ubuntu20.04
+ENV OS_VER ubuntu22.04
 ENV PLATFORM x86_64
 RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
     tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \

From dd2ca7f592e711f1db5c11f8664c6a7dd964867e Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 13:15:33 -0700
Subject: [PATCH 11/17] run higher priority

---
 .github/workflows/main.yml | 10 ++++++----
 CHANGELOG.md               |  4 ----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 32dc817e..5b6dda7f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -174,16 +174,17 @@ jobs:
                 image:
                   beaker: ${{ env.BEAKER_IMAGE }}
                 context:
-                  priority: normal
+                  # priority: normal
+                  priority: high
                   preemptible: true
                 resources:
                   gpuCount: ${{ matrix.task.gpus }}
                 constraints:
                   cluster:
+                    - ai2/jupiter-cirrascale-2
+                    - ai2/pluto-cirrascale
                     # - ai2/allennlp-cirrascale
                     # - ai2/allennlp-elanding-a100-40g
-                    - ai2/pluto-cirrascale
-                    - ai2/jupiter-cirrascale-2
                     # - ai2/saturn-cirrascale
                 envVars:
                   - name: CUBLAS_WORKSPACE_CONFIG
@@ -201,7 +202,8 @@ jobs:
                 result:
                   path: /unused
           token: ${{ env.BEAKER_TOKEN }}
-          workspace: ${{ env.BEAKER_WORKSPACE }}
+          # workspace: ${{ env.BEAKER_WORKSPACE }}
+          workspace: ai2/OLMo-pretraining-stability
 
   release:
     name: Release
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 329b11a2..e7186cff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,10 +11,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added `DownstreamEvaluatorCallbackConfig` class for running in-loop downstream eval via [OLMo-in-loop-evals](https://github.com/allenai/OLMo-in-loop-evals).
 
-### Removed
-
-- Removed `flash-attn` from the Beaker images since `flash-attn` currently can't be built for torch 2.5.1. We are waiting on updates from the `flash-attn` maintainers. See https://github.com/Dao-AILab/flash-attention/issues/1302.
-
 ### Fixed
 
 - Made GCS client more robust by automatically retrying timeout errors for most operations.

From af4fa468fde65769f96d92e7a06f87c310c4cfc6 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 13:50:10 -0700
Subject: [PATCH 12/17] clean up

---
 .github/workflows/main.yml | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 5b6dda7f..f1b99e74 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -109,7 +109,7 @@ jobs:
       matrix:
         task:
           - name: Test (GPU)
-            image: olmo-core-dev
+            image: olmo-core
             gpus: 2
             run: |
               pytest -v --color=yes --durations=3 -m gpu \
@@ -118,14 +118,14 @@ jobs:
                 src/test/
 
           - name: Test checkpoint (GPU)
-            image: olmo-core-dev
+            image: olmo-core
             gpus: 2
             run: |
               pytest -v --color=yes --durations=3 -m gpu \
                 src/test/distributed/checkpoint*
 
           - name: Test MoE (GPU)
-            image: olmo-core-dev
+            image: olmo-core
             gpus: 1
             run: |
               pytest -v --color=yes --durations=3 -m gpu \
@@ -185,7 +185,7 @@ jobs:
                     - ai2/pluto-cirrascale
                     # - ai2/allennlp-cirrascale
                     # - ai2/allennlp-elanding-a100-40g
-                    # - ai2/saturn-cirrascale
+                    - ai2/saturn-cirrascale
                 envVars:
                   - name: CUBLAS_WORKSPACE_CONFIG
                     value: ":16:8"
@@ -202,8 +202,7 @@ jobs:
                 result:
                   path: /unused
           token: ${{ env.BEAKER_TOKEN }}
-          # workspace: ${{ env.BEAKER_WORKSPACE }}
-          workspace: ai2/OLMo-pretraining-stability
+          workspace: ${{ env.BEAKER_WORKSPACE }}
 
   release:
     name: Release

From 2eccb8f722900b3b1d7bedbd3669503e0498064e Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 13:50:44 -0700
Subject: [PATCH 13/17] fix changelog

---
 CHANGELOG.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index acccd446..e7186cff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,10 +15,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Made GCS client more robust by automatically retrying timeout errors for most operations.
 
-### Fixed
-
-- Made GCS client more robust by automatically retrying timeout errors for most operations.
-
 ## [v1.5.0](https://github.com/allenai/OLMo-core/releases/tag/v1.5.0) - 2024-10-23
 
 ### Added

From 00777509587b5d354371b692346605601f6ba33f Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 13:51:53 -0700
Subject: [PATCH 14/17] fix

---
 .github/workflows/main.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f1b99e74..64771e74 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -174,8 +174,7 @@ jobs:
                 image:
                   beaker: ${{ env.BEAKER_IMAGE }}
                 context:
-                  # priority: normal
-                  priority: high
+                  priority: normal
                   preemptible: true
                 resources:
                   gpuCount: ${{ matrix.task.gpus }}

From 47f3bd623ed55c115c8bca514af7204a8dd077f6 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 15:06:13 -0700
Subject: [PATCH 15/17] disable auto docker build for now

---
 .github/workflows/docker.yml | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 02a154bd..27bea638 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -5,20 +5,21 @@ concurrency:
   cancel-in-progress: true
 
 on:
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'Makefile'
-      - 'pyproject.toml'
-      - 'src/olmo_core/version.py'
-      - 'src/Dockerfile'
-      - '.github/workflows/docker.yml'
-  push:
-    branches:
-      - main
-    tags:
-      - 'v*.*.*'
+  # TODO: disabled for now because it takes too long in CI
+  # pull_request:
+  #   branches:
+  #     - main
+  #   paths:
+  #     - 'Makefile'
+  #     - 'pyproject.toml'
+  #     - 'src/olmo_core/version.py'
+  #     - 'src/Dockerfile'
+  #     - '.github/workflows/docker.yml'
+  # push:
+  #   branches:
+  #     - main
+  #   tags:
+  #     - 'v*.*.*'
 
 jobs:
   beaker:

From acbe1a3f4295d7aaf9ab0c8a97511b26d81d79ef Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 15:07:25 -0700
Subject: [PATCH 16/17] add workflow dispatch

---
 .github/workflows/docker.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 27bea638..d1053fee 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -5,6 +5,7 @@ concurrency:
   cancel-in-progress: true
 
 on:
+  workflow_dispatch:
   # TODO: disabled for now because it takes too long in CI
   # pull_request:
   #   branches:

From 701978bde6ab2d54492bcedc043a3e791e629686 Mon Sep 17 00:00:00 2001
From: epwalsh <petew@allenai.org>
Date: Thu, 31 Oct 2024 15:14:43 -0700
Subject: [PATCH 17/17] increase time limit for GPU checks

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 64771e74..ec884c9a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -103,7 +103,7 @@ jobs:
   gpu_checks:
     name: ${{ matrix.task.name }}
     runs-on: ubuntu-latest
-    timeout-minutes: 8
+    timeout-minutes: 15
     strategy:
       fail-fast: false
       matrix: