From 4fe3df8d25fc9705b12e2162df760c8c6484c1f9 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 30 Oct 2024 15:24:37 -0700 Subject: [PATCH 01/17] Make GCS client more robust --- CHANGELOG.md | 4 ++++ src/olmo_core/io.py | 24 +++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fdfd5b51..329b11a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Removed `flash-attn` from the Beaker images since `flash-attn` currently can't be built for torch 2.5.1. We are waiting on updates from the `flash-attn` maintainers. See https://github.com/Dao-AILab/flash-attention/issues/1302. +### Fixed + +- Made GCS client more robust by automatically retrying timeout errors for most operations. + ## [v1.5.0](https://github.com/allenai/OLMo-core/releases/tag/v1.5.0) - 2024-10-23 ### Added diff --git a/src/olmo_core/io.py b/src/olmo_core/io.py index c4f79f06..948af244 100644 --- a/src/olmo_core/io.py +++ b/src/olmo_core/io.py @@ -13,6 +13,7 @@ except ImportError: from functools import lru_cache as cache +import requests import torch from cached_path import cached_path from cached_path.schemes import S3Client, SchemeClient, add_scheme_client @@ -409,6 +410,20 @@ def _get_gcs_client(): return gcs.Client() +def _gcs_is_retriable(exc: Exception) -> bool: + from google.api_core.retry import if_transient_error + + return if_transient_error(exc) or isinstance(exc, requests.exceptions.Timeout) + + +def _get_gcs_retry(): + from google.api_core.retry import Retry + + return Retry( + predicate=_gcs_is_retriable, initial=1.0, maximum=10.0, multiplier=2.0, timeout=600.0 + ) + + def _gcs_file_size(bucket_name: str, key: str) -> int: from google.api_core.exceptions import NotFound @@ -416,7 +431,7 @@ def _gcs_file_size(bucket_name: str, key: str) -> int: bucket = storage_client.bucket(bucket_name) blob = bucket.blob(key) try: - blob.reload() + blob.reload(retry=_get_gcs_retry()) except NotFound: raise FileNotFoundError(f"gs://{bucket_name}/{key}") assert blob.size is not None @@ -433,7 +448,9 @@ def _gcs_get_bytes_range(bucket_name: str, key: str, bytes_start: int, num_bytes blob.reload() except NotFound: raise FileNotFoundError(f"gs://{bucket_name}/{key}") - return blob.download_as_bytes(start=bytes_start, end=bytes_start + num_bytes - 1) + return blob.download_as_bytes( + start=bytes_start, end=bytes_start + num_bytes - 1, retry=_get_gcs_retry() + ) def _gcs_upload(source: Path, bucket_name: str, key: str, save_overwrite: bool = False): @@ -466,6 +483,7 @@ def _gcs_list_directory(bucket_name: str, prefix: str) -> Generator[str, None, N prefix=prefix, delimiter="/", match_glob=match_glob, + retry=_get_gcs_retry(), ) except NotFound: raise FileNotFoundError(f"gs://{bucket_name}/{prefix}") @@ -488,7 +506,7 @@ def _gcs_clear_directory(bucket_name: str, prefix: str): try: bucket = storage_client.bucket(bucket_name) - blobs = bucket.list_blobs(prefix=prefix) + blobs = bucket.list_blobs(prefix=prefix, retry=_get_gcs_retry()) for blob in blobs: bucket.delete_blob(blob.name) except NotFound: From 17fc0c27f0e128b8b09750c73bd0a3ec9de9720b Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 10:12:37 -0700 Subject: [PATCH 02/17] try building flash again --- src/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Dockerfile b/src/Dockerfile index a5726d0b..bab1f1f8 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -23,6 +23,11 @@ RUN pip wheel --no-build-isolation --no-cache-dir \ --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \ "${MEGABLOCKS_VERSION}" +# Build flash-attn. +RUN pip install --no-cache-dir "setuptools<70.0.0" \ + && pip uninstall -y flash-attn \ + && pip wheel --no-build-isolation "flash-attn==2.6.3" + # Flash-attn from pre-built wheel (can't get this to work at the moment) #RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl From a033391804e7d44efca8f9870954e4ffdfc1b5d4 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 11:04:44 -0700 Subject: [PATCH 03/17] update --- Makefile | 16 +++++++++------- src/Dockerfile | 29 +++++++++-------------------- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/Makefile b/Makefile index cba52330..4a02e0b0 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,13 @@ # NOTE: make sure CUDA versions match across these variables -BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29 -CUDA_TOOLKIT_VERSION = 12.1.0 +# BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29 +CUDA_VERSION = 12.1 TORCH_CUDA_VERSION = 121 +BASE_BUILD_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-devel +BASE_RUNTIME_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-runtime # NOTE: when upgrading the nightly version you also need to upgrade the torch version specification # in 'pyproject.toml' to include that nightly version. -NIGHTLY_VERSION = "2.6.0.dev20241009+cu121" +NIGHTLY_VERSION = "2.6.0.dev20241009+cu$(TORCH_CUDA_VERSION)" TORCHAO_VERSION = "torchao==0.5.0" MEGABLOCKS_VERSION = "megablocks[gg] @ git+https://git@github.com/epwalsh/megablocks.git@epwalsh/deps" @@ -49,8 +51,8 @@ build : stable-image : docker build -f src/Dockerfile \ --build-arg BUILDKIT_INLINE_CACHE=1 \ - --build-arg BASE=$(BASE_IMAGE) \ - --build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \ + --build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \ + --build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \ --build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \ --build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \ --build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \ @@ -63,8 +65,8 @@ stable-image : nightly-image : docker build -f src/Dockerfile \ --build-arg BUILDKIT_INLINE_CACHE=1 \ - --build-arg BASE=$(BASE_IMAGE) \ - --build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \ + --build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \ + --build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \ --build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \ --build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \ --build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \ diff --git a/src/Dockerfile b/src/Dockerfile index bab1f1f8..04b20995 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -1,32 +1,22 @@ -# Base image comes with PyTorch, numpy, flash-attn -ARG BASE - ######################################################################### # Build image ######################################################################### -FROM ${BASE} as build +ARG BASE_BUILD +FROM ${BASE_BUILD} as build WORKDIR /app/build -# Install CUDA toolkit. -ARG CUDA_TOOLKIT_VERSION -RUN conda install -y -c nvidia cuda-toolkit==${CUDA_TOOLKIT_VERSION} - -ARG TORCH_CUDA_VERSION +RUN pip install --upgrade --no-cache-dir pip wheel packing "setuptools<70.0.0" ninja -# Build megablocks and grouped-gemm. +# Build megablocks, grouped-gemm, stanford-stk ENV TORCH_CUDA_ARCH_LIST="8.0 9.0" ENV GROUPED_GEMM_CUTLASS=1 ARG MEGABLOCKS_VERSION -RUN pip wheel --no-build-isolation --no-cache-dir \ - --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \ - "${MEGABLOCKS_VERSION}" +RUN pip wheel --no-build-isolation --no-cache-dir "${MEGABLOCKS_VERSION}" # Build flash-attn. -RUN pip install --no-cache-dir "setuptools<70.0.0" \ - && pip uninstall -y flash-attn \ - && pip wheel --no-build-isolation "flash-attn==2.6.3" +RUN pip wheel --no-build-isolation --no-cache-dir "flash-attn==2.6.3" # Flash-attn from pre-built wheel (can't get this to work at the moment) #RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl @@ -42,11 +32,11 @@ RUN echo "Built wheels:" \ # Stable image ######################################################################### -FROM ${BASE} as stable - -ARG TORCH_CUDA_VERSION +ARG BASE_RUNTIME +FROM ${BASE_RUNTIME} as stable # Install torchao. +ARG TORCH_CUDA_VERSION ARG TORCHAO_VERSION RUN pip install --no-cache-dir \ --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \ @@ -73,7 +63,6 @@ WORKDIR /app/olmo-core FROM stable as nightly ARG TORCH_CUDA_VERSION - ARG NIGHTLY_VERSION RUN pip install --no-cache-dir --pre \ --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION} \ From 077990d5fdd0dd8adbb11bc20b7cf4a5537c8cd6 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 11:50:19 -0700 Subject: [PATCH 04/17] fixes --- src/Dockerfile | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/Dockerfile b/src/Dockerfile index 04b20995..a981800f 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -1,13 +1,27 @@ +ARG BASE_BUILD +ARG BASE_RUNTIME + ######################################################################### # Build image ######################################################################### -ARG BASE_BUILD FROM ${BASE_BUILD} as build WORKDIR /app/build -RUN pip install --upgrade --no-cache-dir pip wheel packing "setuptools<70.0.0" ninja +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + curl \ + wget \ + libxml2-dev \ + git && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja + +# Build flash-attn. +RUN pip wheel --no-build-isolation --no-cache-dir "flash-attn==2.6.3" # Build megablocks, grouped-gemm, stanford-stk ENV TORCH_CUDA_ARCH_LIST="8.0 9.0" @@ -15,9 +29,6 @@ ENV GROUPED_GEMM_CUTLASS=1 ARG MEGABLOCKS_VERSION RUN pip wheel --no-build-isolation --no-cache-dir "${MEGABLOCKS_VERSION}" -# Build flash-attn. -RUN pip wheel --no-build-isolation --no-cache-dir "flash-attn==2.6.3" - # Flash-attn from pre-built wheel (can't get this to work at the moment) #RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl @@ -32,7 +43,6 @@ RUN echo "Built wheels:" \ # Stable image ######################################################################### -ARG BASE_RUNTIME FROM ${BASE_RUNTIME} as stable # Install torchao. From a2a63c9d493e4b24e6ebf3eaaeb2dd534d6b6ed7 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 12:00:22 -0700 Subject: [PATCH 05/17] clean up --- Makefile | 8 +++++--- src/Dockerfile | 27 +++++++++++++++++++++------ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 4a02e0b0..0f226fb9 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,15 @@ # NOTE: make sure CUDA versions match across these variables -# BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29 CUDA_VERSION = 12.1 -TORCH_CUDA_VERSION = 121 +TORCH_CUDA_VERSION = $(shell echo $(CUDA_VERSION) | tr -d .) BASE_BUILD_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-devel BASE_RUNTIME_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-runtime # NOTE: when upgrading the nightly version you also need to upgrade the torch version specification # in 'pyproject.toml' to include that nightly version. NIGHTLY_VERSION = "2.6.0.dev20241009+cu$(TORCH_CUDA_VERSION)" -TORCHAO_VERSION = "torchao==0.5.0" +TORCHAO_VERSION = "0.5.0" MEGABLOCKS_VERSION = "megablocks[gg] @ git+https://git@github.com/epwalsh/megablocks.git@epwalsh/deps" +FLASH_ATTN_VERSION = "2.6.3" VERSION = $(shell python src/olmo_core/version.py) VERSION_SHORT = $(shell python src/olmo_core/version.py short) @@ -54,6 +54,7 @@ stable-image : --build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \ --build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \ --build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \ + --build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \ --build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \ --build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \ --target stable \ @@ -68,6 +69,7 @@ nightly-image : --build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \ --build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \ --build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \ + --build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \ --build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \ --build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \ --build-arg NIGHTLY_VERSION=$(NIGHTLY_VERSION) \ diff --git a/src/Dockerfile b/src/Dockerfile index a981800f..fa87ae48 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -9,7 +9,9 @@ FROM ${BASE_BUILD} as build WORKDIR /app/build -RUN apt-get update && apt-get install -y --no-install-recommends \ +# Install system dependencies. +RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ + apt-get update && apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ curl \ @@ -18,10 +20,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git && \ rm -rf /var/lib/apt/lists/* +# Install/upgrade Python build dependencies. RUN pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja # Build flash-attn. -RUN pip wheel --no-build-isolation --no-cache-dir "flash-attn==2.6.3" +ARG FLASH_ATTN_VERSION +RUN pip wheel --no-build-isolation --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} # Build megablocks, grouped-gemm, stanford-stk ENV TORCH_CUDA_ARCH_LIST="8.0 9.0" @@ -29,9 +33,6 @@ ENV GROUPED_GEMM_CUTLASS=1 ARG MEGABLOCKS_VERSION RUN pip wheel --no-build-isolation --no-cache-dir "${MEGABLOCKS_VERSION}" -# Flash-attn from pre-built wheel (can't get this to work at the moment) -#RUN wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl - # Only keep the target wheels and dependencies with CUDA extensions. RUN echo "Built wheels:" \ && ls -lh . \ @@ -45,12 +46,26 @@ RUN echo "Built wheels:" \ FROM ${BASE_RUNTIME} as stable +# Install system dependencies. +RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ + apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + curl \ + wget \ + libxml2-dev \ + git && \ + rm -rf /var/lib/apt/lists/* + +# Install/upgrade Python build dependencies. +RUN pip install --upgrade --no-cache-dir pip wheel packaging + # Install torchao. ARG TORCH_CUDA_VERSION ARG TORCHAO_VERSION RUN pip install --no-cache-dir \ --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \ - ${TORCHAO_VERSION} + torchao==${TORCHAO_VERSION} # Copy and install wheels from build image. COPY --from=build /app/build /app/build From 7fa2c5d1f3904fa3356ed0d39125a86ba4646d30 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 12:10:08 -0700 Subject: [PATCH 06/17] fix --- src/Dockerfile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Dockerfile b/src/Dockerfile index fa87ae48..9076efdf 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -10,8 +10,7 @@ FROM ${BASE_BUILD} as build WORKDIR /app/build # Install system dependencies. -RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ - apt-get update && apt-get install -y --no-install-recommends \ +RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ curl \ @@ -47,8 +46,7 @@ RUN echo "Built wheels:" \ FROM ${BASE_RUNTIME} as stable # Install system dependencies. -RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ - apt-get update && apt-get install -y --no-install-recommends \ +RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ curl \ From def49b769aae3e498fa39022b8f367040b64c4b0 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 12:31:23 -0700 Subject: [PATCH 07/17] run with dev image --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 58c41f78..2e51f5c3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -109,7 +109,7 @@ jobs: matrix: task: - name: Test (GPU) - image: olmo-core + image: olmo-core-dev gpus: 2 run: | pytest -v --color=yes --durations=3 -m gpu \ @@ -118,14 +118,14 @@ jobs: src/test/ - name: Test checkpoint (GPU) - image: olmo-core-nightly + image: olmo-core-dev gpus: 2 run: | pytest -v --color=yes --durations=3 -m gpu \ src/test/distributed/checkpoint* - name: Test MoE (GPU) - image: olmo-core-nightly + image: olmo-core-dev gpus: 1 run: | pytest -v --color=yes --durations=3 -m gpu \ From 90f48bc4cf07e6418e8daeb76e9d792fd262b9f3 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 12:46:57 -0700 Subject: [PATCH 08/17] upgrade to normal priority --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2e51f5c3..32dc817e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -174,7 +174,7 @@ jobs: image: beaker: ${{ env.BEAKER_IMAGE }} context: - priority: low + priority: normal preemptible: true resources: gpuCount: ${{ matrix.task.gpus }} From 2ed17b65eb357be5a8cc7485d91c86d0d3894162 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 12:52:42 -0700 Subject: [PATCH 09/17] add user-space drivers --- src/Dockerfile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/Dockerfile b/src/Dockerfile index 9076efdf..545ab2f8 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -55,6 +55,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git && \ rm -rf /var/lib/apt/lists/* +# Install MLNX OFED user-space drivers +# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile +ENV MOFED_VER 24.01-0.3.3.1 +ENV OS_VER ubuntu20.04 +ENV PLATFORM x86_64 +RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \ + rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \ + rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz + # Install/upgrade Python build dependencies. RUN pip install --upgrade --no-cache-dir pip wheel packaging From a2362959e21d2fd0178da8fd5aa5cd4f0db989c6 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 12:55:04 -0700 Subject: [PATCH 10/17] fix os ver --- src/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Dockerfile b/src/Dockerfile index 545ab2f8..81215546 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -58,7 +58,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Install MLNX OFED user-space drivers # See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile ENV MOFED_VER 24.01-0.3.3.1 -ENV OS_VER ubuntu20.04 +ENV OS_VER ubuntu22.04 ENV PLATFORM x86_64 RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ From dd2ca7f592e711f1db5c11f8664c6a7dd964867e Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 13:15:33 -0700 Subject: [PATCH 11/17] run higher priority --- .github/workflows/main.yml | 10 ++++++---- CHANGELOG.md | 4 ---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 32dc817e..5b6dda7f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -174,16 +174,17 @@ jobs: image: beaker: ${{ env.BEAKER_IMAGE }} context: - priority: normal + # priority: normal + priority: high preemptible: true resources: gpuCount: ${{ matrix.task.gpus }} constraints: cluster: + - ai2/jupiter-cirrascale-2 + - ai2/pluto-cirrascale # - ai2/allennlp-cirrascale # - ai2/allennlp-elanding-a100-40g - - ai2/pluto-cirrascale - - ai2/jupiter-cirrascale-2 # - ai2/saturn-cirrascale envVars: - name: CUBLAS_WORKSPACE_CONFIG @@ -201,7 +202,8 @@ jobs: result: path: /unused token: ${{ env.BEAKER_TOKEN }} - workspace: ${{ env.BEAKER_WORKSPACE }} + # workspace: ${{ env.BEAKER_WORKSPACE }} + workspace: ai2/OLMo-pretraining-stability release: name: Release diff --git a/CHANGELOG.md b/CHANGELOG.md index 329b11a2..e7186cff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,10 +11,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `DownstreamEvaluatorCallbackConfig` class for running in-loop downstream eval via [OLMo-in-loop-evals](https://github.com/allenai/OLMo-in-loop-evals). -### Removed - -- Removed `flash-attn` from the Beaker images since `flash-attn` currently can't be built for torch 2.5.1. We are waiting on updates from the `flash-attn` maintainers. See https://github.com/Dao-AILab/flash-attention/issues/1302. - ### Fixed - Made GCS client more robust by automatically retrying timeout errors for most operations. From af4fa468fde65769f96d92e7a06f87c310c4cfc6 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 13:50:10 -0700 Subject: [PATCH 12/17] clean up --- .github/workflows/main.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5b6dda7f..f1b99e74 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -109,7 +109,7 @@ jobs: matrix: task: - name: Test (GPU) - image: olmo-core-dev + image: olmo-core gpus: 2 run: | pytest -v --color=yes --durations=3 -m gpu \ @@ -118,14 +118,14 @@ jobs: src/test/ - name: Test checkpoint (GPU) - image: olmo-core-dev + image: olmo-core gpus: 2 run: | pytest -v --color=yes --durations=3 -m gpu \ src/test/distributed/checkpoint* - name: Test MoE (GPU) - image: olmo-core-dev + image: olmo-core gpus: 1 run: | pytest -v --color=yes --durations=3 -m gpu \ @@ -185,7 +185,7 @@ jobs: - ai2/pluto-cirrascale # - ai2/allennlp-cirrascale # - ai2/allennlp-elanding-a100-40g - # - ai2/saturn-cirrascale + - ai2/saturn-cirrascale envVars: - name: CUBLAS_WORKSPACE_CONFIG value: ":16:8" @@ -202,8 +202,7 @@ jobs: result: path: /unused token: ${{ env.BEAKER_TOKEN }} - # workspace: ${{ env.BEAKER_WORKSPACE }} - workspace: ai2/OLMo-pretraining-stability + workspace: ${{ env.BEAKER_WORKSPACE }} release: name: Release From 2eccb8f722900b3b1d7bedbd3669503e0498064e Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 13:50:44 -0700 Subject: [PATCH 13/17] fix changelog --- CHANGELOG.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index acccd446..e7186cff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,10 +15,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Made GCS client more robust by automatically retrying timeout errors for most operations. -### Fixed - -- Made GCS client more robust by automatically retrying timeout errors for most operations. - ## [v1.5.0](https://github.com/allenai/OLMo-core/releases/tag/v1.5.0) - 2024-10-23 ### Added From 00777509587b5d354371b692346605601f6ba33f Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 13:51:53 -0700 Subject: [PATCH 14/17] fix --- .github/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f1b99e74..64771e74 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -174,8 +174,7 @@ jobs: image: beaker: ${{ env.BEAKER_IMAGE }} context: - # priority: normal - priority: high + priority: normal preemptible: true resources: gpuCount: ${{ matrix.task.gpus }} From 47f3bd623ed55c115c8bca514af7204a8dd077f6 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 15:06:13 -0700 Subject: [PATCH 15/17] disable auto docker build for now --- .github/workflows/docker.yml | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 02a154bd..27bea638 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -5,20 +5,21 @@ concurrency: cancel-in-progress: true on: - pull_request: - branches: - - main - paths: - - 'Makefile' - - 'pyproject.toml' - - 'src/olmo_core/version.py' - - 'src/Dockerfile' - - '.github/workflows/docker.yml' - push: - branches: - - main - tags: - - 'v*.*.*' + # TODO: disabled for now because it takes too long in CI + # pull_request: + # branches: + # - main + # paths: + # - 'Makefile' + # - 'pyproject.toml' + # - 'src/olmo_core/version.py' + # - 'src/Dockerfile' + # - '.github/workflows/docker.yml' + # push: + # branches: + # - main + # tags: + # - 'v*.*.*' jobs: beaker: From acbe1a3f4295d7aaf9ab0c8a97511b26d81d79ef Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 15:07:25 -0700 Subject: [PATCH 16/17] add workflow dispatch --- .github/workflows/docker.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 27bea638..d1053fee 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -5,6 +5,7 @@ concurrency: cancel-in-progress: true on: + workflow_dispatch: # TODO: disabled for now because it takes too long in CI # pull_request: # branches: From 701978bde6ab2d54492bcedc043a3e791e629686 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Thu, 31 Oct 2024 15:14:43 -0700 Subject: [PATCH 17/17] increase time limit for GPU checks --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 64771e74..ec884c9a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -103,7 +103,7 @@ jobs: gpu_checks: name: ${{ matrix.task.name }} runs-on: ubuntu-latest - timeout-minutes: 8 + timeout-minutes: 15 strategy: fail-fast: false matrix: