diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 02a154bd..d1053fee 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -5,20 +5,22 @@ concurrency:
   cancel-in-progress: true
 
 on:
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'Makefile'
-      - 'pyproject.toml'
-      - 'src/olmo_core/version.py'
-      - 'src/Dockerfile'
-      - '.github/workflows/docker.yml'
-  push:
-    branches:
-      - main
-    tags:
-      - 'v*.*.*'
+  workflow_dispatch:
+  # TODO: disabled for now because it takes too long in CI
+  # pull_request:
+  #   branches:
+  #     - main
+  #   paths:
+  #     - 'Makefile'
+  #     - 'pyproject.toml'
+  #     - 'src/olmo_core/version.py'
+  #     - 'src/Dockerfile'
+  #     - '.github/workflows/docker.yml'
+  # push:
+  #   branches:
+  #     - main
+  #   tags:
+  #     - 'v*.*.*'
 
 jobs:
   beaker:
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 58c41f78..ec884c9a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -103,7 +103,7 @@ jobs:
   gpu_checks:
     name: ${{ matrix.task.name }}
     runs-on: ubuntu-latest
-    timeout-minutes: 8
+    timeout-minutes: 15
     strategy:
       fail-fast: false
       matrix:
@@ -118,14 +118,14 @@ jobs:
              src/test/
 
          - name: Test checkpoint (GPU)
-           image: olmo-core-nightly
+           image: olmo-core
            gpus: 2
            run: |
              pytest -v --color=yes --durations=3 -m gpu \
                src/test/distributed/checkpoint*
 
          - name: Test MoE (GPU)
-           image: olmo-core-nightly
+           image: olmo-core
            gpus: 1
            run: |
              pytest -v --color=yes --durations=3 -m gpu \
@@ -174,17 +174,17 @@ jobs:
             image:
               beaker: ${{ env.BEAKER_IMAGE }}
             context:
-              priority: low
+              priority: normal
               preemptible: true
             resources:
               gpuCount: ${{ matrix.task.gpus }}
             constraints:
               cluster:
+                - ai2/jupiter-cirrascale-2
+                - ai2/pluto-cirrascale
                 # - ai2/allennlp-cirrascale
                 # - ai2/allennlp-elanding-a100-40g
-                - ai2/pluto-cirrascale
-                - ai2/jupiter-cirrascale-2
-                # - ai2/saturn-cirrascale
+                - ai2/saturn-cirrascale
             envVars:
               - name: CUBLAS_WORKSPACE_CONFIG
                 value: ":16:8"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 329b11a2..e7186cff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,10 +11,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added `DownstreamEvaluatorCallbackConfig` class for running in-loop downstream eval via [OLMo-in-loop-evals](https://github.com/allenai/OLMo-in-loop-evals).
 
-### Removed
-
-- Removed `flash-attn` from the Beaker images since `flash-attn` currently can't be built for torch 2.5.1. We are waiting on updates from the `flash-attn` maintainers. See https://github.com/Dao-AILab/flash-attention/issues/1302.
-
 ### Fixed
 
 - Made GCS client more robust by automatically retrying timeout errors for most operations.
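Reviewer note: with the pull_request and push triggers commented out above, the Docker workflow only runs when dispatched by hand. A minimal sketch of how one might trigger it, assuming the GitHub CLI (`gh`) is installed and authenticated against this repository:

# Manually trigger the now dispatch-only Docker workflow on main.
gh workflow run docker.yml --ref main

# List recent runs of that workflow to watch the one just started.
gh run list --workflow=docker.yml
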
diff --git a/Makefile b/Makefile
index cba52330..0f226fb9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,15 @@
 # NOTE: make sure CUDA versions match across these variables
-BASE_IMAGE = ghcr.io/allenai/pytorch:2.5.1-cuda12.1-python3.11-v2024.10.29
-CUDA_TOOLKIT_VERSION = 12.1.0
-TORCH_CUDA_VERSION = 121
+CUDA_VERSION = 12.1
+TORCH_CUDA_VERSION = $(shell echo $(CUDA_VERSION) | tr -d .)
+BASE_BUILD_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-devel
+BASE_RUNTIME_IMAGE = pytorch/pytorch:2.5.1-cuda$(CUDA_VERSION)-cudnn9-runtime
 
 # NOTE: when upgrading the nightly version you also need to upgrade the torch version specification
 # in 'pyproject.toml' to include that nightly version.
-NIGHTLY_VERSION = "2.6.0.dev20241009+cu121"
-TORCHAO_VERSION = "torchao==0.5.0"
+NIGHTLY_VERSION = "2.6.0.dev20241009+cu$(TORCH_CUDA_VERSION)"
+TORCHAO_VERSION = "0.5.0"
 MEGABLOCKS_VERSION = "megablocks[gg] @ git+https://git@github.com/epwalsh/megablocks.git@epwalsh/deps"
+FLASH_ATTN_VERSION = "2.6.3"
 
 VERSION = $(shell python src/olmo_core/version.py)
 VERSION_SHORT = $(shell python src/olmo_core/version.py short)
@@ -49,9 +51,10 @@ build :
 stable-image :
 	docker build -f src/Dockerfile \
 		--build-arg BUILDKIT_INLINE_CACHE=1 \
-		--build-arg BASE=$(BASE_IMAGE) \
-		--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
+		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
+		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
+		--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
 		--target stable \
@@ -63,9 +66,10 @@ stable-image :
 nightly-image :
 	docker build -f src/Dockerfile \
 		--build-arg BUILDKIT_INLINE_CACHE=1 \
-		--build-arg BASE=$(BASE_IMAGE) \
-		--build-arg CUDA_TOOLKIT_VERSION=$(CUDA_TOOLKIT_VERSION) \
+		--build-arg BASE_BUILD=$(BASE_BUILD_IMAGE) \
+		--build-arg BASE_RUNTIME=$(BASE_RUNTIME_IMAGE) \
 		--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
+		--build-arg FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
 		--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
 		--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
 		--build-arg NIGHTLY_VERSION=$(NIGHTLY_VERSION) \
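Reviewer note: `TORCH_CUDA_VERSION` is now derived from `CUDA_VERSION` instead of being hard-coded, so a future CUDA bump only touches one variable. A quick shell sketch of what the `$(shell ...)` call expands to, and the shape of the resulting build invocation (all values taken from the variables defined above):

# The TORCH_CUDA_VERSION derivation just strips the dot:
echo 12.1 | tr -d .    # prints: 121

# So `make stable-image` ends up passing, among other build args:
#   --build-arg BASE_BUILD=pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel
#   --build-arg TORCH_CUDA_VERSION=121
#   --build-arg FLASH_ATTN_VERSION=2.6.3
make stable-image
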
RUN echo "Built wheels:" \ @@ -37,15 +43,38 @@ RUN echo "Built wheels:" \ # Stable image ######################################################################### -FROM ${BASE} as stable - -ARG TORCH_CUDA_VERSION +FROM ${BASE_RUNTIME} as stable + +# Install system dependencies. +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + curl \ + wget \ + libxml2-dev \ + git && \ + rm -rf /var/lib/apt/lists/* + +# Install MLNX OFED user-space drivers +# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile +ENV MOFED_VER 24.01-0.3.3.1 +ENV OS_VER ubuntu22.04 +ENV PLATFORM x86_64 +RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \ + rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \ + rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz + +# Install/upgrade Python build dependencies. +RUN pip install --upgrade --no-cache-dir pip wheel packaging # Install torchao. +ARG TORCH_CUDA_VERSION ARG TORCHAO_VERSION RUN pip install --no-cache-dir \ --extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \ - ${TORCHAO_VERSION} + torchao==${TORCHAO_VERSION} # Copy and install wheels from build image. COPY --from=build /app/build /app/build @@ -68,7 +97,6 @@ WORKDIR /app/olmo-core FROM stable as nightly ARG TORCH_CUDA_VERSION - ARG NIGHTLY_VERSION RUN pip install --no-cache-dir --pre \ --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION} \