Skip to content

Commit

Permalink
Merge branch 'main' into ethan/MCLOUD-4621
Browse files Browse the repository at this point in the history
  • Loading branch information
ethanma-db authored Aug 7, 2024
2 parents 8e76bee + 84cb2ed commit 5d44b8a
Show file tree
Hide file tree
Showing 127 changed files with 9,567 additions and 4,595 deletions.
4 changes: 4 additions & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
# This includes setup.py, the README, and the CODEOWNERS file itself!
/* @mosaicml/composer-team-admins

# Require team approval for code changes
/llmfoundry/ @mosaicml/composer-team-eng
/scripts/ @mosaicml/composer-team-eng

# Require admin approval to change the CI build configuration
# All CI Changes should be reviewed for security
/.ci/ @mosaicml/composer-team-admins
Expand Down
7 changes: 7 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Dependabot configuration: weekly update checks for pip dependencies.
version: 2
updates:
  - package-ecosystem: "pip"  # See documentation for possible values
    directory: "/"  # Location of package manifests
    schedule:
      interval: "weekly"
    # Upper bound on simultaneously open Dependabot pull requests.
    open-pull-requests-limit: 5
4 changes: 2 additions & 2 deletions .github/workflows/code-quality.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ defaults:
working-directory: .
jobs:
code-quality:
runs-on: ubuntu-20.04
runs-on: linux-ubuntu-latest
timeout-minutes: 30
strategy:
matrix:
Expand All @@ -34,7 +34,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.5
ref: v0.0.9
path: ./ci-testing
- uses: ./ci-testing/.github/actions/code-quality
with:
Expand Down
50 changes: 0 additions & 50 deletions .github/workflows/codeql-analysis.yml

This file was deleted.

4 changes: 2 additions & 2 deletions .github/workflows/coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ on:
jobs:
coverage:
timeout-minutes: 5
runs-on: ubuntu-latest
runs-on: linux-ubuntu-latest
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Get composite run steps repository
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.5
ref: v0.0.9
path: ./ci-testing
- uses: ./ci-testing/.github/actions/coverage
with:
Expand Down
34 changes: 19 additions & 15 deletions .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,13 @@ jobs:
strategy:
matrix:
include:
- name: "2.3.0_cu121_flash2"
base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04
dep_groups: "[gpu-flash2]"
- name: "2.3.0_cu121_flash2_aws"
base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws
dep_groups: "[gpu-flash2]"
- name: "2.3.1_cu121"
base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
dep_groups: "[all]"
- name: "2.3.1_cu121_aws"
base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
dep_groups: "[all]"
steps:
- name: Maximize Build Space on Worker
uses: easimon/maximize-build-space@v4
with:
overprovision-lvm: true
remove-dotnet: true
remove-android: true
remove-haskell: true

- name: Checkout
uses: actions/checkout@v3
Expand All @@ -47,6 +40,13 @@ jobs:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Login to GHCR
uses: docker/login-action@v2
with:
username: ${{ secrets.GHCR_USERNAME }}
password: ${{ secrets.GHCR_TOKEN }}
registry: ghcr.io

- name: Calculate Docker Image Variables
run: |
set -euxo pipefail
Expand All @@ -60,13 +60,17 @@ jobs:
if [ "${{ github.event_name }}" == "pull_request" ]; then
echo "Triggered by pull_request event."
STAGING_REPO="mosaicml/ci-staging"
IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA}"
GHCR_STAGING_REPO="ghcr.io/databricks-mosaic/ci-staging"
GHCR_IMAGE_TAG="${GHCR_STAGING_REPO}:${{matrix.name}}-${GIT_SHA}"
IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA},${GHCR_IMAGE_TAG}"
IMAGE_CACHE="${STAGING_REPO}:${{matrix.name}}-buildcache"
else
# Triggered by push or workflow_dispatch event
echo "Triggered by ${{ github.event_name }} event, releasing to prod"
PROD_REPO="mosaicml/llm-foundry"
IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest"
GHCR_PROD_REPO="ghcr.io/databricks-mosaic/llm-foundry"
GHCR_IMAGE_TAG="${GHCR_PROD_REPO}:${{matrix.name}}-${GIT_SHA},${GHCR_PROD_REPO}:${{matrix.name}}-latest"
IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest,${GHCR_IMAGE_TAG}"
IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache"
fi
Expand Down
29 changes: 17 additions & 12 deletions .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,28 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
pytest-cpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
name: ${{ matrix.name }}
runs-on: ubuntu-latest
strategy:
matrix:
include:
- name: "cpu-2.3.0"
container: mosaicml/pytorch:2.3.0_cpu-python3.11-ubuntu20.04
- name: "cpu-2.3.1"
pip_deps: "[all-cpu]"
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: "not gpu"
pytest_command: "coverage run -m pytest"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
name: ${{ matrix.name }}
pip_deps: "[all-cpu]"
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
safe_directory: llm-foundry
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Run PR CPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
name: ${{ matrix.name }}
container: ${{ matrix.container }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
safe_directory: llm-foundry
coverage:
uses: ./.github/workflows/coverage.yaml
name: Coverage Results
Expand Down
94 changes: 79 additions & 15 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,31 +9,95 @@ on:
- main
- release/**
workflow_dispatch:
# Cancel old runs when a new commit is pushed to the same branch, unless the branch is main
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
pytest-gpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
pytest-gpu-1:
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
runs-on: linux-ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- name: "gpu-2.3.0"
container: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04
- name: "gpu-2.3.1-1"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.1.0
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud_timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
python_version: 3.9
gpu_num: 1
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
pytest-gpu-2:
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
runs-on: linux-ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- name: "gpu-2.3.1-2"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.1.0
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud_timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
python_version: 3.9
gpu_num: 2
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
pytest-gpu-4:
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud-timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
runs-on: linux-ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- name: "gpu-2.3.1-4"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.1.0
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud_timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
python_version: 3.9
gpu_num: 4
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
name: Build and Publish llm-foundry PyPI Package
needs:
- code-quality
runs-on: ubuntu-latest
runs-on: linux-ubuntu-latest
steps:
- name: Checkout source
uses: actions/checkout@v3
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/smoketest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ defaults:
working-directory: .
jobs:
smoketest:
runs-on: ubuntu-20.04
runs-on: linux-ubuntu-latest
timeout-minutes: 20
strategy:
matrix:
Expand All @@ -32,7 +32,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.5
ref: v0.0.9
path: ./ci-testing
- uses: ./ci-testing/.github/actions/smoketest
with:
Expand Down
11 changes: 0 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,6 @@ repos:
hooks:
- id: docformatter
args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80]
- repo: https://github.com/PyCQA/pydocstyle
hooks:
- id: pydocstyle
name: pydocstyle
entry: pydocstyle
language: python
types: [python]
exclude: (.ci|.github)
additional_dependencies:
- toml
rev: 6.1.1
- repo: https://github.com/adrienverge/yamllint.git
rev: v1.28.0
hooks:
Expand Down
4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ FROM $BASE_IMAGE
ARG BRANCH_NAME
ARG DEP_GROUPS

ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0"

# Check for changes in setup.py.
# If there are changes, the docker cache is invalidated and a fresh pip installation is triggered.
ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py setup.py
RUN rm setup.py

# Install TransformerEngine
RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@05eb6deb31c1b48e9f4380d18fe95f3c38e84335
RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@b5a7c9f

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
Expand Down
Loading

0 comments on commit 5d44b8a

Please sign in to comment.