Skip to content

Commit

Permalink
Torch 2.4 in docker images (#1491)
Browse files: browse the repository at this point in the history
  • Loading branch information
snarayan21 authored Aug 30, 2024
1 parent b21cc0c commit 31c8ba2
Show file tree
Hide file tree
Showing 10 changed files with 26 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-quality.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.2.0
ref: v0.1.2
path: ./ci-testing
- uses: ./ci-testing/.github/actions/code-quality
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.2.0
ref: v0.1.2
path: ./ci-testing
- uses: ./ci-testing/.github/actions/coverage
with:
Expand Down
11 changes: 11 additions & 0 deletions .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,19 @@ jobs:
- name: "2.3.1_cu121"
base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
dep_groups: "[all]"
te_commit: b5a7c9f
- name: "2.3.1_cu121_aws"
base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
dep_groups: "[all]"
te_commit: b5a7c9f
- name: "2.4.0_cu124"
base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
dep_groups: "[all]"
te_commit: 901e5d2
- name: "2.4.0_cu124_aws"
base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
dep_groups: "[all]"
te_commit: 901e5d2
steps:

- name: Checkout
Expand Down Expand Up @@ -89,3 +99,4 @@ jobs:
BRANCH_NAME=${{ github.head_ref || github.ref_name }}
BASE_IMAGE=${{ matrix.base_image }}
DEP_GROUPS=${{ matrix.dep_groups }}
TE_COMMIT=${{ matrix.te_commit }}
2 changes: 1 addition & 1 deletion .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
- name: Checkout code
uses: actions/checkout@v2
- name: Run PR CPU Tests
uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.2.0
uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.2
with:
name: ${{ matrix.name }}
container: ${{ matrix.container }}
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ jobs:
markers: "gpu"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.2.0
ci_repo_gpu_test_ref: v0.1.2
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.2.0
uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
Expand All @@ -56,10 +56,10 @@ jobs:
markers: "gpu"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.2.0
ci_repo_gpu_test_ref: v0.1.2
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.2.0
uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
Expand All @@ -85,10 +85,10 @@ jobs:
markers: "gpu"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.2.0
ci_repo_gpu_test_ref: v0.1.2
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.2.0
uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/smoketest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.2.0
ref: v0.1.2
path: ./ci-testing
- uses: ./ci-testing/.github/actions/smoketest
with:
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ FROM $BASE_IMAGE

ARG BRANCH_NAME
ARG DEP_GROUPS
ARG TE_COMMIT

ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0"

Expand All @@ -15,7 +16,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py
RUN rm setup.py

# Install TransformerEngine
RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@b5a7c9f
RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@$TE_COMMIT

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
Expand Down
2 changes: 1 addition & 1 deletion scripts/inference/convert_hf_to_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def export_to_onnx(
ort_session = ort.InferenceSession(str(output_file))

for key, value in sample_input.items():
sample_input[key] = value.cpu().numpy()
sample_input[key] = value.cpu().numpy() # pyright: ignore

loaded_model_out = ort_session.run(None, sample_input)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
'accelerate>=0.25,<0.34', # for HF inference `device_map`
'transformers>=4.43.2,<4.44',
'mosaicml-streaming>=0.8.1,<0.9',
'torch>=2.3.0,<2.4',
'torch>=2.3.0,<2.4.1',
'datasets>=2.19,<2.20',
'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data
'sentencepiece==0.2.0',
Expand Down
2 changes: 1 addition & 1 deletion tests/models/test_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def test_onnx_export(tie_word_embeddings: bool, tmp_path: pathlib.Path):
ort_session = ort.InferenceSession(str(tmp_path / 'mpt.onnx'))

for key, value in sample_input.items():
sample_input[key] = value.cpu().numpy()
sample_input[key] = value.cpu().numpy() # pyright: ignore

loaded_model_out = ort_session.run(None, sample_input)

Expand Down

0 comments on commit 31c8ba2

Please sign in to comment.