Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parallel AMD build. #2011

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 77 additions & 84 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,9 @@ jobs:
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
pytest -s -vv integration-tests

build-and-push-image-rocm:
build-and-push-image-intel:
concurrency:
group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }}
group: ${{ github.workflow }}-build-and-push-image-intel-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs:
- start-runner
Expand All @@ -198,6 +198,9 @@ jobs:
# with sigstore/fulcio when running outside of PRs.
id-token: write
security-events: write
outputs:
# env is not available in the later `container:`, but previous job outputs are.
short_sha: ${{ env.GITHUB_SHA_SHORT }}
steps:
- name: Checkout repository
uses: actions/checkout@v3
Expand Down Expand Up @@ -240,7 +243,7 @@ jobs:
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
tags: |
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
# If main, release or tag
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name != 'pull_request' }}
Expand All @@ -254,29 +257,55 @@ jobs:
ghcr.io/huggingface/text-generation-inference
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
tags: |
type=semver,pattern={{version}}-rocm
type=semver,pattern={{major}}.{{minor}}-rocm
type=raw,value=latest-rocm,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
type=semver,pattern={{version}}-intel
type=semver,pattern={{major}}.{{minor}}-intel
type=raw,value=latest-intel,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile_amd
file: Dockerfile_intel
push: true
platforms: 'linux/amd64'
build-args: |
GIT_SHA=${{ env.GITHUB_SHA }}
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-intel
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min

build-and-push-image-intel:
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- build-and-push-image
- build-and-push-image-intel
- integration-tests
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

build-and-push-image-rocm:
concurrency:
group: ${{ github.workflow }}-build-and-push-image-intel-${{ github.head_ref || github.run_id }}
group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs:
- start-runner
Expand All @@ -290,9 +319,6 @@ jobs:
# with sigstore/fulcio when running outside of PRs.
id-token: write
security-events: write
outputs:
# env is not available in the later `container:`, but previous job outputs are.
short_sha: ${{ env.GITHUB_SHA_SHORT }}
steps:
- name: Checkout repository
uses: actions/checkout@v3
Expand Down Expand Up @@ -335,7 +361,7 @@ jobs:
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
tags: |
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
# If main, release or tag
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name != 'pull_request' }}
Expand All @@ -349,85 +375,52 @@ jobs:
ghcr.io/huggingface/text-generation-inference
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
tags: |
type=semver,pattern={{version}}-intel
type=semver,pattern={{major}}.{{minor}}-intel
type=raw,value=latest-intel,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
type=semver,pattern={{version}}-rocm
type=semver,pattern={{major}}.{{minor}}-rocm
type=raw,value=latest-rocm,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile_intel
file: Dockerfile_amd
push: true
platforms: 'linux/amd64'
build-args: |
GIT_SHA=${{ env.GITHUB_SHA }}
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-intel
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-rocm
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min

stop-runner:
name: Stop self-hosted EC2 runner
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
integration-tests-rocm:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on: [amd-gpu-tgi, mi250]
needs:
- start-runner
- build-and-push-image
- build-and-push-image-rocm
- build-and-push-image-intel
- integration-tests
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
- uses: actions/setup-python@v5
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

# TODO: Move this to `build_amd.yml` (and `build_nvidia.yml`)

# integration-tests-rocm:
# concurrency:
# group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
# cancel-in-progress: true
# needs:
# - start-runner
# - build-and-push-image
# - integration-tests
# - build-and-push-image-rocm
# - stop-runner
# runs-on: [self-hosted, amd-gpu, multi-gpu, mi300]
# container:
# image: registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ needs.build-and-push-image-rocm.outputs.short_sha }}-rocm
# options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/cache
# env:
# DOCKER_VOLUME: /cache
# steps:
# - name: ROCM-SMI
# run: |
# rocm-smi
# - name: ROCM-INFO
# run: |
# rocminfo | grep "Agent" -A 14
# - name: Show ROCR environment
# run: |
# echo "ROCR: $ROCR_VISIBLE_DEVICES"
# - name: Install
# run: |
# make install-integration-tests
# - name: Run tests
# run: |
# export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# pytest -s -vv integration-tests
python-version: '3.10'
- uses: actions/checkout@v4
- name: install deps
run: |
make install-integration-tests
- name: ROCM-SMI
run: |
rocm-smi
- name: ROCM-INFO
run: |
rocminfo | grep "Agent" -A 14
- name: Show ROCR environment
run: |
echo "ROCR: $ROCR_VISIBLE_DEVICES"
- name: Run tests
run: |
export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
export DOCKER_DEVICES=/dev/kfd,/dev/dri
python -m pytest -s -vv integration-tests/models/test_flash_gpt2.py
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Rust builder
# Dummy.
FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
WORKDIR /usr/src

Expand Down
19 changes: 16 additions & 3 deletions integration-tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")


class ResponseComparator(JSONSnapshotExtension):
Expand Down Expand Up @@ -435,6 +436,7 @@ def docker_launcher(
container = client.containers.get(container_name)
container.stop()
container.wait()
client.containers.prune()
except NotFound:
pass

Expand All @@ -453,16 +455,27 @@ def docker_launcher(
if DOCKER_VOLUME:
volumes = [f"{DOCKER_VOLUME}:/data"]

if DOCKER_DEVICES:
devices = DOCKER_DEVICES.split(",")
visible = os.getenv("ROCR_VISIBLE_DEVICES")
if visible:
env["ROCR_VISIBLE_DEVICES"] = visible
device_requests = []
else:
devices = []
device_requests = [
docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
]

container = client.containers.run(
DOCKER_IMAGE,
command=args,
name=container_name,
environment=env,
auto_remove=False,
devices=devices,
device_requests=device_requests,
detach=True,
device_requests=[
docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
],
volumes=volumes,
ports={"80/tcp": port},
shm_size="1G",
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/models/test_flash_gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

@pytest.fixture(scope="module")
def flash_gpt2_handle(launcher):
with launcher("openai-community/gpt2", num_shard=2) as handle:
with launcher("openai-community/gpt2", num_shard=1) as handle:
yield handle


Expand Down
Loading