Skip to content

Commit

Permalink
Internal runner ?
Browse files Browse the repository at this point in the history
  • Loading branch information
Narsil committed Jun 5, 2024
1 parent 2a48a10 commit 1390136
Showing 1 changed file with 4 additions and 106 deletions.
110 changes: 4 additions & 106 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,47 +22,11 @@ on:
- 'main'

jobs:
start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
EC2_AMI_ID: ami-0789b6925c11b1fb2
EC2_INSTANCE_TYPE: g5.12xlarge
EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
EC2_SECURITY_GROUP: sg-030175c435ac141d6
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ${{ env.EC2_AMI_ID }}
ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
subnet-id: ${{ env.EC2_SUBNET_ID }}
security-group-id: ${{ env.EC2_SECURITY_GROUP }}
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-tgi-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
build-and-push-image:
concurrency:
group: ${{ github.workflow }}-build-and-push-image-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs: start-runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [self-hosted, nvidia-gpu , multi-gpu, a10, ci]
permissions:
contents: write
packages: write
Expand Down Expand Up @@ -151,9 +115,8 @@ jobs:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs:
- start-runner
- build-and-push-image # Wait for the docker image to be built
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [self-hosted, nvidia-gpu , multi-gpu, a10, ci]
env:
DOCKER_VOLUME: /cache
steps:
Expand Down Expand Up @@ -187,10 +150,9 @@ jobs:
group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs:
- start-runner
- build-and-push-image # Wait for the main docker image to be built
- integration-tests # Wait for the main integration-tests
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [self-hosted, nvidia-gpu , multi-gpu, a10, ci]
permissions:
contents: write
packages: write
Expand Down Expand Up @@ -279,10 +241,9 @@ jobs:
group: ${{ github.workflow }}-build-and-push-image-intel-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs:
- start-runner
- build-and-push-image # Wait for the main docker image to be built
- integration-tests # Wait for the main integration-tests
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [self-hosted, nvidia-gpu , multi-gpu, a10, ci]
permissions:
contents: write
packages: write
Expand Down Expand Up @@ -368,66 +329,3 @@ jobs:
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min

stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- build-and-push-image
- build-and-push-image-rocm
- build-and-push-image-intel
- integration-tests
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

# TODO: Move this to `build_amd.yml` (and `build_nvidia.yml`)

# integration-tests-rocm:
# concurrency:
# group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
# cancel-in-progress: true
# needs:
# - start-runner
# - build-and-push-image
# - integration-tests
# - build-and-push-image-rocm
# - stop-runner
# runs-on: [self-hosted, amd-gpu, multi-gpu, mi300]
# container:
# image: registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ needs.build-and-push-image-rocm.outputs.short_sha }}-rocm
# options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/cache
# env:
# DOCKER_VOLUME: /cache
# steps:
# - name: ROCM-SMI
# run: |
# rocm-smi
# - name: ROCM-INFO
# run: |
# rocminfo | grep "Agent" -A 14
# - name: Show ROCR environment
# run: |
# echo "ROCR: $ROCR_VISIBLE_DEVICES"
# - name: Install
# run: |
# make install-integration-tests
# - name: Run tests
# run: |
# export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# pytest -s -vv integration-tests

0 comments on commit 1390136

Please sign in to comment.