Build and push HuggingFace TGI docker image #47

name: Build and push HuggingFace TGI docker image

on:
  workflow_dispatch:
    inputs:
      tgi-version:
        description: 'TGI version'
        required: true
        default: '1.1.0'
      pytorch-version:
        description: 'PyTorch version'
        required: true
        default: '2.0.1'
      cuda-version:
        description: 'CUDA version (e.g. 118 for CUDA 11.8)'
        required: true
        default: '118'
      ubuntu-version:
        description: 'Ubuntu version'
        required: true
        default: '20.04'
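
# A sketch of a manual trigger with the GitHub CLI, assuming the workflow file
# lives on the repository's default branch (values shown are the defaults above):
#
#   gh workflow run "Build and push HuggingFace TGI docker image" \
#     -f tgi-version=1.1.0 -f pytorch-version=2.0.1 \
#     -f cuda-version=118 -f ubuntu-version=20.04
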
jobs:
  create-runner:
    runs-on: [ self-hosted, scheduler ]
    steps:
      - name: Create new G5 instance
        id: create_gpu
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
            https://api.github.com/repos/awslabs/llm-hosting-container/actions/runners/registration-token \
            --fail \
            | jq '.token' | tr -d '"' )
          ./start_instance.sh action_g5 $token awslabs/llm-hosting-container
    outputs:
      gpu_instance_id: ${{ steps.create_gpu.outputs.action_g5_instance_id }}
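  # create_gpu fetches a short-lived runner registration token from the GitHub
  # API and hands it to start_instance.sh, which presumably launches a G5
  # instance and registers it as a self-hosted runner. The instance id exported
  # here is what the stop-runner job below uses to terminate that instance.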
  build-and-push-image:
    runs-on: [ self-hosted, g5 ]
    timeout-minutes: 150
    needs: create-runner
    env:
      TGI_VERSION: ${{ github.event.inputs.tgi-version }}
      PYTORCH_VERSION: ${{ github.event.inputs.pytorch-version }}
      CUDA_VERSION: ${{ github.event.inputs.cuda-version }}
      UBUNTU_VERSION: ${{ github.event.inputs.ubuntu-version }}
    steps:
      - uses: actions/checkout@v3
        with:
          repository: huggingface/text-generation-inference
          ref: v${{ env.TGI_VERSION }}
      - uses: actions/checkout@v3
        with:
          path: llm-hosting-container
      - name: Setup Docker buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-region: us-east-1
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1
        with:
          registries: "125045733377"
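      # amazon-ecr-login exposes the registry hostname for the account above
      # via steps.login-ecr.outputs.registry; for us-east-1 that resolves to
      # something like 125045733377.dkr.ecr.us-east-1.amazonaws.com.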
      - name: Clean docker env
        run: |
          yes | docker system prune -a --volumes
      - name: Build and push docker image
        uses: docker/build-push-action@v4
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: djl-serving
        with:
          context: .
          file: llm-hosting-container/huggingface/pytorch/tgi/docker/${{ env.TGI_VERSION }}/py3/cu${{ env.CUDA_VERSION }}/Dockerfile.gpu
          push: true
          target: sagemaker
          platforms: 'linux/amd64'
          provenance: false
          tags: ${{ env.REGISTRY }}/${{ env.REPOSITORY }}:${{ env.PYTORCH_VERSION }}-tgi${{ env.TGI_VERSION }}-gpu-py39-cu${{ env.CUDA_VERSION }}-ubuntu${{ env.UBUNTU_VERSION }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
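      # With the default inputs, the pushed image reference comes out as, e.g.:
      #   <registry>/djl-serving:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04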
  run-tests:
    runs-on: [ self-hosted, g5 ]
    timeout-minutes: 30
    needs: [build-and-push-image, create-runner]
    env:
      TGI_VERSION: ${{ github.event.inputs.tgi-version }}
      REPOSITORY: djl-serving
      TAG: ${{ github.event.inputs.pytorch-version }}-tgi${{ github.event.inputs.tgi-version }}-gpu-py39-cu${{ github.event.inputs.cuda-version }}-ubuntu${{ github.event.inputs.ubuntu-version }}
    steps:
      - uses: actions/checkout@v3
      - name: Clean env
        run: |
          yes | docker system prune -a --volumes
          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
          echo "waiting for dpkg lock..."
          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
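      # The fuser loop blocks until the apt/dpkg lock files are released, so
      # this step does not race an unattended upgrade on the shared instance.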
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-region: us-east-1
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1
        with:
          registries: "125045733377"
      - name: Pull docker
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          docker pull ${REGISTRY}/${REPOSITORY}:${TAG}
      - name: Test bloom-560m
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          set -ex
          HF_MODEL_ID=bigscience/bloom-560m
          SM_NUM_GPUS=4
          docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
            -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
            ${REGISTRY}/${REPOSITORY}:${TAG}
          # give the container time to download and load the model
          sleep 30
          ret=$(curl http://localhost:8080/invocations -X POST \
            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
            -H 'Content-Type: application/json')
          # fail unless the response starts with the expected generated_text prefix
          [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]] && exit 1
          docker rm -f $(docker ps -aq)
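      # The checks in these test steps assume the SageMaker-style response shape
      #   [{"generated_text":"What is Deep Learning? ..."}]
      # so an error payload, or a server that is not yet up, fails the step.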
      - name: Test gpt-neox-20b
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          set -ex
          HF_MODEL_ID=EleutherAI/gpt-neox-20b
          SM_NUM_GPUS=4
          docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
            -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
            ${REGISTRY}/${REPOSITORY}:${TAG}
          # a 20B model needs considerably longer to download and shard
          sleep 400
          ret=$(curl http://localhost:8080/invocations -X POST \
            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
            -H 'Content-Type: application/json')
          [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]] && exit 1
          docker rm -f $(docker ps -aq)
      - name: Test flan-t5-xxl
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          set -ex
          HF_MODEL_ID=google/flan-t5-xxl
          SM_NUM_GPUS=4
          docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
            -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
            ${REGISTRY}/${REPOSITORY}:${TAG}
          sleep 400
          ret=$(curl http://localhost:8080/invocations -X POST \
            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
            -H 'Content-Type: application/json')
          # seq2seq output does not echo the prompt, so only check the shape
          [[ $ret != "[{\"generated_text\""* ]] && exit 1
          docker rm -f $(docker ps -aq)
      - name: On fail step
        if: ${{ failure() }}
        run: |
          docker rm -f $(docker ps -aq) || true
  stop-runner:
    if: always()
    runs-on: [ self-hosted, scheduler ]
    needs: [run-tests, build-and-push-image, create-runner]
    steps:
      - name: Stop all instances
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          instance_id=${{ needs.create-runner.outputs.gpu_instance_id }}
          ./stop_instance.sh $instance_id