# Build and push HuggingFace TGI docker image #48
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Manually triggered workflow. The four inputs below select the versions that
# the jobs use to locate the Dockerfile and to compose the pushed image tag.
name: Build and push HuggingFace TGI docker image

on:
  workflow_dispatch:
    inputs:
      # Also used as the git ref (prefixed with "v") of the TGI checkout.
      tgi-version:
        description: 'tgi version'
        required: true
        default: '1.1.0'
      pytorch-version:
        description: 'pytorch version'
        required: true
        default: '2.0.1'
      # Written without a dot; it is embedded as "cu<value>" in paths and tags.
      cuda-version:
        description: 'cuda version'
        required: true
        default: '118'
      ubuntu-version:
        description: 'ubuntu version'
        required: true
        default: '20.04'
# Pipeline: create-runner -> build-and-push-image -> run-tests -> stop-runner.
# stop-runner is guarded by `if: always()` so it runs even when an earlier job
# fails.
jobs:
  # Registers a new self-hosted runner via the helper script on the scheduler
  # host and exposes the id of the instance backing it.
  create-runner:
    runs-on: [self-hosted, scheduler]
    steps:
      - name: Create new G5 instance
        id: create_gpu
        env:
          # Passed through the environment so the secret is never spliced into
          # the script text itself.
          RUNNER_PAT: ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}
        run: |
          # pipefail makes a failed curl abort the step instead of handing an
          # empty or "null" token to the start script.
          set -euo pipefail
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$(curl --silent --show-error --fail -X POST \
            -H "Authorization: token ${RUNNER_PAT}" \
            https://api.github.com/repos/awslabs/llm-hosting-container/actions/runners/registration-token \
            | jq -r '.token')
          ./start_instance.sh action_g5 "$token" awslabs/llm-hosting-container
    outputs:
      # NOTE(review): presumably start_instance.sh publishes
      # `action_g5_instance_id` as a step output - confirm against the script.
      gpu_instance_id: ${{ steps.create_gpu.outputs.action_g5_instance_id }}

  # Builds the `sagemaker` target of the TGI Dockerfile kept in this repository
  # against the upstream TGI sources, then pushes the image to ECR.
  build-and-push-image:
    runs-on: [self-hosted, g5]
    timeout-minutes: 150
    needs: create-runner
    env:
      TGI_VERSION: ${{ github.event.inputs.tgi-version }}
      PYTORCH_VERSION: ${{ github.event.inputs.pytorch-version }}
      CUDA_VERSION: ${{ github.event.inputs.cuda-version }}
      UBUNTU_VERSION: ${{ github.event.inputs.ubuntu-version }}
    steps:
      # Upstream TGI sources land in the workspace root (the build context).
      - uses: actions/checkout@v3
        with:
          repository: huggingface/text-generation-inference
          ref: v${{ env.TGI_VERSION }}
      # This repository is checked out into a sub-directory; the Dockerfile is
      # read from there.
      - uses: actions/checkout@v3
        with:
          path: llm-hosting-container
      - name: Setup Docker buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true
      - name: Inject slug/short variables
        # NOTE(review): the pinned version was obscured in the reviewed copy of
        # this file; confirm the exact tag before merging.
        uses: rlespinasse/github-slug-action@v4
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-region: us-east-1
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1
        with:
          registries: "125045733377"
      - name: Clean docker env
        # --force skips the confirmation prompt (previously answered by `yes |`).
        run: docker system prune -a --volumes --force
      - name: Build and push docker image
        uses: docker/build-push-action@v4
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          REPOSITORY: djl-serving
        with:
          context: .
          file: llm-hosting-container/huggingface/pytorch/tgi/docker/${{ env.TGI_VERSION }}/py3/cu${{ env.CUDA_VERSION }}/Dockerfile.gpu
          push: true
          target: sagemaker
          platforms: 'linux/amd64'
          provenance: false
          # Must stay in sync with TAG in the run-tests job.
          tags: ${{ env.REGISTRY }}/${{ env.REPOSITORY }}:${{ env.PYTORCH_VERSION }}-tgi${{ env.TGI_VERSION }}-gpu-py39-cu${{ env.CUDA_VERSION }}-ubuntu${{ env.UBUNTU_VERSION }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

  # Pulls the freshly pushed image and smoke-tests it with three models by
  # posting a prompt to /invocations and checking the start of the response.
  run-tests:
    runs-on: [self-hosted, g5]
    timeout-minutes: 30
    needs: [build-and-push-image, create-runner]
    env:
      TGI_VERSION: ${{ github.event.inputs.tgi-version }}
      REPOSITORY: djl-serving
      # Must stay in sync with `tags` in the build-and-push-image job.
      TAG: ${{ github.event.inputs.pytorch-version }}-tgi${{ github.event.inputs.tgi-version }}-gpu-py39-cu${{ github.event.inputs.cuda-version }}-ubuntu${{ github.event.inputs.ubuntu-version }}
    steps:
      - uses: actions/checkout@v3
      - name: Clean env
        run: |
          docker system prune -a --volumes --force
          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
          echo "wait dpkg lock..."
          # Block until no process holds any of the dpkg/apt lock files.
          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-region: us-east-1
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1
        with:
          registries: "125045733377"
      - name: Pull docker
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          docker pull "${REGISTRY}/${REPOSITORY}:${TAG}"
      - name: Test bloom-560m
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          HF_MODEL_ID: bigscience/bloom-560m
          SM_NUM_GPUS: "4"
          READY_TIMEOUT_SECONDS: "120"
        run: |
          set -ex
          docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
            -e SM_NUM_GPUS="$SM_NUM_GPUS" -e HF_MODEL_ID="$HF_MODEL_ID" \
            "${REGISTRY}/${REPOSITORY}:${TAG}"
          # Retry the request until the server answers instead of sleeping for a
          # fixed time; give up once the deadline has passed.
          deadline=$((SECONDS + READY_TIMEOUT_SECONDS))
          until ret=$(curl --silent --show-error --fail http://localhost:8080/invocations -X POST \
            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
            -H 'Content-Type: application/json'); do
            if (( SECONDS >= deadline )); then
              echo "No successful response within ${READY_TIMEOUT_SECONDS}s" >&2
              exit 1
            fi
            sleep 10
          done
          # The generated text is expected to begin with the prompt itself.
          if [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]]; then
            echo "Unexpected response: $ret" >&2
            exit 1
          fi
          docker rm -f $(docker ps -aq)
      - name: Test gpt-neox-20b
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          HF_MODEL_ID: EleutherAI/gpt-neox-20b
          SM_NUM_GPUS: "4"
          READY_TIMEOUT_SECONDS: "600"
        run: |
          set -ex
          docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
            -e SM_NUM_GPUS="$SM_NUM_GPUS" -e HF_MODEL_ID="$HF_MODEL_ID" \
            "${REGISTRY}/${REPOSITORY}:${TAG}"
          # Retry the request until the server answers instead of sleeping for a
          # fixed time; give up once the deadline has passed.
          deadline=$((SECONDS + READY_TIMEOUT_SECONDS))
          until ret=$(curl --silent --show-error --fail http://localhost:8080/invocations -X POST \
            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
            -H 'Content-Type: application/json'); do
            if (( SECONDS >= deadline )); then
              echo "No successful response within ${READY_TIMEOUT_SECONDS}s" >&2
              exit 1
            fi
            sleep 10
          done
          # The generated text is expected to begin with the prompt itself.
          if [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]]; then
            echo "Unexpected response: $ret" >&2
            exit 1
          fi
          docker rm -f $(docker ps -aq)
      - name: Test flan-t5-xxl
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          HF_MODEL_ID: google/flan-t5-xxl
          SM_NUM_GPUS: "4"
          READY_TIMEOUT_SECONDS: "600"
        run: |
          set -ex
          docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
            -e SM_NUM_GPUS="$SM_NUM_GPUS" -e HF_MODEL_ID="$HF_MODEL_ID" \
            "${REGISTRY}/${REPOSITORY}:${TAG}"
          # Retry the request until the server answers instead of sleeping for a
          # fixed time; give up once the deadline has passed.
          deadline=$((SECONDS + READY_TIMEOUT_SECONDS))
          until ret=$(curl --silent --show-error --fail http://localhost:8080/invocations -X POST \
            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
            -H 'Content-Type: application/json'); do
            if (( SECONDS >= deadline )); then
              echo "No successful response within ${READY_TIMEOUT_SECONDS}s" >&2
              exit 1
            fi
            sleep 10
          done
          # Only the response envelope is checked for this model; the text is
          # not required to start with the prompt.
          if [[ $ret != "[{\"generated_text\""* ]]; then
            echo "Unexpected response: $ret" >&2
            exit 1
          fi
          docker rm -f $(docker ps -aq)
      - name: On fail step
        if: ${{ failure() }}
        run: |
          # Best-effort cleanup of containers left behind by a failed test step.
          docker rm -f $(docker ps -aq) || true

  # Stops the instance created by create-runner, regardless of how the
  # preceding jobs ended.
  stop-runner:
    if: always()
    runs-on: [self-hosted, scheduler]
    needs: [run-tests, build-and-push-image, create-runner]
    steps:
      - name: Stop all instances
        env:
          INSTANCE_ID: ${{ needs.create-runner.outputs.gpu_instance_id }}
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          # Quoted when set; expands to no argument at all when the id is empty,
          # which matches how the script was invoked before.
          ./stop_instance.sh ${INSTANCE_ID:+"$INSTANCE_ID"}