# .github/workflows/build-huggingface.yml

# Manually triggered workflow: builds the HuggingFace TGI docker image, pushes
# it to ECR, and then exercises the pushed image against a few models.
name: Build and push HuggingFace TGI docker image

# Runs only on demand (workflow_dispatch); every input is required and has a
# quoted string default so version-like values are never re-typed by YAML.
on:
  workflow_dispatch:
    inputs:
      # Used as the upstream TGI git ref (prefixed with "v"), as a segment of
      # the Dockerfile path, and as part of the image tag.
      tgi-version:
        description: 'tgi version'
        required: true
        default: '1.1.0'
      # Appears only in the image tag within this workflow.
      pytorch-version:
        description: 'pytorch version'
        required: true
        default: '2.0.1'
      # Compact CUDA version without a dot; used in the Dockerfile path
      # ("cu<version>") and in the image tag.
      cuda-version:
        description: 'cuda version'
        required: true
        default: '118'
      # Appears only in the image tag within this workflow.
      ubuntu-version:
        description: 'ubuntu version'
        required: true
        default: '20.04'

jobs:
  # Runs on the scheduler host: obtains a runner registration token from the
  # GitHub API and passes it to start_instance.sh.
  # NOTE(review): start_instance.sh presumably launches a G5 instance and
  # registers it as a self-hosted runner — confirm against the script.
  create-runner:
    runs-on: [ self-hosted, scheduler ]
    steps:
      - name: Create new G5 instance
        id: create_gpu
        run: |
          # A `run` step without an explicit shell does not enable pipefail, so
          # a failing curl would otherwise be hidden by the jq stage after it.
          set -eo pipefail
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/awslabs/llm-hosting-container/actions/runners/registration-token \
          --fail \
          | jq -r '.token' )
          # jq prints the literal "null" when the field is absent; refuse to
          # continue with an unusable token instead of shifting the arguments
          # of start_instance.sh.
          if [[ -z "$token" || "$token" == "null" ]]; then
            echo "Failed to obtain a runner registration token" >&2
            exit 1
          fi
          ./start_instance.sh action_g5 "$token" awslabs/llm-hosting-container
    outputs:
      # NOTE(review): assumes start_instance.sh publishes a step output named
      # action_g5_instance_id — confirm against the script.
      gpu_instance_id: ${{ steps.create_gpu.outputs.action_g5_instance_id }}

  # Builds the "sagemaker" target of the TGI Dockerfile on the g5 runner and
  # pushes the resulting image to the ECR registry returned by the login step.
  build-and-push-image:
    needs: create-runner
    runs-on:
      - self-hosted
      - g5
    timeout-minutes: 150
    env:
      # Mirror the dispatch inputs as environment variables for the steps below.
      TGI_VERSION: ${{ github.event.inputs.tgi-version }}
      PYTORCH_VERSION: ${{ github.event.inputs.pytorch-version }}
      CUDA_VERSION: ${{ github.event.inputs.cuda-version }}
      UBUNTU_VERSION: ${{ github.event.inputs.ubuntu-version }}
    steps:
      # Upstream TGI sources at the requested tag land in the workspace root,
      # which is also the Docker build context used further down.
      - uses: actions/checkout@v3
        with:
          ref: v${{ env.TGI_VERSION }}
          repository: huggingface/text-generation-inference
      # Second checkout goes into a subdirectory; the Dockerfile path below is
      # resolved inside it.
      - uses: actions/checkout@v3
        with:
          path: llm-hosting-container
      - uses: docker/setup-buildx-action@v2
        name: Setup Docker buildx
        with:
          install: true
      - uses: rlespinasse/github-slug-action@v4.4.1
        name: Inject slug/short variables
      - uses: aws-actions/configure-aws-credentials@v2
        name: Configure AWS Credentials
        with:
          aws-region: us-east-1
      - id: login-ecr
        name: Login to Amazon ECR
        uses: aws-actions/amazon-ecr-login@v1
        with:
          registries: '125045733377'
      # Remove all unused images, containers, networks and volumes before the
      # build; `yes` answers the confirmation prompt.
      - name: Clean docker env
        run: |
          yes | docker system prune -a --volumes
      - name: Build and push docker image
        uses: docker/build-push-action@v4
        env:
          REPOSITORY: djl-serving
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        with:
          context: .
          file: llm-hosting-container/huggingface/pytorch/tgi/docker/${{ env.TGI_VERSION }}/py3/cu${{ env.CUDA_VERSION }}/Dockerfile.gpu
          target: sagemaker
          platforms: linux/amd64
          push: true
          provenance: false
          # Tag layout: <pytorch>-tgi<tgi>-gpu-py39-cu<cuda>-ubuntu<ubuntu>
          tags: ${{ env.REGISTRY }}/${{ env.REPOSITORY }}:${{ env.PYTORCH_VERSION }}-tgi${{ env.TGI_VERSION }}-gpu-py39-cu${{ env.CUDA_VERSION }}-ubuntu${{ env.UBUNTU_VERSION }}
          # Layer cache is read from and written to the GitHub Actions cache.
          cache-from: type=gha
          cache-to: type=gha,mode=max

  # Pulls the image pushed by build-and-push-image and checks that it can
  # serve three different models through the /invocations endpoint.
  run-tests:
    runs-on: [ self-hosted, g5 ]
    timeout-minutes: 30
    needs: [build-and-push-image, create-runner]
    env:
      TGI_VERSION: ${{github.event.inputs.tgi-version}}
      REPOSITORY: djl-serving
      # Must stay in sync with the tag layout used by the build job's `tags`.
      TAG: ${{github.event.inputs.pytorch-version}}-tgi${{github.event.inputs.tgi-version}}-gpu-py39-cu${{github.event.inputs.cuda-version}}-ubuntu${{github.event.inputs.ubuntu-version}}
    steps:
      - uses: actions/checkout@v3
      # Prunes all unused docker data, deletes a leftover Java_Corretto_jdk
      # tool directory, then blocks until no process holds the dpkg/apt locks.
      - name: Clean env
        run: |
          yes | docker system prune -a --volumes
          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
          echo "wait dpkg lock..."
          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-region: us-east-1
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1
        with:
          registries: "125045733377"
      - name: Pull docker
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          docker pull ${REGISTRY}/${REPOSITORY}:${TAG}
      # Each test step follows the same pattern: start the container detached
      # on port 8080 with SM_NUM_GPUS=4 and the model id, wait a fixed time for
      # the server to come up, POST one prompt to /invocations, fail the step
      # if the response does not start with the expected prefix, and finally
      # force-remove every container on the host.
      # The `TGI_VERSION=$TGI_VERSION` self-assignment in the scripts has no
      # effect on the container, since the variable is not passed via `-e`.
      - name: Test bloom-560m
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          set -ex
          HF_MODEL_ID=bigscience/bloom-560m && \
          SM_NUM_GPUS=4 && \
          TGI_VERSION=$TGI_VERSION && \
          docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
              -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
              ${REGISTRY}/${REPOSITORY}:${TAG}
          sleep 30
          ret=$(curl http://localhost:8080/invocations -X POST \
              -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
              -H 'Content-Type: application/json')
          [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]] && exit 1
          docker rm -f $(docker ps -aq)
      # Same flow as above with a much longer fixed wait (400 s) before the
      # request is sent.
      - name: Test gpt-neox-20b
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          set -ex
          HF_MODEL_ID=EleutherAI/gpt-neox-20b && \
          SM_NUM_GPUS=4 && \
          TGI_VERSION=$TGI_VERSION && \
          docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
              -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
              ${REGISTRY}/${REPOSITORY}:${TAG}
          sleep 400
          ret=$(curl http://localhost:8080/invocations -X POST \
              -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
              -H 'Content-Type: application/json')
          [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]] && exit 1
          docker rm -f $(docker ps -aq)
      # Same flow again; here only the leading `[{"generated_text"` key is
      # checked, the response is not required to begin with the prompt.
      - name: Test flan-t5-xxl
        env:
          REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          set -ex
          HF_MODEL_ID=google/flan-t5-xxl && \
          SM_NUM_GPUS=4 && \
          TGI_VERSION=$TGI_VERSION && \
          docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
              -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
              ${REGISTRY}/${REPOSITORY}:${TAG}
          sleep 400
          ret=$(curl http://localhost:8080/invocations -X POST \
              -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
              -H 'Content-Type: application/json')
          [[ $ret != "[{\"generated_text\""* ]] && exit 1
          docker rm -f $(docker ps -aq)
      # Runs only when an earlier step failed: removes any containers left
      # behind by the aborted test; `|| true` keeps the cleanup from failing.
      - name: On fail step
        if: ${{ failure() }}
        run: |
          docker rm -f $(docker ps -aq) || true

  # Teardown job: `if: always()` makes it run whether the jobs it depends on
  # succeeded, failed or were skipped.
  stop-runner:
    needs:
      - run-tests
      - build-and-push-image
      - create-runner
    if: always()
    runs-on:
      - self-hosted
      - scheduler
    steps:
      - name: Stop all instances
        run: |
          # Forward the instance id reported by the create-runner job to the
          # stop script that lives next to the start script.
          gpu_instance_id=${{ needs.create-runner.outputs.gpu_instance_id }}
          cd /home/ubuntu/djl_benchmark_script/scripts
          ./stop_instance.sh $gpu_instance_id