# .github/workflows/e2e-nvidia-l4-x1.yml

# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA L4 x1)

on:
  # Post-merge: every commit that lands on 'main' or a release branch.
  push:
    branches:
      - 'main'
      - 'release-*'
  # Pull requests: only those targeting the branches above AND touching at
  # least one of the glob path patterns listed below.
  pull_request_target:
    branches:
      - 'main'
      - 'release-*'
    paths:
      # note this should match the merging criteria in 'mergify.yml'
      - '**.py'
      - 'pyproject.toml'
      - 'requirements**.txt'
      - '.github/workflows/e2e-nvidia-l4-x1.yml'  # This workflow

# One active run per PR (or per ref when there is no PR number);
# a newer run cancels the one still in progress.
concurrency:
  group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'
  cancel-in-progress: true

# Locale set at workflow level for all jobs.
env:
  LC_ALL: 'en_US.UTF-8'

# Every 'run' step uses bash unless the step overrides it.
defaults:
  run:
    shell: bash

# Workflow-wide token scope: read-only access to repository contents.
permissions:
  contents: read

jobs:
  # Launches the EC2 instance that is registered as the self-hosted runner
  # for the e2e job, and publishes its label / instance id as job outputs.
  start-medium-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      # Both values come from the 'start-ec2-runner' step below.
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-region: ${{ vars.AWS_REGION }}
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

      - id: start-ec2-runner
        name: Start EC2 runner
        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
        with:
          mode: 'start'
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # Instance shape and network placement of the runner.
          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
          ec2-instance-type: 'g6.8xlarge'
          subnet-id: 'subnet-02d230cffd9385bd4'
          security-group-id: 'sg-06300447c4a5fbef3'
          iam-role-name: 'instructlab-ci-runner'
          # JSON list of tags applied to the instance; records which
          # repository / ref / PR the runner was started for.
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-medium-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]
  
  # Runs the medium e2e test on the EC2 runner started by
  # 'start-medium-ec2-runner'.
  e2e-medium-test:
    needs:
      - start-medium-ec2-runner
    runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }}

    # It is important that this job has no write permissions: this part (e2e)
    # is where we are running untrusted code from PRs, so the job token is
    # granted no scopes at all. The only secret referenced in this job is
    # HF_TOKEN, which is set on the "Run e2e test" step alone.
    permissions: {}

    steps:
      - name: "Harden Runner"
        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
        with:
          egress-policy: audit

      - name: Install Packages
        run: |
          cat /etc/os-release
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout instructlab/instructlab
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "instructlab/instructlab"
          path: "instructlab"
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Checkout instructlab/training
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "instructlab/training"
          path: "training"
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      # On PR runs, replace the default checkout of 'training' with the
      # PR head so the e2e test exercises the proposed change.
      - name: Fetch and checkout PR
        if: ${{ github.event_name == 'pull_request_target' }}
        working-directory: ./training
        env:
          # Hand the PR number to the script through the environment instead
          # of expanding the expression inside the script text.
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          git fetch origin "pull/${PR_NUMBER}/head:pr-${PR_NUMBER}"
          git checkout "pr-${PR_NUMBER}"

      - name: Install ilab
        working-directory: ./instructlab
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          nvidia-smi
          python3.11 -m pip cache remove llama_cpp_python

          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -v .

          # https://github.com/instructlab/instructlab/issues/1821
          # install with Torch and build dependencies installed
          python3.11 -m pip install -v packaging wheel setuptools-scm
          # '.[cuda]' is quoted so bash cannot treat it as a filename pattern.
          python3.11 -m pip install -v ".[cuda]" -r requirements-vllm-cuda.txt

      # Install the checked-out 'training' tree into the venv created by the
      # "Install ilab" step.
      - name: Update instructlab-training library
        working-directory: ./training
        run: |
          . ../instructlab/venv/bin/activate
          pip install -v .
          # '.[cuda]' is quoted so bash cannot treat it as a filename pattern.
          pip install -v ".[cuda]"

      - name: Check disk before tests
        run: |
          df -h

      - name: Run e2e test
        working-directory: ./instructlab
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          . venv/bin/activate
          # set preserve to true so we can retain the logs
          ./scripts/e2e-ci.sh -mp

          # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
          #                Therefore we must disable the upload of the training logs, as they will not exist in the same location.
          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # Given this information, we can use the following command to find the file:
          # log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
          # mv "${log_file}" training-log.jsonl

      - name: Check disk after tests
        run: |
          df -h

      # Disabled together with the log collection in "Run e2e test" (see the
      # HACK note there).
      # - name: Upload training logs
      #   uses: actions/upload-artifact@v4
      #   with:
      #     name: training-log.jsonl
      #     path: ./instructlab/training-log.jsonl
      #     retention-days: 1
      #     overwrite: true

  # Tears down the EC2 runner. Runs regardless of how the jobs it depends on
  # ended, so the instance is not left behind after a failed test.
  stop-medium-ec2-runner:
    needs: [start-medium-ec2-runner, e2e-medium-test]
    runs-on: ubuntu-latest
    if: always()
    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-region: ${{ vars.AWS_REGION }}
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
        with:
          mode: 'stop'
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # Identify the runner using the outputs of the start job.
          label: ${{ needs.start-medium-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}

      # The steps below (loss-graph generation from the training log) are
      # currently disabled and kept for reference.

      # - name: Download loss data
      #   id: download-logs
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: training-log.jsonl
      #     path: downloaded-data

      # - name: Install dependencies
      #   run: |
      #     pip install -r requirements-dev.txt

      # - name: Try to upload to s3
      #   id: upload-s3
      #   continue-on-error: true
      #   run: |
      #     output_file='./test.md'
      #     python scripts/create-loss-graph.py  \
      #       --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
      #       --output-file "${output_file}" \
      #       --aws-region "${{ vars.AWS_REGION }}" \
      #       --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
      #       --base-branch "${{ github.event.pull_request.base.ref }}" \
      #       --pr-number "${{ github.event.pull_request.number }}" \
      #       --head-sha "${{ github.event.pull_request.head.sha }}" \
      #       --origin-repository "${{ github.repository }}"

      #     cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"

      # - name: Check S3 upload status
      #   if: steps.upload-s3.outcome == 'failure'
      #   run: |
      #     echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
      #     echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

  # Terminal marker job for the workflow.
  e2e-medium-workflow-complete:
    # Deliberately does not depend on "stop-medium-ec2-runner": a failed EC2
    # cleanup should not block PRs.
    # NOTE(review): with no 'if:' set, this job is skipped when a needed job
    # fails - confirm that whatever gates merges treats "skipped" as intended.
    needs:
      - start-medium-ec2-runner
      - e2e-medium-test
    runs-on: ubuntu-latest
    steps:
      - name: E2E Workflow Complete
        run: echo "E2E Workflow Complete"