Adds support for knowledge distillation #140

Workflow file for this run

.github/workflows/e2e-nvidia-l4-x1.yml at c0b9cf3

	# SPDX-License-Identifier: Apache-2.0

	# Display name of this workflow.
	name: E2E (NVIDIA L4 x1)

	# Events that trigger this workflow.
	on:
	  # run against every merge commit to 'main' and release branches
	  push:
	    branches:
	      - main
	      - release-*
	  # only run on PRs that touch certain paths (glob patterns, not regexes).
	  # 'pull_request_target' is used instead of 'pull_request', so the jobs
	  # below can reference repository secrets even for PRs from forks.
	  pull_request_target:
	    branches:
	      - main
	      - release-*
	    paths:
	      # note this should match the merging criteria in 'mergify.yml'
	      - '**.py'
	      - 'pyproject.toml'
	      - 'requirements**.txt'
	      - '.github/workflows/e2e-nvidia-l4-x1.yml'  # This workflow

	# One active run per workflow and PR (or per ref when there is no PR, e.g.
	# pushes); starting a newer run in the same group cancels the running one.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
	  cancel-in-progress: true

	# Environment variables available to every job and step.
	env:
	  LC_ALL: en_US.UTF-8

	# Every 'run' step uses bash unless the step overrides it.
	defaults:
	  run:
	    shell: bash

	# Default GITHUB_TOKEN permissions for all jobs: read-only repository
	# contents. Individual jobs may narrow this further.
	permissions:
	  contents: read

	jobs:
	  # Starts the EC2 instance that hosts the self-hosted runner and exposes
	  # its runner label and instance id to the jobs that depend on it.
	  start-medium-ec2-runner:
	    runs-on: ubuntu-latest
	    outputs:
	      label: ${{ steps.start-ec2-runner.outputs.label }}
	      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
	    steps:
	      - name: "Harden Runner"
	        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f  # v2.10.2
	        with:
	          egress-policy: audit

	      - name: Configure AWS credentials
	        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502  # v4.0.2
	        with:
	          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	          aws-region: ${{ vars.AWS_REGION }}

	      - name: Start EC2 runner
	        id: start-ec2-runner
	        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2  # v2.3.7
	        with:
	          mode: start
	          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
	          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
	          ec2-instance-type: g6.8xlarge
	          subnet-id: subnet-02d230cffd9385bd4
	          security-group-id: sg-06300447c4a5fbef3
	          iam-role-name: instructlab-ci-runner
	          # JSON list of tags passed to the action as a single folded string.
	          aws-resource-tags: >
	            [
	              {"Key": "Name", "Value": "instructlab-ci-github-medium-runner"},
	              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
	              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
	              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
	            ]

	  # Runs the e2e test on the runner started by 'start-medium-ec2-runner'.
	  e2e-medium-test:
	    needs:
	      - start-medium-ec2-runner
	    runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }}

	    # It is important that this job has no write permissions and as little
	    # access to secrets as possible. This part (e2e) is where we are running
	    # untrusted code from PRs.
	    # NOTE(review): the "Run e2e test" step below is given secrets.HF_TOKEN
	    # and runs inside the venv that PR code was installed into, so that
	    # token is reachable by PR code -- confirm it is scoped accordingly.
	    permissions: {}

	    steps:
	      - name: "Harden Runner"
	        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f  # v2.10.2
	        with:
	          egress-policy: audit

	      - name: Install Packages
	        run: |
	          cat /etc/os-release
	          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

	      - name: Checkout instructlab/instructlab
	        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
	        with:
	          repository: "instructlab/instructlab"
	          path: "instructlab"
	          # https://github.com/actions/checkout/issues/249
	          fetch-depth: 0

	      - name: Checkout instructlab/training
	        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
	        with:
	          repository: "instructlab/training"
	          path: "training"
	          # https://github.com/actions/checkout/issues/249
	          fetch-depth: 0

	      # On PR events, replace the default-branch checkout of 'training'
	      # with the PR head so the PR's code is what gets installed and tested.
	      - name: Fetch and checkout PR
	        if: ${{ github.event_name == 'pull_request_target' }}
	        working-directory: ./training
	        env:
	          # Pass the expression through the environment rather than
	          # interpolating it into the script text.
	          PR_NUMBER: ${{ github.event.pull_request.number }}
	        run: |
	          git fetch origin "pull/${PR_NUMBER}/head:pr-${PR_NUMBER}"
	          git checkout "pr-${PR_NUMBER}"

	      - name: Install ilab
	        working-directory: ./instructlab
	        run: |
	          export CUDA_HOME="/usr/local/cuda"
	          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
	          export PATH="$PATH:$CUDA_HOME/bin"
	          python3.11 -m venv --upgrade-deps venv
	          . venv/bin/activate
	          nvidia-smi
	          python3.11 -m pip cache remove llama_cpp_python

	          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -v .

	          # https://github.com/instructlab/instructlab/issues/1821
	          # install with Torch and build dependencies installed
	          python3.11 -m pip install -v packaging wheel setuptools-scm
	          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt

	      # Installs the checked-out 'training' tree into the venv created by
	      # the "Install ilab" step.
	      - name: Update instructlab-training library
	        working-directory: ./training
	        run: |
	          . ../instructlab/venv/bin/activate
	          pip install -v .
	          pip install -v .[cuda]

	      - name: Check disk before tests
	        run: |
	          df -h

	      - name: Run e2e test
	        working-directory: ./instructlab
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	        run: |
	          . venv/bin/activate
	          # set preserve to true so we can retain the logs
	          ./scripts/e2e-ci.sh -mp

	          # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
	          # Therefore we must disable the upload of the training logs, as they will not exist in the same location.
	          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
	          # and we know that it will be written into a directory created by `mktemp -d`.
	          # Given this information, we can use the following command to find the file:
	          # log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
	          # mv "${log_file}" training-log.jsonl

	      - name: Check disk after tests
	        run: |
	          df -h

	      # - name: Upload training logs
	      #   uses: actions/upload-artifact@v4
	      #   with:
	      #     name: training-log.jsonl
	      #     path: ./instructlab/training-log.jsonl
	      #     retention-days: 1
	      #     overwrite: true

	  # Stops the EC2 runner started by 'start-medium-ec2-runner'.
	  stop-medium-ec2-runner:
	    needs:
	      - start-medium-ec2-runner
	      - e2e-medium-test
	    runs-on: ubuntu-latest
	    # always() makes this cleanup job run even when a needed job failed
	    # or was cancelled.
	    if: ${{ always() }}
	    steps:
	      - name: "Harden Runner"
	        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f  # v2.10.2
	        with:
	          egress-policy: audit

	      - name: Configure AWS credentials
	        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502  # v4.0.2
	        with:
	          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	          aws-region: ${{ vars.AWS_REGION }}

	      - name: Stop EC2 runner
	        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2  # v2.3.7
	        with:
	          mode: stop
	          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
	          label: ${{ needs.start-medium-ec2-runner.outputs.label }}
	          ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}

	      # - name: Download loss data
	      #   id: download-logs
	      #   uses: actions/download-artifact@v4
	      #   with:
	      #     name: training-log.jsonl
	      #     path: downloaded-data

	      # - name: Install dependencies
	      #   run: |
	      #     pip install -r requirements-dev.txt

	      # - name: Try to upload to s3
	      #   id: upload-s3
	      #   continue-on-error: true
	      #   run: |
	      #     output_file='./test.md'
	      #     python scripts/create-loss-graph.py \
	      #       --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
	      #       --output-file "${output_file}" \
	      #       --aws-region "${{ vars.AWS_REGION }}" \
	      #       --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
	      #       --base-branch "${{ github.event.pull_request.base.ref }}" \
	      #       --pr-number "${{ github.event.pull_request.number }}" \
	      #       --head-sha "${{ github.event.pull_request.head.sha }}" \
	      #       --origin-repository "${{ github.repository }}"

	      #     cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"

	      # - name: Check S3 upload status
	      #   if: steps.upload-s3.outcome == 'failure'
	      #   run: |
	      #     echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
	      #     echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

	  # Marker job that completes once the runner start and the e2e test have
	  # both finished successfully.
	  e2e-medium-workflow-complete:
	    # we don't want to block PRs on failed EC2 cleanup
	    # so not requiring "stop-medium-ec2-runner" as well
	    needs: ["start-medium-ec2-runner", "e2e-medium-test"]
	    runs-on: ubuntu-latest
	    steps:
	      - name: E2E Workflow Complete
	        run: echo "E2E Workflow Complete"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Adds support for knowledge distillation #140

Workflow file

Adds support for knowledge distillation #140

Jobs

Run details

Workflow file for this run