Merge pull request instructlab#280 from JamesKunstle/smoketests
adds basic smoketests for main_ds and data_process CLI args
mergify[bot] authored Oct 25, 2024
2 parents 03d1b62 + c0f3b44 commit 466474a
Showing 2 changed files with 232 additions and 0 deletions.
20 changes: 20 additions & 0 deletions tests/README.md
@@ -0,0 +1,20 @@
## Overview

`smoketest.sh` changes into the source directory and runs the script entry points for `main_ds.py` and `data_process.py`. The tests will break if file names or locations in the source tree change.

The existing tests are smoke tests: they verify only whether training completes successfully (exits 0). This is helpful for checking that all required dependencies are installed.

The current tests enable features incrementally; the flags that distinguish them are sketched after this list:

1. No Flash Attention or Granite
2. No Granite but Flash Attention enabled
3. Granite and Flash Attention enabled
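
The three cases map directly onto the `--disable_flash_attn` and `--is_granite` flags that `smoketest.sh` passes to `main_ds.py`; roughly:

```bash
# 1. test_standard_loop_noflashattention_nogranite  -> passes --disable_flash_attn
# 2. test_standard_loop_nongranite                  -> passes neither flag
# 3. test_standard_loop                             -> passes --is_granite
```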

## Usage

The testing script can be run without parameters as `./smoketest.sh`. By default, this runs all tests with FSDP as the distributed training backend. To use the other available backend, DeepSpeed, run `./smoketest.sh deepspeed`.

The second positional argument sets the number of GPUs, e.g. `./smoketest.sh fsdp 8` runs the tests on 8 GPUs with FSDP as the distributed backend.
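
For reference, typical invocations look like this:

```bash
./smoketest.sh              # all tests, FSDP backend, default GPU count (8)
./smoketest.sh deepspeed    # all tests, DeepSpeed backend
./smoketest.sh fsdp 8       # all tests, FSDP backend, 8 GPUs
```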

> [!NOTE]
> You'll need to install the training library to run the tests. Inside a virtual environment and from the repository root, run `pip3 install -e .` to install the package in editable mode.
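
A minimal setup sketch, assuming a fresh virtual environment (the `venv` name and location are arbitrary):

```bash
# from the repository root
python3 -m venv venv            # example environment; any virtualenv works
source venv/bin/activate
pip3 install -e .               # install the training library in editable mode
./tests/smoketest.sh            # run all smoke tests with the FSDP backend
```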
212 changes: 212 additions & 0 deletions tests/smoketest.sh
@@ -0,0 +1,212 @@
#!/usr/bin/env bash
set -eux -o pipefail
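# -e: exit on first error; -u: error on unset variables;
# -x: trace commands as they run; -o pipefail: a pipeline fails if any stage fails.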

# ############### Read-only parameters ###############
MODEL_NAME="instructlab/granite-7b-lab"
# gets directory of current file.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
CORRECT_WORKING_DIR="${SCRIPT_DIR}/../src/instructlab/training/"
SAMPLE_DATA_PATH="${SCRIPT_DIR}/../sample-data/train_all_pruned_SDG.jsonl"
TMP_DIR=$(mktemp -d)
CHECKPOINTS_DIR="${TMP_DIR}/checkpoints"
DATA_DIR="${TMP_DIR}/data"
COMPUTED_DATA_PATH="${DATA_DIR}/data.jsonl"
DEFAULT_DISTRIB_FRAMEWORK='fsdp'
DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP
DEFAULT_GPUS=8
NUM_GPUS="${2:-$DEFAULT_GPUS}"

# ############### User-modifiable parameters ###############
# Change these as needed
MAX_BATCH_LEN=60000
NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.

# ############### Test Functions ###############

#######################################
# Creates directories for the precomputed datasets
# and the checkpoints that are saved during training inside
# of the temporary storage created for these tests.
# Globals:
#   CHECKPOINTS_DIR
#   DATA_DIR
# Arguments:
#   None
# Returns:
#   None
#######################################
function setup_tmpdir () {
    mkdir "$CHECKPOINTS_DIR"
    mkdir "$DATA_DIR"
}

#######################################
# Preprocesses and trims the sample data that the
# training smoke tests consume.
# Globals:
#   SAMPLE_DATA_PATH
#   DATA_DIR
#   MODEL_NAME
#   NUM_SAMPLES_TRAINED_ON
#   COMPUTED_DATA_PATH
# Arguments:
#   None
# Returns:
#   echoes number of samples trained on to standard out.
#######################################
function prepare_data () {
    # preprocesses .jsonl messages data so that it's a valid
    # input to the model (inputs tokenized, formatted with mask, etc.)
    # then, data is trimmed to a determined length to make training
    # go faster.

    python3 data_process.py \
        --data_path="$SAMPLE_DATA_PATH" \
        --data_output_path="$DATA_DIR" \
        --max_seq_len=4096 \
        --model_name_or_path="$MODEL_NAME"

    # trim data so we only keep the first 'n' samples.
    # should be enough data for training to be meaningful but not enough
    # that training takes a large amount of time.
    # the command substitution reads the file before the redirection
    # truncates it, so this in-place trim is safe.
    echo "$(head -"$NUM_SAMPLES_TRAINED_ON" "$COMPUTED_DATA_PATH")" > "$COMPUTED_DATA_PATH"

    echo "TRAINING ON $(wc -l "$COMPUTED_DATA_PATH") SAMPLES"
}

#######################################
# Clears and remakes the temporary directory where
# artifacts, such as checkpoints and logs, are stored
# during training.
# Globals:
#   CHECKPOINTS_DIR
# Arguments:
#   None
# Returns:
#   writes location of checkpoints dir to standard out.
#######################################
function _cleanup_saved_checkpoints() {
    echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR"
    rm -rf "$CHECKPOINTS_DIR"
    mkdir "$CHECKPOINTS_DIR"
}

#######################################
# Test most common training parameters with Granite
# and Flash Attention enabled.
# Globals:
#   NUM_GPUS
#   MODEL_NAME
#   COMPUTED_DATA_PATH
#   CHECKPOINTS_DIR
#   DISTRIB_FRAMEWORK
#   MAX_BATCH_LEN
# Arguments:
#   None
# Returns:
#   None
#######################################
function test_standard_loop () {
    torchrun \
        --standalone \
        --nproc_per_node="$NUM_GPUS" \
        main_ds.py \
        --model_name_or_path="$MODEL_NAME" \
        --data_path="$COMPUTED_DATA_PATH" \
        --output_dir="$CHECKPOINTS_DIR" \
        --num_epochs=1 \
        --effective_batch_size=128 \
        --save_samples=0 \
        --checkpoint_at_epoch \
        --accelerate_full_state_at_epoch \
        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
        --max_batch_len="$MAX_BATCH_LEN" \
        --is_granite
}

#######################################
# Test most common training parameters with Flash
# Attention enabled but without Granite.
# Globals:
#   NUM_GPUS
#   MODEL_NAME
#   COMPUTED_DATA_PATH
#   CHECKPOINTS_DIR
#   DISTRIB_FRAMEWORK
#   MAX_BATCH_LEN
# Arguments:
#   None
# Returns:
#   None
#######################################
function test_standard_loop_nongranite () {
    torchrun \
        --standalone \
        --nproc_per_node="$NUM_GPUS" \
        main_ds.py \
        --model_name_or_path="$MODEL_NAME" \
        --data_path="$COMPUTED_DATA_PATH" \
        --output_dir="$CHECKPOINTS_DIR" \
        --num_epochs=1 \
        --effective_batch_size=128 \
        --save_samples=0 \
        --checkpoint_at_epoch \
        --accelerate_full_state_at_epoch \
        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
        --max_batch_len="$MAX_BATCH_LEN"
        # --is_granite
}

#######################################
# Test most common training parameters without using
# Granite or Flash Attention.
# Globals:
#   NUM_GPUS
#   MODEL_NAME
#   COMPUTED_DATA_PATH
#   CHECKPOINTS_DIR
#   DISTRIB_FRAMEWORK
#   MAX_BATCH_LEN
# Arguments:
#   None
# Returns:
#   None
#######################################
function test_standard_loop_noflashattention_nogranite () {
    torchrun \
        --standalone \
        --nproc_per_node="$NUM_GPUS" \
        main_ds.py \
        --model_name_or_path="$MODEL_NAME" \
        --data_path="$COMPUTED_DATA_PATH" \
        --output_dir="$CHECKPOINTS_DIR" \
        --num_epochs=1 \
        --effective_batch_size=128 \
        --save_samples=0 \
        --checkpoint_at_epoch \
        --accelerate_full_state_at_epoch \
        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
        --max_batch_len="$MAX_BATCH_LEN" \
        --disable_flash_attn
        # --is_granite
}

function main () {

    setup_tmpdir
    trap "rm -rf $TMP_DIR" EXIT

    # NOTE (jkunstle): script is run as though it's
    # in the same source dir as main_ds and data_process.
    cd "$CORRECT_WORKING_DIR"
    echo "CURRENT WORKING DIRECTORY: $(pwd)"

    prepare_data
    test_standard_loop_noflashattention_nogranite
    _cleanup_saved_checkpoints
    test_standard_loop_nongranite
    _cleanup_saved_checkpoints
    test_standard_loop
}

main
