# ------------------------------------ USAGE -----------------------------------
# The YAML configuration contains all necessary paths and parameters to download
# data, train a tokenizer, tokenize the data, train a model, and evaluate the
# model.
#
# To use a YAML config, create a copy of template_config.yaml in the
# user_configs folder and fill in the necessary parameters.
#
# All path parameters must be absolute paths. A suggested path structure is
# given in the template. The YAML config file is then passed as an argument to
# any of the scripts.
#
# For example, to train a model, run the following in the repository root
# directory:
# python3 train_model.py ./configs/user_configs/my_config.yaml
#
# Do not add a YAML config to a git commit unless you have a good reason.
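#
# A typical workflow (illustrative; this assumes the template lives in
# ./configs/, so adjust paths to your checkout):
#   cp ./configs/template_config.yaml ./configs/user_configs/my_config.yaml
#   # ...edit my_config.yaml, then:
#   python3 train_model.py ./configs/user_configs/my_config.yaml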
# -------------------- TESTING AND GENERATION CONFIGURATION --------------------
# Generation Length (int): Maximum number of tokens to produce during
# generation. Not used in the latency evaluation.
gen_len: 100
# Input Starting Strings (List[str]): Starting strings for generation.py
input_starting_strings:
- "Once upon a time, there was"
- "A long time ago, in a galaxy far, far"
- "It truly happened so suddenly! In one moment"
# CSV Path (str): Path to CSV file to generate from; leave as ~ (null) if not using
csv_path: ~
# Model Path Directory (str): Path to model to run test harness on.
# Example: "/tmp/data/models/<MODEL_NAME>/checkpoints/hf_ckpt_<CHECKPOINT_NUM>"
model_path_dir: ~
# Model Label (str): Label for the model run; all checkpoints and logs are
# saved under this path, so use a unique name. If not specified, a label is
# generated, but the generated label may not be unique. A unique name is
# critical for Slurm requeuing: if a folder with the model name already
# exists, its checkpoints may be overwritten.
# Example: 4-1-retnet-7B
model_label: ~
# Results Out Path (str): JSON file path where test harness results are stored.
# Example: "<YOUR_PATH_HERE>/data/models/<MODEL_NAME>/eval_results.json"
results_out_path: ~
# Tasks (List[str]): A list of tests to apply in the evaluation test harness.
# Note: latency evaluation is a custom solution and does not run through the
# evaluation harness.
tasks:
- "hellaswag"
- "winogrande"
- "latency"
# N-shot (int | None): The number of examples to show the model before prompting; 'None' defaults to 0
nshot: ~
# n_tokens_to_generate (int): Number of tokens to generate during a single round in latency evaluation; 'None' defaults to 10
n_tokens_to_generate: 50
# num_latency_trials (int): The number of generation trials to run at each test sequence length; 'None' defaults to 5
num_latency_trials: 10
# Test Sequence Lengths (List[int]): Sequence lengths to test; if no values are given, defaults to model.seq_len
test_sequence_lengths:
- 10
- 100
- 1000
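# Worked example: with the values above, the latency evaluation should run
# num_latency_trials (10) timed generations at each of the 3 test sequence
# lengths (30 generations total), each producing n_tokens_to_generate (50) tokens.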
# ----------------------- DATASET AND PATH CONFIGURATION -----------------------
# Checkpoint Path (str): Path to checkpoint to load for generation/inference;
# typically the "best_model_path" from training
checkpoint_path: ~
# HuggingFace Dataset Feature (str): Column of the dataset to extract text from
dataset_feature: "text"
# HuggingFace Dataset Name (str)
dataset_name: "c4"
# HuggingFace Dataset Subset (str)
dataset_subset: "c4-en"
# Models Path (str): Path to the models folder
models_path: "<YOUR_PATH_HERE>/data/models"
# Root Data Path (str): Base data directory; useful for new features that do
# not warrant a dedicated path parameter in the config
root_data_path: "<YOUR_PATH_HERE>/data"
# Raw Dataset Path (str): Path to the raw (downloaded) dataset folder
raw_dataset_path: "<YOUR_PATH_HERE>/data/datasets/c4"
# Splits (List[float]): Train, validation, and test split fractions; should sum to 1.0
splits:
- 0.7
- 0.2
- 0.1
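# Example: with the 0.7/0.2/0.1 split above, a dataset of 1,000,000 documents
# yields roughly 700,000 train, 200,000 validation, and 100,000 test documents.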
# Tokenizer Path (str): Path to the tokenizer folder
tokenizer_path: "<YOUR_PATH_HERE>/data/tokenizers/c4_tokenizer"
# Tokenized Dataset Path (str): Path to the tokenized dataset folder
tokenized_dataset_path: "<YOUR_PATH_HERE>/data/tokenized_dataset/c4"
# Emissions Outfile (Optional) (str): Name of .csv file to write tracked emissions to.
# If left empty, defaults to "emissions.csv"; writes to <models_path+model_name>/<CO2_outfile>
CO2_outfile: ~ # Example: "CO2_grid_search.csv"
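# Taken together, the defaults above imply this suggested layout under
# <YOUR_PATH_HERE>/data:
#   data/
#   |-- models/                   # models_path
#   |-- datasets/c4/              # raw_dataset_path
#   |-- tokenizers/c4_tokenizer/  # tokenizer_path
#   `-- tokenized_dataset/c4/     # tokenized_dataset_path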
# ---------------------------- DEVICE CONFIGURATION ----------------------------
# Device Type (str): Device to train on
device: "cuda" # Options: "cpu", "cuda"
# Number of GPUs (int)
num_devices: 1
# Number of Nodes (int)
num_nodes: 1
# Number of Processes (int): Number of CPU cores to use for data preprocessing
num_proc: 4
# Num Workers (int): Number of workers for dataloaders. Recommended: one less
# than the number of CPU cores available
num_workers: 0
# Strategy (str): Distributed strategy for training. Likely no need to change
strategy: "ddp" # Options: "ddp", "ddp_spawn"
# Use Slurm (bool): Whether to use Slurm for training
use_slurm: true
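# Illustrative multi-node setting (hypothetical values; match them to your
# Slurm allocation; note that PyTorch Lightning counts devices per node):
#   device: "cuda"
#   num_devices: 8
#   num_nodes: 2
#   strategy: "ddp"
#   use_slurm: true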
# --------------------------- TRAINING CONFIGURATION ---------------------------
# Accumulate Gradient Batches (int): Accumulate gradients over n batches; note
# that this interacts with every_n_train_steps (see below)
accumulate_grad_batches: 1
# Batch Size (int): Batch size for training
batch_size: 8
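# The effective (global) batch size under DDP is
#   batch_size * accumulate_grad_batches * num_devices * num_nodes
# e.g. 8 * 1 * 1 * 1 = 8 samples per optimizer step with the defaults above.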
# Early Stopping (int): Number of validations without improvement to wait
# before stopping training; 0 disables the feature; defaults to 3
early_stopping: 3
# Epochs (int): Number of epochs to train for
epochs: 1
# Gamma (float): Multiplicative decay factor for the learning rate scheduler
gamma: 0.85
# Learning Rate (float): Learning rate of model to train
learning_rate: 0.001
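# Assuming a multiplicative decay schedule (gamma is the standard decay factor
# for torch.optim.lr_scheduler schedulers such as ExponentialLR or StepLR),
# the learning rate after k scheduler steps is learning_rate * gamma^k,
# e.g. 0.001 * 0.85^2 ≈ 0.00072 after two steps.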
# Random Seed (int): Random seed for reproducibility
rand_seed: 42
# Precision (str): Precision for training; refer to Torch Lightning docs
precision: fp16
# Save Top K (int): Number of best models to save; default saves the best single checkpoint. Set to -1 to save all
save_top_k: 3
# Every N Hours (int | float): Checkpoint every n hours; when set to anything
# other than None or 0, overrides every_n_train_steps.
every_n_hours: ~
# Every N Train Steps (int): Checkpoint every n train steps; overridden by
# every_n_hours when that is set to anything other than 0 or None.
# Note: PyTorch Lightning counts "steps" as optimizer steps, not batch forward
# passes; the two differ when accumulate_grad_batches > 1.
every_n_train_steps: 10000
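# Worked example: with batch_size: 8 and accumulate_grad_batches: 1, a
# checkpoint every 10000 optimizer steps is one every 80000 samples per
# device; with accumulate_grad_batches: 2 it would be one every 160000.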
# Validation Check Interval (float): Validation frequency (fraction of an epoch)
val_check_interval: 0.5
# ---------------------------- MODEL CONFIGURATION -----------------------------
# Activation Dropout (float): Probability that an element is zeroed by the
# dropout applied after the activation between FFN layers
activation_dropout: 0.0
# Dropout (float): Probability that an element is zeroed during dropout
dropout: 0.1
# Embedding Dimension (int): Embedding dimension size of each token
embed_dim: 80
# FFN Dimension (int): Hidden layer size of Feed Forward Network (FFN)
ffn_dim: 12
# Heads (int): Number of heads; head architecture changes based on the model type
heads: 4
# Layers (int): Number of stacked network layers
layers: 2
# Model Type (str): Name of model architecture to train
model_type: "retnet" # Choices: "retnet", "transformer", "longnet"
# Sequence Length (int): Context window size in tokens
seq_len: 128
# Value Embedding Dimension (int): Dimension size of the value embeddings
value_embed_dim: 1280
# Vocabulary Size (int): Maximum vocabulary size (unique tokens in vocabulary)
vocab_size: 50000
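# Illustrative alternative sizing (hypothetical, not a repo requirement):
# transformer-style configs commonly set ffn_dim to roughly 4x embed_dim and
# keep embed_dim divisible by heads, e.g.
#   embed_dim: 768
#   ffn_dim: 3072
#   heads: 12
#   layers: 12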
# ---------------------------- LONGNET CONFIGURATION ----------------------------
# Dilated Ratio (List[int]): Dilation ratios for the model
dilated_ratio: [1, 2]
# Segment Length (List[int]): Segment lengths for the model
segment_length: [2048, 4096]
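# Per the LongNet paper, these two lists are paired element-wise (so they
# should have equal length): with the values above, segments of length 2048
# use dilation 1 and segments of length 4096 use dilation 2.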