# ------------------------------------ USAGE -----------------------------------
# The YAML configuration contains all necessary paths and parameters to download
# data, train a tokenizer, tokenize the data, train a model, and evaluate the
# model.
#
# To use a YAML config, create a copy of template_config.yaml in the
# user_configs folder and fill in the necessary parameters.
#
# All path parameters must be absolute paths. A suggested path structure is
# given in the template. The YAML config file is then passed as an argument to
# any of the scripts.
#
# For example, to train a model, run the following in the repository root
# directory:
# python3 train_model.py ./configs/user_configs/my_config.yaml
#
# Do not add a YAML config to a git commit unless you have a good reason.
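#
# A typical workflow (illustrative; this assumes the template lives in
# ./configs/, so adjust paths to your checkout):
#   cp ./configs/template_config.yaml ./configs/user_configs/my_config.yaml
#   # ...edit my_config.yaml, then:
#   python3 train_model.py ./configs/user_configs/my_config.yaml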
# -------------------- TESTING AND GENERATION CONFIGURATION --------------------
# Generation Length (int): Maximum number of tokens to produce during
# generation. Not used in the latency evaluation.
gen_len: 100
# Input Starting Strings (List[str]): Starting strings for generation.py
input_starting_strings:
- "Once upon a time, there was"
- "A long time ago, in a galaxy far, far"
- "It truly happened so suddenly! In one moment"
# CSV Path (str): Path to CSV file to generate from; leave as ~ (null) if not using
csv_path: ~
# Model Path Directory (str): Path to model to run test harness on.
# Example: "/tmp/data/models/<MODEL_NAME>/checkpoints/hf_ckpt_<CHECKPOINT_NUM>"
model_path_dir: ~
# Model Label (str): Label for the model run; all checkpoints and logs are
# saved under this path, so use a unique name. If not specified, a label is
# generated, but the generated label may not be unique. A unique name is
# critical for Slurm requeuing: if a folder with the model name already
# exists, its checkpoints may be overwritten.
# Example: 4-1-retnet-7B
model_label: ~
# Results Out Path (str): JSON file path where test harness results are stored.
# Example: "<YOUR_PATH_HERE>/data/models/<MODEL_NAME>/eval_results.json"
results_out_path: ~
# Tasks (List[str]): A list of tests to apply in the evaluation test harness.
# Note: latency evaluation is a custom solution and does not run through the
# evaluation harness.
tasks:
- "hellaswag"
- "winogrande"
- "latency"
# N-shot (int | None): The number of examples to show the model before prompting; 'None' defaults to 0
nshot: ~
# n_tokens_to_generate (int): Number of tokens to generate during a single round in latency evaluation; 'None' defaults to 10
n_tokens_to_generate: 50
# num_latency_trials (int): The number of generation trials to run at each test sequence length; 'None' defaults to 5
num_latency_trials: 10
# Test Sequence Lengths (List[int]): Sequence lengths to test; if no values are given, defaults to model.seq_len
test_sequence_lengths:
- 10
- 100
- 1000
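# Worked example: with the values above, the latency evaluation should run
# num_latency_trials (10) timed generations at each of the 3 test sequence
# lengths (30 generations total), each producing n_tokens_to_generate (50) tokens.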
# ----------------------- DATASET AND PATH CONFIGURATION -----------------------
# Checkpoint Path (str): Path to checkpoint to load for generation/inference;
# typically the "best_model_path" from training
checkpoint_path: ~
# HuggingFace Dataset Feature (str): Column of the dataset to extract text from
dataset_feature: "text"
# HuggingFace Dataset Name (str)
dataset_name: "c4"
# HuggingFace Dataset Subset (str)
dataset_subset: "c4-en"
# Models Path (str): Path to the models folder
models_path: "<YOUR_PATH_HERE>/data/models"
# Root Data Path (str): Base data directory; useful for new features that do
# not warrant a dedicated path parameter in the config
root_data_path: "<YOUR_PATH_HERE>/data"
# Raw Dataset Path (str): Path to the raw (downloaded) dataset folder
raw_dataset_path: "<YOUR_PATH_HERE>/data/datasets/c4"
# Splits (List[float]): Train, validation, and test split fractions; should sum to 1.0
splits:
- 0.7
- 0.2
- 0.1
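# Example: with the 0.7/0.2/0.1 split above, a dataset of 1,000,000 documents
# yields roughly 700,000 train, 200,000 validation, and 100,000 test documents.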
# Tokenizer Path (str): Path to the tokenizer folder
tokenizer_path: "<YOUR_PATH_HERE>/data/tokenizers/c4_tokenizer"
# Tokenized Dataset Path (str): Path to the tokenized dataset folder
tokenized_dataset_path: "<YOUR_PATH_HERE>/data/tokenized_dataset/c4"
# Emissions Outfile (Optional) (str): Name of .csv file to write tracked emissions to.
# If left empty, defaults to "emissions.csv"; writes to <models_path+model_name>/<CO2_outfile>
CO2_outfile: ~ # Example: "CO2_grid_search.csv"
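# Taken together, the defaults above imply this suggested layout under
# <YOUR_PATH_HERE>/data:
#   data/
#   |-- models/                   # models_path
#   |-- datasets/c4/              # raw_dataset_path
#   |-- tokenizers/c4_tokenizer/  # tokenizer_path
#   `-- tokenized_dataset/c4/     # tokenized_dataset_path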
# ---------------------------- DEVICE CONFIGURATION ----------------------------
# Device Type (str): Device to train on
device: "cuda" # Options: "cpu", "cuda"
# Number of GPUs (int)
num_devices: 1
# Number of Nodes (int)
num_nodes: 1
# Number of Processes (int): Number of CPU cores to use for data preprocessing
num_proc: 4
# Num Workers (int): Number of workers for dataloaders. Recommended: one less
# than the number of CPU cores available
num_workers: 0
# Strategy (str): Distributed strategy for training. Likely no need to change
strategy: "ddp" # Options: "ddp", "ddp_spawn"
# Use Slurm (bool): Whether to use Slurm for training
use_slurm: true
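# Illustrative multi-node setting (hypothetical values; match them to your
# Slurm allocation; note that PyTorch Lightning counts devices per node):
#   device: "cuda"
#   num_devices: 8
#   num_nodes: 2
#   strategy: "ddp"
#   use_slurm: true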
# --------------------------- TRAINING CONFIGURATION ---------------------------
# Accumulate Gradient Batches (int): Accumulate gradients over n batches; note
# that this interacts with every_n_train_steps (see below)
accumulate_grad_batches: 1
# Batch Size (int): Batch size for training
batch_size: 8
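# The effective (global) batch size under DDP is
#   batch_size * accumulate_grad_batches * num_devices * num_nodes
# e.g. 8 * 1 * 1 * 1 = 8 samples per optimizer step with the defaults above.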
# Early Stopping (int): Number of validations without improvement to wait
# before stopping training; 0 disables the feature; defaults to 3
early_stopping: 3
# Epochs (int): Number of epochs to train for
epochs: 1
# Gamma (float): Multiplicative decay factor for the learning rate scheduler
gamma: 0.85
# Learning Rate (float): Learning rate of model to train
learning_rate: 0.001
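# Assuming a multiplicative decay schedule (gamma is the standard decay factor
# for torch.optim.lr_scheduler schedulers such as ExponentialLR or StepLR),
# the learning rate after k scheduler steps is learning_rate * gamma^k,
# e.g. 0.001 * 0.85^2 ≈ 0.00072 after two steps.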
# Random Seed (int): Random seed for reproducibility
rand_seed: 42
# Precision (str): Precision for training; refer to Torch Lightning docs
precision: fp16
# Save Top K (int): Number of best models to save; default saves the best single checkpoint. Set to -1 to save all
save_top_k: 3
# Every N Hours (int | float): Checkpoint every n hours; when set to anything
# other than None or 0, overrides every_n_train_steps.
every_n_hours: ~
# Every N Train Steps (int): Checkpoint every n train steps; overridden by
# every_n_hours when that is set to anything other than 0 or None.
# Note: PyTorch Lightning counts "steps" as optimizer steps, not batch forward
# passes; the two differ when accumulate_grad_batches > 1.
every_n_train_steps: 10000
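# Worked example: with batch_size: 8 and accumulate_grad_batches: 1, a
# checkpoint every 10000 optimizer steps is one every 80000 samples per
# device; with accumulate_grad_batches: 2 it would be one every 160000.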
# Validation Check Interval (float): Validation frequency (fraction of an epoch)
val_check_interval: 0.5
# ---------------------------- MODEL CONFIGURATION -----------------------------
# Activation Dropout (float): Probability that an element is zeroed by the
# dropout applied after the activation between FFN layers
activation_dropout: 0.0
# Dropout (float): Probability that an element is zeroed during dropout
dropout: 0.1
# Embedding Dimension (int): Embedding dimension size of each token
embed_dim: 80
# FFN Dimension (int): Hidden layer size of Feed Forward Network (FFN)
ffn_dim: 12
# Heads (int): Number of heads; head architecture changes based on the model type
heads: 4
# Layers (int): Number of stacked network layers
layers: 2
# Model Type (str): Name of model architecture to train
model_type: "retnet" # Choices: "retnet", "transformer", "longnet"
# Sequence Length (int): Context window size in tokens
seq_len: 128
# Value Embedding Dimension (int): Dimension size of the value embeddings
value_embed_dim: 1280
# Vocabulary Size (int): Maximum vocabulary size (unique tokens in vocabulary)
vocab_size: 50000
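# Illustrative alternative sizing (hypothetical, not a repo requirement):
# transformer-style configs commonly set ffn_dim to roughly 4x embed_dim and
# keep embed_dim divisible by heads, e.g.
#   embed_dim: 768
#   ffn_dim: 3072
#   heads: 12
#   layers: 12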
# ---------------------------- LONGNET CONFIGURATION ----------------------------
# Dilated Ratio (List[int]): Dilation ratios for the model
dilated_ratio: [1, 2]
# Segment Length (List[int]): Segment lengths for the model
segment_length: [2048, 4096]
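# Per the LongNet paper, these two lists are paired element-wise (so they
# should have equal length): with the values above, segments of length 2048
# use dilation 1 and segments of length 4096 use dilation 2.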