config.yml
{
# TransformerConfig
# Base config class providing general settings for non-mutability and json serialization options
#
"runner": {
# RunnerConfig
# Base config class providing general settings for non-mutability and json serialization options
# Type of the runner to be invoked.
"runner_type": "pdsh",
# Hostsfile path (in MPI style) that defines the resource pool available to the job (e.g., worker-0 slots=4)
"hostsfile": null,
# List of hosts alternative to hostsfile (e.g., worker-0 slots=4)
"hosts": null,
# (optional) Port used by PyTorch distributed for communication during training.
"master_port": 29500,
# (optional) IP address of node 0, will be inferred via 'hostname -I' if not specified.
"master_addr": null,
# User script to launch
"script": "src/scaling/transformer/train.py",
# Number of GPUs per node; used if the slot count is not defined in the hosts entries
"default_gpu_count": 8,
# docker configuration in case a docker runner type is used
"docker_config": {
# RunnerDockerConfig
# Base config class providing general settings for non-mutability and json serialization options
# Name of the docker container to be started
"docker_container": null,
# Run docker command with sudo
"docker_sudo": false,
# List of directories to be mounted in the docker under the same path
"docker_mounts": [["/mnt/", "/mnt/"]]
},
"use_determined": False
},
#
"logger": {
# LoggerConfig
# Base config class providing general settings for non-mutability and json serialization options
"metrics_ranks": [0],
#
"log_level": "info",
#
"log_dir": null,
#
"use_wandb": false,
#
"use_tensorboard": false,
# define the global ranks of the processes that write to tensorboard. If the list is omitted or null, only rank 0 will write to tensorboard.
"tensorboard_ranks": null
},
#
"topology": {
# TopologyConfig
# Base config class providing general settings for non-mutability and json serialization options
"model_parallel_size": 1,
"pipe_parallel_size": 1,
"data_parallel_size": 1,
# global train batch size including all gradient accumulation steps
"global_batch_size": null,
# Batch size for one training micro step. This is used when the global_batch_size cannot fit in GPU memory to determine the number of gradient accumulation steps.
"micro_batch_size": 2,
# Number of gradient accumulation steps. Used together with micro_batch_size when the global_batch_size cannot fit in GPU memory.
"gradient_accumulation_steps": 1,
# Method to assign layers to pipeline stages
"pipe_partition_method": "balanced",
#
"activation_checkpointing_type": "disabled"
}
,
#
"optimizer": {
# AdamWOptimizerConfig
# Base config class providing general settings for non-mutability and json serialization options
# First coefficient used for computing running averages of gradient and its square
"beta1": 0.9,
# Second coefficient used for computing running averages of gradient and its square
"beta2": 0.95,
# term added to the denominator to improve numerical stability (default: 1e-8)
"eps": 1.0e-15,
# clip gradients to this global l2 norm; deactivated if 0.0
"gradient_clipping": 0.0,
# number of floating point values to allreduce in one go
"allreduce_bucket_size": 500000000,
# Configuration of the loss scaler
"loss_scaler": {
# LossScalerConfig
# Loss scaling is designed to combat the problem of underflowing gradients encountered when
# training fp16 networks for long periods. Dynamic loss scaling begins by attempting a very high
# loss scale. Ironically, this may result in overflowing gradients.
# The optimizer then skips the update step for this particular iteration/minibatch,
# and the loss scaler adjusts the loss scale to a lower value.
# If a certain number of iterations occur without overflowing gradients detected,
# the loss scaler increases the loss scale once more.
# In this way the loss scaler attempts to "ride the edge" of
# always using the highest loss scale possible without incurring overflow.
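# Illustration with the values below (assuming the usual dynamic loss scaling semantics for these fields):
# the scale starts at initial_scale 2^32 = 4294967296; on sustained overflow the optimizer step is skipped
# and the scale is divided by factor 2.0, never falling below min_scale 1.0; after window = 1000
# consecutive overflow-free steps it is multiplied by factor 2.0 again.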
#
"enable": false,
# Initial loss scale
"initial_scale": 4294967296.0,
#
"window": 1000,
#
"hysteresis": 2,
#
"consecutive_hysteresis": false,
#
"min_scale": 1.0,
#
"factor": 2.0
}
,
# enable zero stage 1 optimizer
"zero": true
}
,
"training_groups": [
{
"group_name": "param_group",
"weight_decay": 0.001,
"learning_rate_scheduler": {
"learning_rate": 0.0001,
"learning_rate_minimum": 0.0,
"learning_rate_decay_style": "cosine",
"learning_rate_warmup_steps": 2,
"learning_rate_decay_iters": 10,
},
},
]
,
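# Note on the learning_rate_scheduler above: assuming a standard warmup-plus-decay schedule, the learning
# rate ramps up over learning_rate_warmup_steps (2) steps to learning_rate (1e-4) and then follows a
# cosine decay towards learning_rate_minimum (0.0) over learning_rate_decay_iters (10) steps.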
#
"trainer": {
# TrainerConfig
# Base config class providing general settings for non-mutability and json serialization options
# directory for saving checkpoints
"save_dir": "checkpoints",
# save a checkpoint every 'save_interval' steps to save_dir, iff save_dir is defined
"save_interval": 25,
# directory for loading checkpoints
"load_dir": null,
#
"train_iterations": 50,
#
"seed": 42,
# error out if a checkpoint could not be loaded
"assert_checkpoint_loaded": False
}
,
#
"profiler": {
# ProfilerConfig
# Base config class providing general settings for non-mutability and json serialization options
# number of steps to be timed; profiling will not run if set to 0
"profile_steps": 0,
# start the profiler after this many steps of the current process. Not starting at step 0 gives the GPUs time to (physically) warm up and only starts timing after the initial metadata has been synced
"profile_start_at_step": 10,
# output destination for the profiling results
"profiler_output": null
}
,
#
"transformer_architecture": {
# TransformerArchitectureConfig
# Transformer architecture config object containing non-mutable (constant) architecture specific configurations
"weight_tying": false, # turn off weight tying
# Size of the vocabulary before padding; this matches the vocab size of the tokenizer
"vocab_size": 128000,
# Hidden size.
"hidden_size": 256,
# Number of transformer layers
"num_layers": 4,
# Number of attention heads
"num_attention_heads": 2,
#
"rotary_embedding_base": 10000,
# Sequence length, in number of tokens per sample, on which a train job is run; at inference time the sequence length of a sample should (usually) not be exceeded.
"sequence_length": 64,
"norm_type": "rms",
"relative_position_embedding_type": "rotary_complex",
"attention_bias": False,
"mlp_type": "swiglu",
"mlp_factor": 2.5,
"mlp_bias": False,
#
"masked_softmax": {
# MaskedSoftmaxConfig
# Base config class providing general settings for non-mutability and json serialization options
# select an optimization kernel; if anything other than torch is selected, the optional gpu_optimization dependencies need to be installed
"kernel": "torch",
# Cast tensor to fp32 before softmax for higher precision; this cannot be applied for fused kernels
"softmax_in_fp32": false,
# Scale with which scores are multiplied (not divided!) before the softmax is applied. If a scale is applied, also setting softmax_in_fp32 is likely helpful.
"scale": 1.0
},
#
"layernorm": {
# LayerNormConfig
# Base config class providing general settings for non-mutability and json serialization options
# select an optimization type for the layer norm call; if anything other than torch is selected, the optional gpu_optimization dependencies need to be installed
"optimization_type": "torch",
# A value added to the denominator for numerical stability
"layernorm_epsilon": 1e-05
},
#
"precision": "bfloat16",
# dropout applied after the embedding layer
"dropout_embedding": 0.1,
# dropout applied to the attention probabilities
"dropout_attention_probs": 0.1,
# dropout applied after the attention block
"dropout_after_attention": 0.1,
# dropout applied after the MLP block
"dropout_after_mlp": 0.1,
# bitfit finetuning
"bitfit_bias_config": null,
# softprompt finetuning
"softprompt_config": null,
# adapter finetuning
"adapter_config": null,
# LoRA finetuning
"lora_config": null,
# add image encoder to input embedding
"image_encoder": false,
# dropout applied after the image encoder projection
"dropout_image_encoder": 0.0,
}
,
#
"data": {
# DataConfig
# Data config object containing non-mutable (constant) dataset specific configurations
# Training data prefix pointing to tokenized memory map
"legacy_dataset": False,
"finetuning_dataset": False,
"data_prefixes": [
"tests/transformer/files/dataset/data"
],
"use_mmap": True,
# Configuration for the blended dataset
"blended_dataset": {
# BlendedDatasetConfig
# Base config class providing general settings for non-mutability and json serialization options
# If weight_by_num_documents is True, builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. WARNING: setting this to True will override any user-provided weights
"weight_by_num_documents": true,
#
# Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True.
# when alpha = 1, the probability of sampling from a given group = n_samples / total_samples
# as alpha -> 0, the probability of sampling from all groups becomes equal, and number of documents has no effect
# as alpha -> inf, the probability of sampling from the groups with *the most samples* -> 1
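# Worked example with hypothetical document counts, assuming weights proportional to (number of documents)^alpha
# (consistent with the limits described above): for two groups with 900 and 100 documents, alpha = 1.0
# gives weights 0.9 / 0.1; alpha = 0.5 gives 30 : 10, i.e. 0.75 / 0.25; alpha = 0.0 gives 0.5 / 0.5.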
"weighted_sampler_alpha": 1.0,
# # weights of singular datasets. The list needs to have the same length and order as the datasets provided
# "weights": null,
# If True (with weight_by_num_documents set to True), this uses a modified method to build dataset weights:
# Work out the weighting of each dataset based on 'temperature' T and 'maximum' parameter K.
# l is the list of dataset sizes.
# Examples-proportional mixing sets a "limit" defined by max rate (in terms of samples).
# The sampling rate of the m'th dataset r_m is:
# r_m = min(e_m, K)/sum_n(min(e_n, K))
# where:
#   K is the limit (maximum rate),
#   e_n is the number of examples in the n'th of the N datasets,
#   e_m is the number of examples in the m'th dataset.
# This does two things:
# - Limits all datasets larger than defined limit to a fixed equal sampling rate
# - Upsamples datasets smaller than limit K to proportionally higher rate.
# We add an option for temperature scaling (with T=1 equivalent to no scaling).
# This raises r_m to the power of 1/T and normalizes all the weights. As T increases,
# the weights of proportionally smaller datasets increase (converging to equal sampling;
# for that case alpha=0 sampling should be used instead).
# See https://arxiv.org/pdf/1910.10683.pdf (page 31) for more details.
# src: https://github.com/huggingface/datasets/issues/217#issuecomment-648115586
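# Worked example with hypothetical sizes: with ep_maximum K = 100 and dataset sizes [500, 100, 20], the
# clipped sizes min(e_n, K) are [100, 100, 20], so r = [100, 100, 20] / 220 ≈ [0.455, 0.455, 0.091];
# with ep_temperature T = 2 each rate is raised to the power 1/2 and re-normalized, giving ≈ [0.41, 0.41, 0.18]
# and thus a larger share for the smallest dataset.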
"weight_examples_proportional": false,
# If set, rate limit K used in 'weight_examples_proportional'. Only has an effect if `weight_examples_proportional` = True.
"ep_maximum": null,
# Temperature value for `weight_examples_proportional`. Only has an effect if `weight_examples_proportional` = True. Temperature is inverse of alpha (as in weighted_sampler_alpha)
"ep_temperature": 1.0,
# Minimal size of the dataset.
"minimum_dataset_size": 0,
# directory to cache the blended dataset index. This only needs to be set if more than one dataset is provided.
"cache_directory": null
}
}
}