spell

dakinggg committed Apr 20, 2024
1 parent 0ac6f00 commit 11835b7
Showing 21 changed files with 41 additions and 42 deletions.
2 changes: 1 addition & 1 deletion llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py
@@ -46,7 +46,7 @@ class MegaBlocksMoE_TokPerExpert(Callback):
Args:
log_interval (int, optional): The interval on which to log (Default: 10).
log_every_layer (bool, optional): Enable logging ever layer's statisictics (True) or log
log_every_layer (bool, optional): Enable logging ever layer's statistics (True) or log
only aggregate statistics (Default: False).
all_reduce_stats (bool, optional): Enable aggregating statistics across gpus (True) or log
statistics for GPU 0 (Default: False).
15 changes: 7 additions & 8 deletions llmfoundry/eval/datasets/__init__.py
@@ -8,14 +8,13 @@
InContextLearningGenerationTaskWithAnswersDataset,
InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset,
InContextLearningSchemaTaskDataset, get_icl_task_dataloader)
from llmfoundry.eval.datasets.utils import (MultiTokenEOSCriteria,
convert_tokens_to_tensors,
get_continuation_span,
get_fewshot_sample_idxs,
make_padded_input,
stop_sequences_criteria, strip_data,
tokenizer_needs_prefix_space,
trim_context)

# isort: off
from llmfoundry.eval.datasets.utils import (
MultiTokenEOSCriteria, convert_tokens_to_tensors, get_continuation_span,
get_fewshot_sample_idxs, make_padded_input, stop_sequences_criteria,
strip_data, tokenizer_needs_prefix_space, trim_context)
# isort: on

__all__ = [
'InContextLearningDataset',
6 changes: 3 additions & 3 deletions llmfoundry/eval/metrics/nlp.py
@@ -662,7 +662,7 @@ class InContextLearningMCExpectedCalibrationError(
def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):

outputs = torch.softmax(outputs, dim=2)
probabilites = []
probabilities = []
for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
cont_tok_logits = outputs[batch_idx].index_select(dim=0,
index=cont_idx -
@@ -671,11 +671,11 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
index=cont_idx - 1)
probability = cont_tok_logits.index_select(
dim=1, index=cont_tok_targ).diagonal().mean()
probabilites.append(probability)
probabilities.append(probability)

for (start, end), gold_idx in zip(batch['choice_groupings'],
batch['gold_indices']):
subset = probabilites[start:end]
subset = probabilities[start:end]
idx_max = subset.index(max(subset))
confidence = torch.tensor(subset).max() / torch.tensor(subset).sum()

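As a toy illustration of the indexing in the metric above (synthetic shapes and values): `index_select` with `cont_idx - 1` picks the positions whose outputs predict the continuation tokens, and `diagonal()` pulls out each position's probability of its own target token.

```python
import torch

# Synthetic stand-ins: 4 sequence positions over a 5-token vocabulary.
probs = torch.softmax(torch.randn(4, 5), dim=-1)   # like outputs[batch_idx] after softmax
cont_idx = torch.tensor([2, 3])                     # continuation token positions
cont_tok_targ = torch.tensor([1, 4])                # gold token ids at those positions

# The output at position i - 1 predicts token i, hence the `- 1` shift in the metric.
cont_tok_logits = probs.index_select(dim=0, index=cont_idx - 1)
# Select each position's probability of its own target token and average.
probability = cont_tok_logits.index_select(dim=1, index=cont_tok_targ).diagonal().mean()
print(float(probability))
```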
2 changes: 1 addition & 1 deletion llmfoundry/models/hf/hf_fsdp.py
@@ -256,7 +256,7 @@ def prepare_hf_enc_dec_model_for_fsdp(model: PreTrainedModel,
if encoder_block_type == decoder_block_type:
return

# need to wrap encoder blocks separately for ProhpetNet and Marian
# need to wrap encoder blocks separately for ProphetNet and Marian
model.fsdp_wrap_fn = lambda module: isinstance(module, encoder_block_type)
model.activation_checkpointing_fn = lambda module: isinstance(
module, encoder_block_type)
2 changes: 1 addition & 1 deletion llmfoundry/models/utils/act_ckpt.py
@@ -110,7 +110,7 @@ def get_target_block_list(target_blocks: Any, max_block_idx: int) -> list:
candidate_block_ids.extend(to_add)
else:
raise ValueError(
f'target_blocks must be either a single intege, or a list of integers, or a comma separated string made of "first-n", "last-m", "middle-k", "range-i-j", or a list of mixed integers and before-mentioned strings, but got {type(target_blocks)}'
f'target_blocks must be either a single integer, or a list of integers, or a comma separated string made of "first-n", "last-m", "middle-k", "range-i-j", or a list of mixed integers and before-mentioned strings, but got {type(target_blocks)}'
)

candidate_block_ids = list(set(candidate_block_ids))
2 changes: 1 addition & 1 deletion llmfoundry/utils/model_download_utils.py
@@ -249,7 +249,7 @@ def download_from_oras(model: str,
credentials_dir (str): Path to a directory containing credentials for the registry. It is expected to contain three
files: `username`, `password`, and `registry`, each of which contains the corresponding credential.
save_dir (str): Path to the directory where files will be downloaded.
tokenizer_only (bool): If true, only download the tokenzier files.
tokenizer_only (bool): If true, only download the tokenizer files.
concurrency (int): The number of concurrent downloads to run.
"""
if shutil.which(ORAS_CLI) is None:
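The expected layout of `credentials_dir` from that docstring, sketched with placeholder values:

```python
from pathlib import Path

# Hypothetical credentials_dir: three plain-text files named username, password, and registry.
creds_dir = Path('oras_credentials')
creds_dir.mkdir(exist_ok=True)
(creds_dir / 'username').write_text('my-user')
(creds_dir / 'password').write_text('my-password')
(creds_dir / 'registry').write_text('registry.example.com')
```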
2 changes: 1 addition & 1 deletion mcli/README.md
@@ -23,4 +23,4 @@ All the details of multi-gpu and multi-node orchestration are handled automatica

## Using the MosaicML Python SDK to launch runs
You can also use the [Python SDK](https://mcli.docs.mosaicml.com/en/stable/python/hello_world.html) to launch MosaicML platform jobs.
This can be used to programatically sweep hyperparameters or orchestrate training runs within a larger pipeline.
This can be used to programmatically sweep hyperparameters or orchestrate training runs within a larger pipeline.
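As a rough sketch (the image tag, cluster name, and swept field are placeholders, and the `RunConfig` fields mirror the YAML examples in this directory):

```python
import copy

import yaml
from mcli import RunConfig, create_run

# Reuse one of the repo's training configs as the `parameters` payload.
with open('scripts/train/yamls/pretrain/mpt-1b.yaml') as f:
    base_parameters = yaml.safe_load(f)

for i, lr in enumerate((1e-4, 3e-4, 1e-3)):
    parameters = copy.deepcopy(base_parameters)
    parameters['optimizer']['lr'] = lr
    run = create_run(
        RunConfig(
            name=f'mpt-1b-lr-sweep-{i}',
            image='mosaicml/llm-foundry:2.2.1_cu121_flash2-latest',  # placeholder image tag
            compute={'cluster': 'my-cluster', 'gpus': 8},            # placeholder cluster
            integrations=[{
                'integration_type': 'git_repo',
                'git_repo': 'mosaicml/llm-foundry',
                'pip_install': '.[gpu]',
            }],
            command='cd llm-foundry/scripts && composer train/train.py /mnt/config/parameters.yaml',
            parameters=parameters,
        ))
    print(f'launched {run.name}')
```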
2 changes: 1 addition & 1 deletion mcli/mcli-1b-max-seq-len-8k.yaml
@@ -43,7 +43,7 @@ parameters:
name: mpt_causal_lm
init_device: meta
d_model: 2048
n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention
n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention
n_layers: 24
expansion_ratio: 4
max_seq_len: ${max_seq_len}
2 changes: 1 addition & 1 deletion scripts/data_prep/convert_finetuning_dataset.py
@@ -59,7 +59,7 @@ def parse_args() -> Namespace:
'--skip-preprocessing',
action='store_true',
help=
'Whether to skip preprocesing (e.g., if the dataset is already formatted correctly)'
'Whether to skip preprocessing (e.g., if the dataset is already formatted correctly)'
)
parser.add_argument(
'--out_root',
16 changes: 8 additions & 8 deletions scripts/data_prep/convert_text_to_mds.py
@@ -191,8 +191,8 @@ def get_task_args(
input_folder (str): Folder of text files to process
n_groups (int): Number of groups to split the object names into
tokenizer_name (str): Name of tokenizer to use
concat_tokens (int): Concantenate up to this many tokens
eos_text (str): Textend to append to each example to separate concatenated samples
concat_tokens (int): Concatenate up to this many tokens
eos_text (str): Text to append to each example to separate concatenated samples
bos_text (str): Text to prepend to each example to separate concatenated samples
no_wrap: (bool): Whether to let text examples wrap across multiple training examples
compression (str): The compression algorithm to use for MDS writing
@@ -219,7 +219,7 @@ def get_task_args(
def download_and_convert_starargs(args: Tuple):
"""Helper function to call download_and_convert with star args.
This helps us use download_and_convert with mutiprocessing.
This helps us use download_and_convert with multiprocessing.
"""
return download_and_convert(*args)

@@ -236,15 +236,15 @@ def download_and_convert(
compression: str,
trust_remote_code: bool,
):
"""Downloads and converts text fies to MDS format.
"""Downloads and converts text files to MDS format.
Args:
file_names (List[str]): Files to process
output_folder (str): Folder to write MDS shards to
input_folder (str): Folder of text files to process
tokenizer_name (str): Name of tokenizer to use
concat_tokens (int): Concantenate up to this many tokens
eos_text (str): Textend to append to each example to separate concatenated samples
concat_tokens (int): Concatenate up to this many tokens
eos_text (str): Text to append to each example to separate concatenated samples
bos_text (str): Text to prepend to each example to separate concatenated samples
no_wrap: (bool): Whether to let text examples wrap across multiple training examples
compression (str): The compression algorithm to use for MDS writing
@@ -375,8 +375,8 @@ def convert_text_to_mds(
tokenizer_name (str): Name of tokenizer to use
output_folder (str): Folder to write MDS shards to
input_folder (str): Folder of text files to process
concat_tokens (int): Concantenate up to this many tokens
eos_text (str): Textend to append to each example to separate concatenated samples
concat_tokens (int): Concatenate up to this many tokens
eos_text (str): Text to append to each example to separate concatenated samples
bos_text (str): Text to prepend to each example to separate concatenated samples
no_wrap: (bool): Whether to let text examples wrap across multiple training examples
compression (str): The compression algorithm to use for MDS writing
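A toy illustration of what `concat_tokens`, `bos_text`, and `eos_text` mean in these docstrings (invented token ids; the real script tokenizes text and writes the result as MDS shards):

```python
from typing import List

def concat_and_chunk(docs: List[List[int]], concat_tokens: int,
                     bos_ids: List[int], eos_ids: List[int]) -> List[List[int]]:
    """Concatenate tokenized docs, separated by bos/eos tokens, then cut into fixed-size samples."""
    stream: List[int] = []
    for doc in docs:
        stream.extend(bos_ids + doc + eos_ids)
    # Each training example is exactly `concat_tokens` tokens long.
    return [stream[i:i + concat_tokens]
            for i in range(0, len(stream) - concat_tokens + 1, concat_tokens)]

samples = concat_and_chunk([[5, 6, 7], [8, 9]], concat_tokens=4, bos_ids=[], eos_ids=[0])
assert samples == [[5, 6, 7, 0]]  # the [8, 9, 0] remainder does not fill a sample in this toy
```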
2 changes: 1 addition & 1 deletion scripts/eval/README.md
@@ -49,7 +49,7 @@ In order to do ICL evaluation you must specify a set of benchmarks you'd like to


#### ICL task YAML format
Your YAML must have a config section entitled `icl_tasks` specifying the benchmarks to evaluate againts, this can either be a list of dictionaries of the form
Your YAML must have a config section entitled `icl_tasks` specifying the benchmarks to evaluate against, this can either be a list of dictionaries of the form

```jsx
icl_tasks:
4 changes: 2 additions & 2 deletions scripts/inference/benchmarking/README.md
@@ -28,7 +28,7 @@ LLM inference consists of two stages: _prefill_ and _decode_. It's important to

During _prefill_, the model processes the input tokens/prompt/context. This is done in a single forward pass, making this stage fast, with excellent use of GPU hardware (ie. high Model Flop Utilization aka [MFU](https://github.com/mosaicml/llm-foundry/tree/main/scripts/train/benchmarking#mfu)). Typically, if people talk about LLM inference being slow, this is _not_ the stage that they are referring to.

During _decode_, the model generates output tokens one at a time, i.e. autoregressively. This requires making N forward passes of the model for N tokens. This stage is slow and inefficient, because it requires moving gigabytes of model weights and pre-filled values for every single forward pass. Here, latency scales (mostly) linearly with the number of output tokens. Why mostly linear? When generating long sequences, the quadratic memory and compute complexity of the attention operation become more prominant.
During _decode_, the model generates output tokens one at a time, i.e. autoregressively. This requires making N forward passes of the model for N tokens. This stage is slow and inefficient, because it requires moving gigabytes of model weights and pre-filled values for every single forward pass. Here, latency scales (mostly) linearly with the number of output tokens. Why mostly linear? When generating long sequences, the quadratic memory and compute complexity of the attention operation become more prominent.
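A minimal sketch of that prefill/decode split, using `gpt2` purely as a small stand-in model: the prompt is processed in one forward pass, then each new token is generated with a single-token forward pass that reuses the cached keys and values.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')   # stand-in model for illustration
model = AutoModelForCausalLM.from_pretrained('gpt2').eval()

inputs = tokenizer('LLM inference consists of two stages:', return_tensors='pt')
with torch.no_grad():
    # Prefill: one forward pass over the whole prompt, which also builds the KV cache.
    out = model(**inputs, use_cache=True)
    past_key_values = out.past_key_values
    next_token = out.logits[:, -1].argmax(dim=-1, keepdim=True)

    # Decode: one forward pass per new token, feeding only that token plus the cache.
    generated = [next_token]
    for _ in range(16):
        out = model(input_ids=next_token, past_key_values=past_key_values, use_cache=True)
        past_key_values = out.past_key_values
        next_token = out.logits[:, -1].argmax(dim=-1, keepdim=True)
        generated.append(next_token)

print(tokenizer.decode(torch.cat(generated, dim=-1)[0]))
```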

##### KV cache

@@ -132,5 +132,5 @@ The benchmark script supports calling models directly from huggingface (using `h
The analysis is done on a single A100 80GB GPU, with input length 512, and output length 64, while varying the batch size. As in previous sections, the batch sizes swept are 1, 2, 4, 8, 16, 32, 64, unless the GPU ran out of memory, in which case that point is not shown.

As seen here, both MPT-7B and MPT-30B are among the fastest for inference in the open-source community, with MPT-30B being faster than the respective LLAMA-30B model.
Among the 7B models, Falcon-7B tends to have higher througput at higher latencies than MPT-7B, though MPT-7B has higher throughput at lower latencies.
Among the 7B models, Falcon-7B tends to have higher throughput at higher latencies than MPT-7B, though MPT-7B has higher throughput at lower latencies.
Previously, we found that Falcon-7b was significantly slower than both MPT-7B and LLAMA-7B. This slow speed was due to the KV-cache not being used properly during generation, however this appears to be [fixed](https://huggingface.co/tiiuae/falcon-7b/tree/main) as of July 13, 2022.
2 changes: 1 addition & 1 deletion scripts/inference/convert_hf_to_onnx.py
@@ -160,7 +160,7 @@ def export_to_onnx(
atol=1e-2,
msg=f'output mismatch between the orig and onnx exported model',
)
print('exported model ouptut matches with unexported model!!')
print('exported model output matches with unexported model!!')

if save_object_store is not None:
print('Uploading files to object storage...')
2 changes: 1 addition & 1 deletion scripts/inference/endpoint_generate.py
@@ -42,7 +42,7 @@ def parse_args() -> Namespace:
'-i',
'--inputs',
nargs='+',
help=f'List of strings, local datafiles (starting with {utils.PROMPTFILE_PREFIX}),' +\
help=f'List of strings, local data files (starting with {utils.PROMPTFILE_PREFIX}),' +\
' and/or remote object stores'
)
parser.add_argument(
2 changes: 1 addition & 1 deletion scripts/inference/run_mpt_with_ft.py
@@ -197,7 +197,7 @@ def main():
type=int,
default=0,
choices=[0, 1, 2],
help='Whether to compute the cumulative log probsbility of sentences.' +
help='Whether to compute the cumulative log probability of sentences.' +
' 0: do not return the cumulative log probs' +
' 1: return the cumulative log probs of generated sequences' +
' 2: return the cumulative log probs of sequences')
4 changes: 2 additions & 2 deletions scripts/train/README.md
@@ -276,7 +276,7 @@ If the dataset requires a [custom preprocessing function](#custom-data-preproces
train_loader:
name: finetuning
dataset:
hf_name: mosaiml/doge-facts
hf_name: mosaicml/doge-facts
preprocessing_fn: my_data.formatting:dogefacts_prep_fn
split: train
...
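For context, a sketch of what a function like `my_data.formatting:dogefacts_prep_fn` could look like; the raw field names and the `prompt`/`response` output keys are illustrative assumptions here, not taken from this diff.

```python
# my_data/formatting.py (hypothetical module referenced by `preprocessing_fn` above)
def dogefacts_prep_fn(example: dict) -> dict:
    """Map one raw dataset row to the prompt/response form expected by the finetuning loader."""
    return {
        'prompt': f"Tell me a doge fact about {example['topic']}.\n",  # assumed raw field name
        'response': example['fact'],                                    # assumed raw field name
    }
```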
@@ -402,7 +402,7 @@ so you should be able to run the exact same YAML on 8 or 16 or 256 GPUs and get
This is nice because it means you can write device-count-agnostic training configs,
and not worry about OOM-ing or accidentally changing the optimization math.

In previous blogposts ([1](https://www.mosaicml.com/blog/farewell-oom), [2](https://www.mosaicml.com/blog/billion-parameter-gpt-training-made-easy))
In previous blog posts ([1](https://www.mosaicml.com/blog/farewell-oom), [2](https://www.mosaicml.com/blog/billion-parameter-gpt-training-made-easy))
we also demonstrated auto microbatching, which takes things a step further by letting Composer determine the `device_train_microbatch_size` on its own.
This makes our configs not only device-count-agnostic, but hardware-agnostic too!
You can try out this feature by setting `device_train_microbatch_size: auto`, but bear in mind that FSDP support is still in alpha mode
6 changes: 3 additions & 3 deletions scripts/train/benchmarking/README.md
@@ -20,7 +20,7 @@ python submit_benchmarks.py --cluster [your_mosaicml_cluster] ARGS --RUN
can be used to sweep a larger set of configurations. For example usage of `submit_benchmarks.py` see `sweep.sh` which lists all benchmarks in the tables.

> **Note**
> The `collect_results.py` will by default find all runs with `tput` in the run name. To customize this project tag, use `--project` in both the submissing and collection scripts.
> The `collect_results.py` will by default find all runs with `tput` in the run name. To customize this project tag, use `--project` in both the submission and collection scripts.

## MFU and HFU
@@ -55,7 +55,7 @@ hfu* = 4 * flops_per_seq * seq_per_sec / (gpu_num * GPU_AVAILABLE_FLOPS)
hfu = (4 * flops_per_seq + 4 * attn_flops_per_seq) * seq_per_sec / (gpu_num * GPU_AVAILABLE_FLOPS)
```

Note that these are approximations. Actual HFU would be higher since it includes the floating point operations for normalization, activation, and residual lyaers, as well as **all** recomputation. For example, our models use Flash Attention, which requires including an extra recompute factor for its recomputation in the forward pass. Therefore, the attention multipler would be 5 instead of 4.
Note that these are approximations. Actual HFU would be higher since it includes the floating point operations for normalization, activation, and residual layers, as well as **all** recomputation. For example, our models use Flash Attention, which requires including an extra recompute factor for its recomputation in the forward pass. Therefore, the attention multiplier would be 5 instead of 4.

## Results

@@ -65,7 +65,7 @@ python submit_benchmarks.py -m 13b.yaml 30b.yaml -t fp16 -b 21 21 -s 11 14 --RUN
```
This will run 8 configs for 12 steps to get throughput numbers. `python collect_results.py` can then be used to parse all output training logs and create the tables below.

Our microbatching engine enables microbatch sizes that do not divde Global Batchsize while being mathematically faithful to the global batch size. For example, a total batch size of 48, and a micro batch of 11, means we will accumulate gradients across microbatches of 11, 11, 11, 11, 4.
Our microbatching engine enables microbatch sizes that do not divide global batch size while being mathematically faithful to the global batch size. For example, a total batch size of 48, and a micro batch of 11, means we will accumulate gradients across microbatches of 11, 11, 11, 11, 4.

[comment]: # (TODO: Update tables with torch 2.0 after next Composer release)

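A short sketch of the split described in that example (a global batch of 48 consumed with a microbatch size of 11):

```python
def microbatch_split(global_batch_size: int, microbatch_size: int) -> list:
    """Return the per-step microbatch sizes used to accumulate one global batch."""
    full, remainder = divmod(global_batch_size, microbatch_size)
    return [microbatch_size] * full + ([remainder] if remainder else [])

assert microbatch_split(48, 11) == [11, 11, 11, 11, 4]
```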
4 changes: 2 additions & 2 deletions scripts/train/benchmarking/collect_results.py
@@ -150,8 +150,8 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]:
d_model = run.submitted_config.parameters['model']['d_model']
n_layers = run.submitted_config.parameters['model']['n_layers']

# mfu is approximated using thoughtput and param count
# the number of paramters is approximately the number of multiply-accumulates (MAC) in the network
# mfu is approximated using throughput and param count
# the number of parameters is approximately the number of multiply-accumulates (MAC) in the network
# each MAC has 2 FLOPs - we multiply by 2 ie 2 * n_param
# there are 3 passes of a NN (fwd, bwd, delta) - we multiply by 3 ie 2 * 3 * n_param
# this gets us FLOPs / token
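The comment block above boils down to the following approximation (the peak-FLOPS constant and throughput numbers here are illustrative assumptions): roughly 2 FLOPs per MAC, about n_param MACs per token, and 3 passes give 6 * n_param FLOPs per token.

```python
GPU_AVAILABLE_FLOPS = 312e12  # assumed A100 bf16 peak, for illustration

def approx_mfu(n_param: float, tokens_per_sec: float, gpu_num: int) -> float:
    """Model FLOPs utilization from parameter count and measured token throughput."""
    flops_per_token = 2 * 3 * n_param   # 2 FLOPs per MAC, 3 passes (fwd, bwd, delta)
    return flops_per_token * tokens_per_sec / (gpu_num * GPU_AVAILABLE_FLOPS)

# e.g. a 7e9-parameter model pushing 32,000 tokens/sec across 8 GPUs:
print(f'{approx_mfu(7e9, 32_000, 8):.1%}')  # ~53.8%
```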
2 changes: 1 addition & 1 deletion scripts/train/benchmarking/submit_benchmarks.py
@@ -205,7 +205,7 @@ def get_global_train_batch_sizes(max_seq_len: int,
if batch_sizes is None:
batch_sizes = []
if pows:
# global batch size in tokens (defualt: .5M thru 8M)
# global batch size in tokens (default: .5M thru 8M)
global_train_token_counts = [2**n for n in range(pows[0], pows[1] + 1)]
batch_sizes += [t // max_seq_len for t in global_train_token_counts
] # global batch size in samples
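Worked through with an assumed `max_seq_len` of 2048, the default token counts in that comment (roughly 0.5M through 8M) become the following sample-level batch sizes:

```python
max_seq_len = 2048                      # assumed sequence length for this example
pows = (19, 23)                         # 2**19 through 2**23 tokens, i.e. ~0.5M through ~8M
token_counts = [2**n for n in range(pows[0], pows[1] + 1)]
batch_sizes = [t // max_seq_len for t in token_counts]
print(batch_sizes)                      # [256, 512, 1024, 2048, 4096]
```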
2 changes: 1 addition & 1 deletion scripts/train/yamls/finetune/1b_local_data_sft.yaml
@@ -16,7 +16,7 @@ model:
name: mpt_causal_lm
init_device: meta
d_model: 2048
n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention
n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention
n_layers: 24
expansion_ratio: 4
max_seq_len: ${max_seq_len}
2 changes: 1 addition & 1 deletion scripts/train/yamls/pretrain/mpt-1b.yaml
@@ -11,7 +11,7 @@ model:
name: mpt_causal_lm
init_device: meta
d_model: 2048
n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention
n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention
n_layers: 24
expansion_ratio: 4
max_seq_len: ${max_seq_len}