From 11835b75bdc9c7d4155c501f1abb6eee589bc8c9 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sat, 20 Apr 2024 11:58:28 -0700
Subject: [PATCH] spell

---
 .../log_mbmoe_tok_per_expert_callback.py | 2 +-
 llmfoundry/eval/datasets/__init__.py | 15 +++++++--------
 llmfoundry/eval/metrics/nlp.py | 6 +++---
 llmfoundry/models/hf/hf_fsdp.py | 2 +-
 llmfoundry/models/utils/act_ckpt.py | 2 +-
 llmfoundry/utils/model_download_utils.py | 2 +-
 mcli/README.md | 2 +-
 mcli/mcli-1b-max-seq-len-8k.yaml | 2 +-
 scripts/data_prep/convert_finetuning_dataset.py | 2 +-
 scripts/data_prep/convert_text_to_mds.py | 16 ++++++++--------
 scripts/eval/README.md | 2 +-
 scripts/inference/benchmarking/README.md | 4 ++--
 scripts/inference/convert_hf_to_onnx.py | 2 +-
 scripts/inference/endpoint_generate.py | 2 +-
 scripts/inference/run_mpt_with_ft.py | 2 +-
 scripts/train/README.md | 4 ++--
 scripts/train/benchmarking/README.md | 6 +++---
 scripts/train/benchmarking/collect_results.py | 4 ++--
 scripts/train/benchmarking/submit_benchmarks.py | 2 +-
 .../train/yamls/finetune/1b_local_data_sft.yaml | 2 +-
 scripts/train/yamls/pretrain/mpt-1b.yaml | 2 +-
 21 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py b/llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py
index 48f3ba5758..89ee37cf0c 100644
--- a/llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py
+++ b/llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py
@@ -46,7 +46,7 @@ class MegaBlocksMoE_TokPerExpert(Callback):

     Args:
         log_interval (int, optional): The interval on which to log (Default: 10).
-        log_every_layer (bool, optional): Enable logging ever layer's statisictics (True) or log
+        log_every_layer (bool, optional): Enable logging ever layer's statistics (True) or log
             only aggregate statistics (Default: False).
         all_reduce_stats (bool, optional): Enable aggregating statistics across gpus (True) or log
             statistics for GPU 0 (Default: False).
diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
index 8e33e7731e..0be9882b0c 100644
--- a/llmfoundry/eval/datasets/__init__.py
+++ b/llmfoundry/eval/datasets/__init__.py
@@ -8,14 +8,13 @@
     InContextLearningGenerationTaskWithAnswersDataset,
     InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset,
     InContextLearningSchemaTaskDataset, get_icl_task_dataloader)
-from llmfoundry.eval.datasets.utils import (MultiTokenEOSCriteria,
-                                            convert_tokens_to_tensors,
-                                            get_continuation_span,
-                                            get_fewshot_sample_idxs,
-                                            make_padded_input,
-                                            stop_sequences_criteria, strip_data,
-                                            tokenizer_needs_prefix_space,
-                                            trim_context)
+
+# isort: off
+from llmfoundry.eval.datasets.utils import (
+    MultiTokenEOSCriteria, convert_tokens_to_tensors, get_continuation_span,
+    get_fewshot_sample_idxs, make_padded_input, stop_sequences_criteria,
+    strip_data, tokenizer_needs_prefix_space, trim_context)
+# isort: on

 __all__ = [
     'InContextLearningDataset',
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index 55922e28d2..f5a50721e3 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -662,7 +662,7 @@ class InContextLearningMCExpectedCalibrationError(

     def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
         outputs = torch.softmax(outputs, dim=2)
-        probabilites = []
+        probabilities = []
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
             cont_tok_logits = outputs[batch_idx].index_select(dim=0,
                                                               index=cont_idx -
@@ -671,11 +671,11 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
                                                            index=cont_idx - 1)
             probability = cont_tok_logits.index_select(
                 dim=1, index=cont_tok_targ).diagonal().mean()
-            probabilites.append(probability)
+            probabilities.append(probability)

         for (start, end), gold_idx in zip(batch['choice_groupings'],
                                           batch['gold_indices']):
-            subset = probabilites[start:end]
+            subset = probabilities[start:end]
             idx_max = subset.index(max(subset))
             confidence = torch.tensor(subset).max() / torch.tensor(subset).sum()

diff --git a/llmfoundry/models/hf/hf_fsdp.py b/llmfoundry/models/hf/hf_fsdp.py
index def8952225..87bffc3af8 100644
--- a/llmfoundry/models/hf/hf_fsdp.py
+++ b/llmfoundry/models/hf/hf_fsdp.py
@@ -256,7 +256,7 @@ def prepare_hf_enc_dec_model_for_fsdp(model: PreTrainedModel,
     if encoder_block_type == decoder_block_type:
         return

-    # need to wrap encoder blocks separately for ProhpetNet and Marian
+    # need to wrap encoder blocks separately for ProphetNet and Marian
     model.fsdp_wrap_fn = lambda module: isinstance(module, encoder_block_type)
     model.activation_checkpointing_fn = lambda module: isinstance(
         module, encoder_block_type)
diff --git a/llmfoundry/models/utils/act_ckpt.py b/llmfoundry/models/utils/act_ckpt.py
index 9f1b235db1..ef9a851a09 100644
--- a/llmfoundry/models/utils/act_ckpt.py
+++ b/llmfoundry/models/utils/act_ckpt.py
@@ -110,7 +110,7 @@ def get_target_block_list(target_blocks: Any, max_block_idx: int) -> list:
             candidate_block_ids.extend(to_add)
     else:
         raise ValueError(
-            f'target_blocks must be either a single intege, or a list of integers, or a comma separated string made of "first-n", "last-m", "middle-k", "range-i-j", or a list of mixed integers and before-mentioned strings, but got {type(target_blocks)}'
+            f'target_blocks must be either a single integer, or a list of integers, or a comma separated string made of "first-n", "last-m", "middle-k", "range-i-j", or a list of mixed integers and before-mentioned strings, but got {type(target_blocks)}'
         )

     candidate_block_ids = list(set(candidate_block_ids))
diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py
index a88e02a33a..3707da3883 100644
--- a/llmfoundry/utils/model_download_utils.py
+++ b/llmfoundry/utils/model_download_utils.py
@@ -249,7 +249,7 @@ def download_from_oras(model: str,
         credentials_dir (str): Path to a directory containing credentials for the registry. It is expected to contain three
             files: `username`, `password`, and `registry`, each of which contains the corresponding credential.
         save_dir (str): Path to the directory where files will be downloaded.
-        tokenizer_only (bool): If true, only download the tokenzier files.
+        tokenizer_only (bool): If true, only download the tokenizer files.
         concurrency (int): The number of concurrent downloads to run.
     """
     if shutil.which(ORAS_CLI) is None:
diff --git a/mcli/README.md b/mcli/README.md
index ced3c42adc..59fb723f57 100644
--- a/mcli/README.md
+++ b/mcli/README.md
@@ -23,4 +23,4 @@ All the details of multi-gpu and multi-node orchestration are handled automatica

 ## Using the MosaicML Python SDK to launch runs
 You can also use the [Python SDK](https://mcli.docs.mosaicml.com/en/stable/python/hello_world.html) to launch MosaicML platform jobs.
-This can be used to programatically sweep hyperparameters or orchestrate training runs within a larger pipeline.
+This can be used to programmatically sweep hyperparameters or orchestrate training runs within a larger pipeline.
diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml
index 33a891c058..3c7c62f7d4 100644
--- a/mcli/mcli-1b-max-seq-len-8k.yaml
+++ b/mcli/mcli-1b-max-seq-len-8k.yaml
@@ -43,7 +43,7 @@ parameters:
     name: mpt_causal_lm
     init_device: meta
     d_model: 2048
-    n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention
+    n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention
     n_layers: 24
     expansion_ratio: 4
     max_seq_len: ${max_seq_len}
diff --git a/scripts/data_prep/convert_finetuning_dataset.py b/scripts/data_prep/convert_finetuning_dataset.py
index e78e76a912..fb6bde4115 100644
--- a/scripts/data_prep/convert_finetuning_dataset.py
+++ b/scripts/data_prep/convert_finetuning_dataset.py
@@ -59,7 +59,7 @@ def parse_args() -> Namespace:
         '--skip-preprocessing',
         action='store_true',
         help=
-        'Whether to skip preprocesing (e.g., if the dataset is already formatted correctly)'
+        'Whether to skip preprocessing (e.g., if the dataset is already formatted correctly)'
     )
     parser.add_argument(
         '--out_root',
diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index be986fc24d..636e85abed 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -191,8 +191,8 @@ def get_task_args(
         input_folder (str): Folder of text files to process
         n_groups (int): Number of groups to split the object names into
         tokenizer_name (str): Name of tokenizer to use
-        concat_tokens (int): Concantenate up to this many tokens
-        eos_text (str): Textend to append to each example to separate concatenated samples
+        concat_tokens (int): Concatenate up to this many tokens
+        eos_text (str): Text to append to each example to separate concatenated samples
         bos_text (str): Text to prepend to each example to separate concatenated samples
         no_wrap: (bool): Whether to let text examples wrap across multiple training examples
         compression (str): The compression algorithm to use for MDS writing
@@ -219,7 +219,7 @@ def get_task_args(
 def download_and_convert_starargs(args: Tuple):
     """Helper function to call download_and_convert with star args.

-    This helps us use download_and_convert with mutiprocessing.
+    This helps us use download_and_convert with multiprocessing.
     """
     return download_and_convert(*args)

@@ -236,15 +236,15 @@ def download_and_convert(
     compression: str,
     trust_remote_code: bool,
 ):
-    """Downloads and converts text fies to MDS format.
+    """Downloads and converts text files to MDS format.

     Args:
         file_names (List[str]): Files to process
         output_folder (str): Folder to write MDS shards to
         input_folder (str): Folder of text files to process
         tokenizer_name (str): Name of tokenizer to use
-        concat_tokens (int): Concantenate up to this many tokens
-        eos_text (str): Textend to append to each example to separate concatenated samples
+        concat_tokens (int): Concatenate up to this many tokens
+        eos_text (str): Text to append to each example to separate concatenated samples
         bos_text (str): Text to prepend to each example to separate concatenated samples
         no_wrap: (bool): Whether to let text examples wrap across multiple training examples
         compression (str): The compression algorithm to use for MDS writing
@@ -375,8 +375,8 @@ def convert_text_to_mds(
         tokenizer_name (str): Name of tokenizer to use
         output_folder (str): Folder to write MDS shards to
         input_folder (str): Folder of text files to process
-        concat_tokens (int): Concantenate up to this many tokens
-        eos_text (str): Textend to append to each example to separate concatenated samples
+        concat_tokens (int): Concatenate up to this many tokens
+        eos_text (str): Text to append to each example to separate concatenated samples
         bos_text (str): Text to prepend to each example to separate concatenated samples
         no_wrap: (bool): Whether to let text examples wrap across multiple training examples
         compression (str): The compression algorithm to use for MDS writing
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
index ba2e9e2c79..488dd50a3a 100644
--- a/scripts/eval/README.md
+++ b/scripts/eval/README.md
@@ -49,7 +49,7 @@ In order to do ICL evaluation you must specify a set of benchmarks you'd like to

 #### ICL task YAML format

-Your YAML must have a config section entitled `icl_tasks` specifying the benchmarks to evaluate againts, this can either be a list of dictionaries of the form
+Your YAML must have a config section entitled `icl_tasks` specifying the benchmarks to evaluate against, this can either be a list of dictionaries of the form

 ```jsx
 icl_tasks:
diff --git a/scripts/inference/benchmarking/README.md b/scripts/inference/benchmarking/README.md
index 837154a977..b3f01256ac 100644
--- a/scripts/inference/benchmarking/README.md
+++ b/scripts/inference/benchmarking/README.md
@@ -28,7 +28,7 @@ LLM inference consists of two stages: _prefill_ and _decode_. It's important to

 During _prefill_, the model processes the input tokens/prompt/context. This is done in a single forward pass, making this stage fast, with excellent use of GPU hardware (ie. high Model Flop Utilization aka [MFU](https://github.com/mosaicml/llm-foundry/tree/main/scripts/train/benchmarking#mfu)). Typically, if people talk about LLM inference being slow, this is _not_ the stage that they are referring to.

-During _decode_, the model generates output tokens one at a time, i.e. autoregressively. This requires making N forward passes of the model for N tokens. This stage is slow and inefficient, because it requires moving gigabytes of model weights and pre-filled values for every single forward pass. Here, latency scales (mostly) linearly with the number of output tokens. Why mostly linear? When generating long sequences, the quadratic memory and compute complexity of the attention operation become more prominant.
+During _decode_, the model generates output tokens one at a time, i.e. autoregressively. This requires making N forward passes of the model for N tokens. This stage is slow and inefficient, because it requires moving gigabytes of model weights and pre-filled values for every single forward pass. Here, latency scales (mostly) linearly with the number of output tokens. Why mostly linear? When generating long sequences, the quadratic memory and compute complexity of the attention operation become more prominent.

 ##### KV cache

@@ -132,5 +132,5 @@ The benchmark script supports calling models directly from huggingface (using `h

 The analysis is done on a single A100 80GB GPU, with input length 512, and output length 64, while varying the batch size. As in previous sections, the batch sizes swept are 1, 2, 4, 8, 16, 32, 64, unless the GPU ran out of memory, in which case that point is not shown.
 As seen here, both MPT-7B and MPT-30B are among the fastest for inference in the open-source community, with MPT-30B being faster than the respective LLAMA-30B model.
-Among the 7B models, Falcon-7B tends to have higher througput at higher latencies than MPT-7B, though MPT-7B has higher throughput at lower latencies.
+Among the 7B models, Falcon-7B tends to have higher throughput at higher latencies than MPT-7B, though MPT-7B has higher throughput at lower latencies.
 Previously, we found that Falcon-7b was significantly slower than both MPT-7B and LLAMA-7B. This slow speed was due to the KV-cache not being used properly during generation, however this appears to be [fixed](https://huggingface.co/tiiuae/falcon-7b/tree/main) as of July 13, 2022.
diff --git a/scripts/inference/convert_hf_to_onnx.py b/scripts/inference/convert_hf_to_onnx.py
index 1ba1123c86..9d1841b12f 100644
--- a/scripts/inference/convert_hf_to_onnx.py
+++ b/scripts/inference/convert_hf_to_onnx.py
@@ -160,7 +160,7 @@ def export_to_onnx(
         atol=1e-2,
         msg=f'output mismatch between the orig and onnx exported model',
     )
-    print('exported model ouptut matches with unexported model!!')
+    print('exported model output matches with unexported model!!')

     if save_object_store is not None:
         print('Uploading files to object storage...')
diff --git a/scripts/inference/endpoint_generate.py b/scripts/inference/endpoint_generate.py
index e78fecf59b..e6f9ae1448 100644
--- a/scripts/inference/endpoint_generate.py
+++ b/scripts/inference/endpoint_generate.py
@@ -42,7 +42,7 @@ def parse_args() -> Namespace:
         '-i',
         '--inputs',
         nargs='+',
-        help=f'List of strings, local datafiles (starting with {utils.PROMPTFILE_PREFIX}),' +\
+        help=f'List of strings, local data files (starting with {utils.PROMPTFILE_PREFIX}),' +\
         ' and/or remote object stores'
     )
     parser.add_argument(
diff --git a/scripts/inference/run_mpt_with_ft.py b/scripts/inference/run_mpt_with_ft.py
index 10ccf6b78b..61d9f68d2c 100644
--- a/scripts/inference/run_mpt_with_ft.py
+++ b/scripts/inference/run_mpt_with_ft.py
@@ -197,7 +197,7 @@ def main():
         type=int,
         default=0,
         choices=[0, 1, 2],
-        help='Whether to compute the cumulative log probsbility of sentences.' +
+        help='Whether to compute the cumulative log probability of sentences.' +
         ' 0: do not return the cumulative log probs' +
         ' 1: return the cumulative log probs of generated sequences' +
         ' 2: return the cumulative log probs of sequences')
diff --git a/scripts/train/README.md b/scripts/train/README.md
index 36974ec943..6730cb793b 100644
--- a/scripts/train/README.md
+++ b/scripts/train/README.md
@@ -276,7 +276,7 @@ If the dataset requires a [custom preprocessing function](#custom-data-preproces
 train_loader:
   name: finetuning
   dataset:
-    hf_name: mosaiml/doge-facts
+    hf_name: mosaicml/doge-facts
     preprocessing_fn: my_data.formatting:dogefacts_prep_fn
     split: train
     ...
@@ -402,7 +402,7 @@ so you should be able to run the exact same YAML on 8 or 16 or 256 GPUs and get
 This is nice because it means you can write device-count-agnostic training configs,
 and not worry about OOM-ing or accidentally changing the optimization math.

-In previous blogposts ([1](https://www.mosaicml.com/blog/farewell-oom), [2](https://www.mosaicml.com/blog/billion-parameter-gpt-training-made-easy))
+In previous blog posts ([1](https://www.mosaicml.com/blog/farewell-oom), [2](https://www.mosaicml.com/blog/billion-parameter-gpt-training-made-easy))
 we also demonstrated auto microbatching, which takes things a step further by letting Composer determine the `device_train_microbatch_size` on its own.
 This makes our configs not only device-count-agnostic, but hardware-agnostic too!
 You can try out this feature by setting `device_train_microbatch_size: auto`, but bear in mind that FSDP support is still in alpha mode
diff --git a/scripts/train/benchmarking/README.md b/scripts/train/benchmarking/README.md
index 5414cdc7bf..f5da10ec6a 100644
--- a/scripts/train/benchmarking/README.md
+++ b/scripts/train/benchmarking/README.md
@@ -20,7 +20,7 @@ python submit_benchmarks.py --cluster [your_mosaicml_cluster] ARGS --RUN
 can be used to sweep a larger set of configurations. For example usage of `submit_benchmarks.py` see `sweep.sh` which lists all benchmarks in the tables.

 > **Note**
-> The `collect_results.py` will by default find all runs with `tput` in the run name. To customize this project tag, use `--project` in both the submissing and collection scripts.
+> The `collect_results.py` will by default find all runs with `tput` in the run name. To customize this project tag, use `--project` in both the submission and collection scripts.

 ## MFU and HFU

@@ -55,7 +55,7 @@ hfu* = 4 * flops_per_seq * seq_per_sec / (gpu_num * GPU_AVAILABLE_FLOPS)
 hfu = (4 * flops_per_seq + 4 * attn_flops_per_seq) * seq_per_sec / (gpu_num * GPU_AVAILABLE_FLOPS)
 ```

-Note that these are approximations. Actual HFU would be higher since it includes the floating point operations for normalization, activation, and residual lyaers, as well as **all** recomputation. For example, our models use Flash Attention, which requires including an extra recompute factor for its recomputation in the forward pass. Therefore, the attention multipler would be 5 instead of 4.
+Note that these are approximations. Actual HFU would be higher since it includes the floating point operations for normalization, activation, and residual layers, as well as **all** recomputation. For example, our models use Flash Attention, which requires including an extra recompute factor for its recomputation in the forward pass. Therefore, the attention multiplier would be 5 instead of 4.

 ## Results

@@ -65,7 +65,7 @@ python submit_benchmarks.py -m 13b.yaml 30b.yaml -t fp16 -b 21 21 -s 11 14 --RUN
 ```
 This will run 8 configs for 12 steps to get throughput numbers. `python collect_results.py` can then be used to parse all output training logs and create the tables below.

-Our microbatching engine enables microbatch sizes that do not divde Global Batchsize while being mathematically faithful to the global batch size. For example, a total batch size of 48, and a micro batch of 11, means we will accumulate gradients across microbatches of 11, 11, 11, 11, 4.
+Our microbatching engine enables microbatch sizes that do not divide global batch size while being mathematically faithful to the global batch size. For example, a total batch size of 48, and a micro batch of 11, means we will accumulate gradients across microbatches of 11, 11, 11, 11, 4.

 [comment]: # (TODO: Update tables with torch 2.0 after next Composer release)
diff --git a/scripts/train/benchmarking/collect_results.py b/scripts/train/benchmarking/collect_results.py
index d3691e951c..151286dbc6 100644
--- a/scripts/train/benchmarking/collect_results.py
+++ b/scripts/train/benchmarking/collect_results.py
@@ -150,8 +150,8 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]:
     d_model = run.submitted_config.parameters['model']['d_model']
     n_layers = run.submitted_config.parameters['model']['n_layers']

-    # mfu is approximated using thoughtput and param count
-    # the number of paramters is approximately the number of multiply-accumulates (MAC) in the network
+    # mfu is approximated using throughput and param count
+    # the number of parameters is approximately the number of multiply-accumulates (MAC) in the network
     # each MAC has 2 FLOPs - we multiply by 2 ie 2 * n_param
     # there are 3 passes of a NN (fwd, bwd, delta) - we multiply by 3 ie 2 * 3 * n_param
     # this gets us FLOPs / token
diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py
index aff570e3d4..5e83ae41b7 100644
--- a/scripts/train/benchmarking/submit_benchmarks.py
+++ b/scripts/train/benchmarking/submit_benchmarks.py
@@ -205,7 +205,7 @@ def get_global_train_batch_sizes(max_seq_len: int,
     if batch_sizes is None:
         batch_sizes = []
     if pows:
-        # global batch size in tokens (defualt: .5M thru 8M)
+        # global batch size in tokens (default: .5M thru 8M)
         global_train_token_counts = [2**n for n in range(pows[0], pows[1] + 1)]
         batch_sizes += [t // max_seq_len for t in global_train_token_counts
                        ]  # global batch size in samples
diff --git a/scripts/train/yamls/finetune/1b_local_data_sft.yaml b/scripts/train/yamls/finetune/1b_local_data_sft.yaml
index d7b9db10d4..46141ce5ab 100644
--- a/scripts/train/yamls/finetune/1b_local_data_sft.yaml
+++ b/scripts/train/yamls/finetune/1b_local_data_sft.yaml
@@ -16,7 +16,7 @@ model:
   name: mpt_causal_lm
   init_device: meta
   d_model: 2048
-  n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention
+  n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention
   n_layers: 24
   expansion_ratio: 4
   max_seq_len: ${max_seq_len}
diff --git a/scripts/train/yamls/pretrain/mpt-1b.yaml b/scripts/train/yamls/pretrain/mpt-1b.yaml
index 3744a455a8..effa60c59e 100644
--- a/scripts/train/yamls/pretrain/mpt-1b.yaml
+++ b/scripts/train/yamls/pretrain/mpt-1b.yaml
@@ -11,7 +11,7 @@ model:
   name: mpt_causal_lm
   init_device: meta
   d_model: 2048
-  n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention
+  n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention
   n_layers: 24
   expansion_ratio: 4
   max_seq_len: ${max_seq_len}