From 36bbf10d8e2d603dc757c2ff106258282f24c2f9 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 11:48:58 -0500
Subject: [PATCH 01/26] add llama init template

---
 examples/benchmarks/pytorch_llama2.py         |  41 +++
 .../benchmarks/model_benchmarks/__init__.py   |   2 +-
 .../model_benchmarks/pytorch_llama.py         | 239 ++++++++++++++++++
 3 files changed, 281 insertions(+), 1 deletion(-)
 create mode 100644 examples/benchmarks/pytorch_llama2.py
 create mode 100644 superbench/benchmarks/model_benchmarks/pytorch_llama.py

diff --git a/examples/benchmarks/pytorch_llama2.py b/examples/benchmarks/pytorch_llama2.py
new file mode 100644
index 000000000..04db22574
--- /dev/null
+++ b/examples/benchmarks/pytorch_llama2.py
@@ -0,0 +1,41 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Model benchmark example for Llama2-7b (24-layer, 3072-hidden, 24-heads, 7B parameters).
+
+Commands to run:
+  python3 examples/benchmarks/pytorch_llama2.py (Single GPU)
+  python3 -m torch.distributed.launch --use_env --nproc_per_node=8 examples/benchmarks/pytorch_llama2.py \
+      --distributed (Distributed)
+"""
+
+import argparse
+
+from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--distributed', action='store_true', default=False, help='Whether to enable distributed training.'
+    )
+    args = parser.parse_args()
+
+    # Specify the model name and benchmark parameters.
+    model_name = 'llama2-7b'
+    parameters = '--batch_size 1 --duration 120 --seq_len 512 --precision float16'
+    if args.distributed:
+        parameters += ' --distributed_impl ddp --distributed_backend nccl'
+
+    # Create context for Llama2 benchmark and run it for 120 seconds.
+    context = BenchmarkRegistry.create_benchmark_context(
+        model_name, platform=Platform.CUDA, parameters=parameters, framework=Framework.PYTORCH
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/superbench/benchmarks/model_benchmarks/__init__.py b/superbench/benchmarks/model_benchmarks/__init__.py
index eda0c4985..0829c4d33 100644
--- a/superbench/benchmarks/model_benchmarks/__init__.py
+++ b/superbench/benchmarks/model_benchmarks/__init__.py
@@ -10,4 +10,4 @@
 from superbench.benchmarks.model_benchmarks.pytorch_lstm import PytorchLSTM
 from superbench.benchmarks.model_benchmarks.megatron_gpt3 import MegatronGPT
 
-__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM', 'MegatronGPT']
+__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM', 'MegatronGPT', 'PytorchLlama']
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
new file mode 100644
index 000000000..73e1b3200
--- /dev/null
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -0,0 +1,239 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the Pytorch Llama2 model."""
+
+import torch
+from transformers import LlamaModel, LlamaConfig
+try:
+    import transformer_engine.pytorch as te
+    from transformer_engine.common.recipe import Format, DelayedScaling
+except ImportError:
+    te = None
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Precision
+from superbench.benchmarks.model_benchmarks.model_base import Optimizer
+from superbench.benchmarks.model_benchmarks.pytorch_base import PytorchBase
+from superbench.benchmarks.model_benchmarks.random_dataset import TorchRandomDataset
+
+
+class LlamaBenchmarkModel(torch.nn.Module):
+    """The Llama model for benchmarking."""
+    def __init__(self, config, num_classes):
+        """Constructor.
+
+        Args:
+            config (LlamaConfig): Configurations of Llama model.
+            num_classes (int): The number of objects for classification.
+        """
+        super().__init__()
+        self._llama = LlamaModel(config)
+        self._linear = torch.nn.Linear(config.hidden_size, num_classes)
+
+    def forward(self, input):
+        """Forward propagation function.
+
+        Args:
+            input (torch.LongTensor): Indices of input sequence tokens in the vocabulary,
+                shape (batch_size, sequence_length).
+
+        Return:
+            result (torch.FloatTensor): Last layer hidden-state of the first token of the sequence
+                (classification token) further processed by a Linear layer, shape (batch_size, hidden_size).
+        """
+        outputs = self._llama(input)
+        result = self._linear(outputs[0])
+        return result
+
+
+class PytorchLlama(PytorchBase):
+    """The Llama benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._config = None
+        self._fp8_recipe = None
+        self._supported_precision = [
+            Precision.FLOAT32,
+            Precision.FLOAT16,
+            Precision.FP8_HYBRID,
+            Precision.FP8_E4M3,
+        ]
+        self._optimizer_type = Optimizer.ADAMW
+        self._loss_fn = torch.nn.CrossEntropyLoss()
+
+    def add_parser_arguments(self):
+        """Add the Llama-specified arguments.
+
+        Llama2 model reference: https://huggingface.co/docs/transformers/model_doc/llama2
+        """
+        super().add_parser_arguments()
+
+        self._parser.add_argument('--num_classes', type=int, default=100, required=False, help='Num of class.')
+        self._parser.add_argument('--hidden_size', type=int, default=4096, required=False, help='Hidden size.')
+        self._parser.add_argument(
+            '--num_hidden_layers', type=int, default=32, required=False, help='The number of hidden layers.'
+        )
+        self._parser.add_argument(
+            '--num_attention_heads', type=int, default=32, required=False, help='The number of attention heads.'
+        )
+        self._parser.add_argument(
+            '--num_key_value_heads', type=int, default=None, required=False, help='The number of key_value heads that should be used to implement Grouped Query Attention.'
+        )
+        self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')
+
+    def _generate_dataset(self):
+        """Generate dataset for benchmarking according to shape info.
+
+        Return:
+            True if dataset is created successfully.
+        """
+        self._dataset = TorchRandomDataset(
+            [self._args.sample_count, self._args.seq_len], self._world_size, dtype=torch.long
+        )
+        if len(self._dataset) == 0:
+            logger.error('Generate random dataset failed - model: {}'.format(self._name))
+            return False
+
+        return True
+
+    def _create_model(self, precision):
+        """Construct the model for benchmarking.
+
+        Args:
+            precision (Precision): precision of model and input data, such as float32, float16.
+        """
+        self._config = LlamaConfig(
+            hidden_size=self._args.hidden_size, num_hidden_layers=self._args.num_hidden_layers, num_attention_heads=self._args.num_attention_heads, num_key_value_heads=self._args.num_key_value_heads
+        )
+
+        enable_fp8 = precision.name.startswith('FP8_')
+        if enable_fp8 and te is None:
+            logger.error(
+                f'Create model with fp8 failed - model: {self._name}, precision: {precision},'
+                ' message: Cannot find transformer_engine.'
+            )
+            return False
+        if enable_fp8 and not self._gpu_available:
+            logger.error(
+                f'Create model with fp8 failed - model: {self._name}, precision: {precision},'
+                ' message: FP8 is only supported on GPU.'
+            )
+            return False
+
+        try:
+            self._model = LlamaBenchmarkModel(self._config, self._args.num_classes)
+            if enable_fp8:
+                self._fp8_recipe = DelayedScaling(
+                    fp8_format=Format[precision.name.strip('FP8_')],
+                    amax_history_len=16,
+                    amax_compute_algo='max',
+                )
+                self._to_te_model(self._model.to(dtype=torch.float16))
+            else:
+                self._model = self._model.to(dtype=getattr(torch, precision.value))
+            if self._gpu_available:
+                self._model = self._model.cuda()
+        except BaseException as e:
+            logger.error(
+                'Create model with specified precision failed - model: {}, precision: {}, message: {}.'.format(
+                    self._name, precision, str(e)
+                )
+            )
+            return False
+
+        self._target = torch.LongTensor(self._args.batch_size).random_(self._args.num_classes)
+        if self._gpu_available:
+            self._target = self._target.cuda()
+
+        return True
+
+    def _train_step(self, precision):
+        """Define the training process.
+
+        Args:
+            precision (Precision): precision of model and input data, such as float32, float16.
+
+        Return:
+            The step-time list of every training step.
+        """
+        duration = []
+        curr_step = 0
+        check_frequency = 100
+        while True:
+            for idx, sample in enumerate(self._dataloader):
+                start = self._timer()
+                if self._gpu_available:
+                    sample = sample.cuda()
+                self._optimizer.zero_grad()
+                if self._fp8_recipe is not None:
+                    with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
+                        output = self._model(sample)
+                else:
+                    output = self._model(sample)
+                loss = self._loss_fn(output[range(self._args.batch_size), -1], self._target)
+                loss.backward()
+                self._optimizer.step()
+                end = self._timer()
+                curr_step += 1
+                if curr_step > self._args.num_warmup:
+                    # Save the step time of every training/inference step, unit is millisecond.
+                    duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
+                if self._is_finished(curr_step, end, check_frequency):
+                    return duration
+
+    def _inference_step(self, precision):
+        """Define the inference process.
+
+        Args:
+            precision (Precision): precision of model and input data,
+                such as float32, float16.
+
+        Return:
+            The latency list of every inference operation.
+        """
+        duration = []
+        curr_step = 0
+        with torch.no_grad():
+            self._model.eval()
+            while True:
+                for idx, sample in enumerate(self._dataloader):
+                    start = self._timer()
+                    if self._gpu_available:
+                        sample = sample.cuda()
+                    if self._fp8_recipe is not None:
+                        with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
+                            self._model(sample)
+                    else:
+                        self._model(sample)
+                    end = self._timer()
+                    curr_step += 1
+                    if curr_step > self._args.num_warmup:
+                        # Save the step time of every training/inference step, unit is millisecond.
+                        duration.append((end - start) * 1000)
+                        self._log_step_time(curr_step, precision, duration)
+                    if self._is_finished(curr_step, end):
+                        return duration
+
+
+# Register Llama2 benchmark with 7b parameters.
+BenchmarkRegistry.register_benchmark(
+    'pytorch-llama2-7b', PytorchLlama, parameters='--hidden_size=4096 --num_hidden_layers=32 --num_attention_heads=32'
+)
+
+# Register Llama2 benchmark with 13b parameters.
+BenchmarkRegistry.register_benchmark(
+    'pytorch-llama2-13b', PytorchLlama, parameters='--hidden_size=5120 --num_hidden_layers=40 --num_attention_heads=40'
+)
+
+# Register Llama2 benchmark with 70b parameters.
+BenchmarkRegistry.register_benchmark(
+    'pytorch-llama2-70b', PytorchLlama, parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8'
+)

From 697138a2471052c357eb2356a7045d21add7ec8f Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 12:18:19 -0500
Subject: [PATCH 02/26] add llama2 unit test

---
 .../model_benchmarks/test_pytorch_llama.py    | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 tests/benchmarks/model_benchmarks/test_pytorch_llama.py

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
new file mode 100644
index 000000000..c01d8555d
--- /dev/null
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -0,0 +1,56 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for Llama model benchmarks."""
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, Platform, Framework, BenchmarkType, ReturnCode
+from superbench.benchmarks.model_benchmarks.pytorch_llama import PytorchLlama
+
+@decorator.cuda_test
+@decorator.pytorch_test
+def test_pytorch_llama_7b():
+    """Test pytorch-llama2-7b benchmark."""
+    context = BenchmarkRegistry.create_benchmark_context(
+        'pytorch-llama2-7b',
+        platform=Platform.CUDA,
+        parameters='--batch_size 1 --num_classes 100 --seq_len 512 --num_warmup 2 --num_steps 4 \
+            --model_action train inference',
+        framework=Framework.PYTORCH
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (isinstance(benchmark, PytorchLlama))
+    assert (benchmark.name == 'pytorch-llama2-7b')
+    assert (benchmark.type == BenchmarkType.MODEL)
+
+    # Check predefined parameters of llama2 7b model.
+    assert (benchmark._args.hidden_size == 4096)
+    assert (benchmark._args.num_hidden_layers == 32)
+    assert (benchmark._args.num_attention_heads == 32)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.batch_size == 1)
+    assert (benchmark._args.num_classes == 100)
+    assert (benchmark._args.seq_len == 521)
+    assert (benchmark._args.num_warmup == 2)
+    assert (benchmark._args.num_steps == 4)
+
+    # Test Dataset.
+    assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
+
+    # Check results and metrics.
+    assert (benchmark.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+    for metric in [
+        'fp32_train_step_time', 'fp32_train_throughput', 'fp16_train_step_time', 'fp16_train_throughput',
+        'fp32_inference_step_time', 'fp32_inference_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
+    ]:
+        assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
+        assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
+        assert (len(benchmark.result[metric]) == benchmark.run_count)

From 9355e22b7cd1ac62137227ddd16b0c1adeffc32f Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 15:27:38 -0500
Subject: [PATCH 03/26] fix dims for llama2 unit test

---
 superbench/benchmarks/model_benchmarks/pytorch_llama.py | 8 ++++----
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 9 ++++-----
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
index 73e1b3200..13257c204 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -76,17 +76,17 @@ def add_parser_arguments(self):
         super().add_parser_arguments()
 
         self._parser.add_argument('--num_classes', type=int, default=100, required=False, help='Num of class.')
-        self._parser.add_argument('--hidden_size', type=int, default=4096, required=False, help='Hidden size.')
+        self._parser.add_argument('--hidden_size', type=int, default=1280, required=False, help='Hidden size.')
         self._parser.add_argument(
-            '--num_hidden_layers', type=int, default=32, required=False, help='The number of hidden layers.'
+            '--num_hidden_layers', type=int, default=36, required=False, help='The number of hidden layers.'
         )
         self._parser.add_argument(
-            '--num_attention_heads', type=int, default=32, required=False, help='The number of attention heads.'
+            '--num_attention_heads', type=int, default=20, required=False, help='The number of attention heads.'
        )
+        self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')
         self._parser.add_argument(
             '--num_key_value_heads', type=int, default=None, required=False, help='The number of key_value heads that should be used to implement Grouped Query Attention.'
         )
-        self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index c01d8555d..9e2793853 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -12,9 +12,9 @@
 def test_pytorch_llama_7b():
     """Test pytorch-llama2-7b benchmark."""
     context = BenchmarkRegistry.create_benchmark_context(
-        'pytorch-llama2-7b',
+        'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --num_classes 100 --seq_len 512 --num_warmup 2 --num_steps 4 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 2 --num_steps 4 --precision float16 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )
@@ -37,7 +37,7 @@
     # Check parameters specified in BenchmarkContext.
     assert (benchmark._args.batch_size == 1)
     assert (benchmark._args.num_classes == 100)
-    assert (benchmark._args.seq_len == 521)
+    assert (benchmark._args.seq_len == 32)
     assert (benchmark._args.num_warmup == 2)
     assert (benchmark._args.num_steps == 4)
 
@@ -48,8 +48,7 @@
     assert (benchmark.run_count == 1)
     assert (benchmark.return_code == ReturnCode.SUCCESS)
     for metric in [
-        'fp32_train_step_time', 'fp32_train_throughput', 'fp16_train_step_time', 'fp16_train_throughput',
-        'fp32_inference_step_time', 'fp32_inference_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
+        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)

From 60eae3610b40a184d66fc71ef32fedd4a88cfefe Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 15:29:57 -0500
Subject: [PATCH 04/26] update transformers version for LlamaConfig

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 738095889..584aed22e 100644
--- a/setup.py
+++ b/setup.py
@@ -211,7 +211,7 @@ def run(self):
         'torch': [
             'torch>=1.7.0a0',
             'torchvision>=0.8.0a0',
-            'transformers>=4.3.3, <4.23.0',
+            'transformers>=4.28.0',
         ],
         'ort': [
             'onnx>=1.10.2',

From dadb56adb7754c4931d47cb5d200ee5895c7582a Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 15:45:13 -0500
Subject: [PATCH 05/26] update docs

---
 docs/superbench-config.mdx                        | 3 ++-
 docs/user-tutorial/benchmarks/model-benchmarks.md | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx
index 102b8d69f..051abeda3 100644
--- a/docs/superbench-config.mdx
+++ b/docs/superbench-config.mdx
@@ -328,7 +328,8 @@ A list of models to run, only supported in model-benchmark.
     shufflenet_v2_x0_5 | shufflenet_v2_x1_0 | shufflenet_v2_x1_5 | shufflenet_v2_x2_0 |
     squeezenet1_0 | squeezenet1_1 |
     vgg11 | vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn | vgg19_bn | vgg19 |
-    bert-base | bert-large | gpt2-small | gpt2-medium | gpt2-large | gpt2-xl ]
+    bert-base | bert-large | gpt2-small | gpt2-medium | gpt2-large | gpt2-xl |
+    llama2-7b | llama2-13b | llama2-70b ]
   ```
 
 * default value: `[ ]`
diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index 34fdf4c70..71e8832cf 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -13,6 +13,7 @@ id: model-benchmarks
 
 Run training or inference tasks with single or half precision for deep learning models, including the following categories:
 * GPT: gpt2-small, gpt2-medium, gpt2-large and gpt2-xl
+* LLAMA: llama2-7b, llama2-13b, llama2-70b
 * BERT: bert-base and bert-large
 * LSTM
 * CNN, listed in [`torchvision.models`](https://pytorch.org/vision/0.8/models.html), including:

From d2731a8af997c9dc38ad8ab819ea558b62200533 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 23:02:52 -0500
Subject: [PATCH 06/26] update opset for torch onnx conversion

---
 .../benchmarks/micro_benchmarks/_export_torch_to_onnx.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index 1e37b793d..f9c9a4bed 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -9,7 +9,7 @@
 import torch.hub
 import torch.onnx
 import torchvision.models
-from transformers import BertConfig, GPT2Config
+from transformers import BertConfig, GPT2Config, ll
 
 from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel
 from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel
@@ -138,7 +138,7 @@ def export_torchvision_model(self, model_name, batch_size=1):
             model,
             dummy_input,
             file_name,
-            opset_version=10,
+            opset_version=14,
             operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
             input_names=['input'],
             output_names=['output'],
@@ -179,7 +179,7 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
             model,
             dummy_input,
             file_name,
-            opset_version=10,
+            opset_version=14,
             do_constant_folding=True,
             input_names=['input'],
             output_names=['output'],

From 3644985e32b782e6a230352e555328882e14c4f4 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 19 Nov 2024 00:27:11 -0500
Subject: [PATCH 07/26] format and lint

---
 examples/benchmarks/pytorch_llama2.py         |  2 +-
 .../model_benchmarks/pytorch_llama.py         | 17 ++++++++++++++---
 .../model_benchmarks/test_pytorch_llama.py    |  1 +
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/examples/benchmarks/pytorch_llama2.py b/examples/benchmarks/pytorch_llama2.py
index 04db22574..2290ba1a5 100644
--- a/examples/benchmarks/pytorch_llama2.py
+++ b/examples/benchmarks/pytorch_llama2.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-"""Model benchmark example for Llama2-7b (24-layer, 3072-hidden, 24-heads, 7B parameters).
+"""Model benchmark example for Llama2-7b (32-layer, 4096-hidden, 32-heads, 7B parameters).
 
 Commands to run:
   python3 examples/benchmarks/pytorch_llama2.py (Single GPU)
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
index 13257c204..9701eb59f 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -20,6 +20,7 @@
 
 class LlamaBenchmarkModel(torch.nn.Module):
     """The Llama model for benchmarking."""
+
     def __init__(self, config, num_classes):
         """Constructor.
 
@@ -49,6 +50,7 @@
 
 class PytorchLlama(PytorchBase):
     """The Llama benchmark class."""
+
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -85,7 +87,11 @@
         )
         self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')
         self._parser.add_argument(
-            '--num_key_value_heads', type=int, default=None, required=False, help='The number of key_value heads that should be used to implement Grouped Query Attention.'
+            '--num_key_value_heads',
+            type=int,
+            default=None,
+            required=False,
+            help='The number of key_value heads that should be used to implement Grouped Query Attention.'
         )
 
     def _generate_dataset(self):
@@ -110,7 +116,10 @@
             precision (Precision): precision of model and input data, such as float32, float16.
         """
         self._config = LlamaConfig(
-            hidden_size=self._args.hidden_size, num_hidden_layers=self._args.num_hidden_layers, num_attention_heads=self._args.num_attention_heads, num_key_value_heads=self._args.num_key_value_heads
+            hidden_size=self._args.hidden_size,
+            num_hidden_layers=self._args.num_hidden_layers,
+            num_attention_heads=self._args.num_attention_heads,
+            num_key_value_heads=self._args.num_key_value_heads
         )
 
         enable_fp8 = precision.name.startswith('FP8_')
@@ -235,5 +244,7 @@
 
 # Register Llama2 benchmark with 70b parameters.
 BenchmarkRegistry.register_benchmark(
-    'pytorch-llama2-70b', PytorchLlama, parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8'
+    'pytorch-llama2-70b',
+    PytorchLlama,
+    parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8'
 )
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 9e2793853..0a17170d4 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -7,6 +7,7 @@
 from superbench.benchmarks import BenchmarkRegistry, Platform, Framework, BenchmarkType, ReturnCode
 from superbench.benchmarks.model_benchmarks.pytorch_llama import PytorchLlama
 
+
 @decorator.cuda_test
 @decorator.pytorch_test
 def test_pytorch_llama_7b():

From 52f4900e02a0a4b0c1156b02a34a96f77547248e Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 19 Nov 2024 01:13:29 -0500
Subject: [PATCH 08/26] remove remnant

---
 superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index f9c9a4bed..abb75676d 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -9,7 +9,7 @@
 import torch.hub
 import torch.onnx
 import torchvision.models
-from transformers import BertConfig, GPT2Config, ll
+from transformers import BertConfig, GPT2Config
 
 from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel
 from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel

From f826676e970b54e25d35c122f6b6e8f4fb1ece42 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 19 Nov 2024 21:30:26 -0500
Subject: [PATCH 09/26] lint fix

---
 superbench/benchmarks/model_benchmarks/pytorch_llama.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
index 9701eb59f..26fad9bb2 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -20,7 +20,6 @@
 
 class LlamaBenchmarkModel(torch.nn.Module):
     """The Llama model for benchmarking."""
-
     def __init__(self, config, num_classes):
        """Constructor.
 
@@ -50,7 +49,6 @@
 
 class PytorchLlama(PytorchBase):
     """The Llama benchmark class."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 

From 670dc769dc6c07ba3c75ed313572713760e55573 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 00:34:43 -0500
Subject: [PATCH 10/26] replace py 3.6 with 3.10 and update cuda to 12.4 for unit test

---
 .azure-pipelines/cpu-unit-test.yml  | 4 ++--
 .azure-pipelines/cuda-unit-test.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.azure-pipelines/cpu-unit-test.yml b/.azure-pipelines/cpu-unit-test.yml
index 7fc698f4f..1de67824f 100644
--- a/.azure-pipelines/cpu-unit-test.yml
+++ b/.azure-pipelines/cpu-unit-test.yml
@@ -7,12 +7,12 @@ trigger:
 
 strategy:
   matrix:
-    python-3.6:
-      imageTag: '3.6'
     python-3.7:
       imageTag: '3.7'
     python-3.8:
       imageTag: '3.8'
+    python-3.10:
+      imageTag: '3.10'
     # TODO
     #python-latest:
    #  imageTag: '3'
diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml
index e0a69fc0d..056917184 100644
--- a/.azure-pipelines/cuda-unit-test.yml
+++ b/.azure-pipelines/cuda-unit-test.yml
@@ -10,7 +10,7 @@ pool:
   vmImage: ubuntu-latest
 
 container:
-  image: nvcr.io/nvidia/pytorch:20.12-py3
+  image: nvcr.io/nvidia/pytorch:24.03-py3
   options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/'
 
 steps:

From 5b816b472da403b6780ad5baaab4918e70fddf09 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 00:43:14 -0500
Subject: [PATCH 11/26] remove 3.6 from setup, codecov and docs

---
 .codecov.yml                          | 2 --
 docs/getting-started/installation.mdx | 2 +-
 setup.py                              | 4 ++--
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/.codecov.yml b/.codecov.yml
index 81d50f8bc..50a778c4e 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -14,7 +14,6 @@ coverage:
       target: 80%
       threshold: 1%
       flags:
-        - cpu-python3.6-unit-test
         - cpu-python3.7-unit-test
         - cuda-unit-test
         - directx-unit-test
@@ -23,7 +22,6 @@
       target: 80%
       threshold: 1%
       flags:
-        - cpu-python3.6-unit-test
         - cpu-python3.7-unit-test
         - cuda-unit-test
         - directx-unit-test
diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx
index 30fdee829..0a582e92f 100644
--- a/docs/getting-started/installation.mdx
+++ b/docs/getting-started/installation.mdx
@@ -26,7 +26,7 @@ Here're the system requirements for control node.
 ### Requirements
 
 * Latest version of Linux, you're highly encouraged to use Ubuntu 18.04 or later.
-* [Python](https://www.python.org/) version 3.6 or later (which can be checked by running `python3 --version`).
+* [Python](https://www.python.org/) version 3.7 or later (which can be checked by running `python3 --version`).
 * [Pip](https://pip.pypa.io/en/stable/installing/) version 18.0 or later (which can be checked by running `python3 -m pip --version`).
 :::note
diff --git a/setup.py b/setup.py
index 584aed22e..2474dcbc1 100644
--- a/setup.py
+++ b/setup.py
@@ -131,17 +131,17 @@ def run(self):
         'Operating System :: POSIX',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3 :: Only',
-        'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
         'Topic :: System :: Benchmark',
         'Topic :: System :: Clustering',
         'Topic :: System :: Hardware',
     ],
     keywords='benchmark, AI systems',
     packages=find_packages(exclude=['tests']),
-    python_requires='>=3.6, <4',
+    python_requires='>=3.7, <4',
     use_scm_version={
         'local_scheme': 'node-and-date',
         'version_scheme': lambda _: superbench.__version__,

From f322c98d31c56a3413e035f10a0fe6dbe0f895eb Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 02:14:26 -0500
Subject: [PATCH 12/26] add llama fp8 unit test for better code coverage

---
 .../model_benchmarks/test_pytorch_llama.py    | 55 ++++++++++++++++++-
 1 file changed, 52 insertions(+), 3 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 0a17170d4..69eed1545 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -15,7 +15,7 @@ def test_pytorch_llama_7b():
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --seq_len 32 --num_warmup 2 --num_steps 4 --precision float16 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision float16 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )
@@ -39,8 +39,8 @@ def test_pytorch_llama_7b():
     assert (benchmark._args.batch_size == 1)
     assert (benchmark._args.num_classes == 100)
     assert (benchmark._args.seq_len == 32)
-    assert (benchmark._args.num_warmup == 2)
-    assert (benchmark._args.num_steps == 4)
+    assert (benchmark._args.num_warmup == 1)
+    assert (benchmark._args.num_steps == 2)
 
     # Test Dataset.
     assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
@@ -54,3 +54,52 @@ def test_pytorch_llama_7b():
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
         assert (len(benchmark.result[metric]) == benchmark.run_count)
+
+
+@decorator.cuda_test
+@decorator.pytorch_test
+def test_pytorch_llama_7b_fp8_inference():
+    """Test pytorch-llama2-7b benchmark for fp8 inference."""
+    context = BenchmarkRegistry.create_benchmark_context(
+        'llama2-7b',
+        platform=Platform.CUDA,
+        parameters='--batch_size 8 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp8_e4m3 \
+            --model_action inference',
+        framework=Framework.PYTORCH
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (isinstance(benchmark, PytorchLlama))
+    assert (benchmark.name == 'pytorch-llama2-7b')
+    assert (benchmark.type == BenchmarkType.MODEL)
+
+    # Check predefined parameters of llama2 7b model.
+    assert (benchmark._args.hidden_size == 4096)
+    assert (benchmark._args.num_hidden_layers == 32)
+    assert (benchmark._args.num_attention_heads == 32)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.batch_size == 8)
+    assert (benchmark._args.num_classes == 100)
+    assert (benchmark._args.seq_len == 32)
+    assert (benchmark._args.num_warmup == 1)
+    assert (benchmark._args.num_steps == 2)
+
+    # Test Dataset.
+    assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
+
+    # Check results and metrics.
+    assert (benchmark.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+
+    for metric in [
+        'fp8_e4m3_inference_step_time', 'fp8_e4m3_inference_throughput'
+    ]:
+        assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
+        assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
+        assert (len(benchmark.result[metric]) == benchmark.run_count)

From b28ee17f2d7eb3d4781bfbd117100f25f786d9e5 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 08:40:04 -0500
Subject: [PATCH 13/26] llama fp8 precision test only, to reduce memory required

---
 .../model_benchmarks/test_pytorch_llama.py    | 54 ++-----------------
 1 file changed, 3 insertions(+), 51 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 69eed1545..0dfb390b1 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -11,11 +11,11 @@
 @decorator.cuda_test
 @decorator.pytorch_test
 def test_pytorch_llama_7b():
-    """Test pytorch-llama2-7b benchmark."""
+    """Test pytorch-llama2-7b benchmark for fp8 inference."""
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision float16 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp8_e4m3 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )
@@ -45,60 +45,12 @@ def test_pytorch_llama_7b():
     # Test Dataset.
     assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
 
     # Check results and metrics.
     assert (benchmark.run_count == 1)
     assert (benchmark.return_code == ReturnCode.SUCCESS)
 
     for metric in [
-        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
+        'fp8_e4m3_train_step_time', 'fp8_e4m3_train_throughput', 'fp8_e4m3_inference_step_time', 'fp8_e4m3_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
         assert (len(benchmark.result[metric]) == benchmark.run_count)
-
-
-@decorator.cuda_test
-@decorator.pytorch_test
-def test_pytorch_llama_7b_fp8_inference():
-    """Test pytorch-llama2-7b benchmark for fp8 inference."""
-    context = BenchmarkRegistry.create_benchmark_context(
-        'llama2-7b',
-        platform=Platform.CUDA,
-        parameters='--batch_size 8 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp8_e4m3 \
-            --model_action inference',
-        framework=Framework.PYTORCH
-    )
-
-    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
-
-    benchmark = BenchmarkRegistry.launch_benchmark(context)
-
-    # Check basic information.
-    assert (benchmark)
-    assert (isinstance(benchmark, PytorchLlama))
-    assert (benchmark.name == 'pytorch-llama2-7b')
-    assert (benchmark.type == BenchmarkType.MODEL)
-
-    # Check predefined parameters of llama2 7b model.
-    assert (benchmark._args.hidden_size == 4096)
-    assert (benchmark._args.num_hidden_layers == 32)
-    assert (benchmark._args.num_attention_heads == 32)
-
-    # Check parameters specified in BenchmarkContext.
-    assert (benchmark._args.batch_size == 8)
-    assert (benchmark._args.num_classes == 100)
-    assert (benchmark._args.seq_len == 32)
-    assert (benchmark._args.num_warmup == 1)
-    assert (benchmark._args.num_steps == 2)
-
-    # Test Dataset.
-    assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
-
-    # Check results and metrics.
-    assert (benchmark.run_count == 1)
-    assert (benchmark.return_code == ReturnCode.SUCCESS)
-
-    for metric in [
-        'fp8_e4m3_inference_step_time', 'fp8_e4m3_inference_throughput'
-    ]:
-        assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
-        assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
-        assert (len(benchmark.result[metric]) == benchmark.run_count)

From e6f6be3ba3af93507ae7c752f24bb9805ae81c35 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 14:00:36 -0500
Subject: [PATCH 14/26] lint fix

---
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 0dfb390b1..7ab6fcffe 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -50,7 +50,8 @@ def test_pytorch_llama_7b():
     assert (benchmark.return_code == ReturnCode.SUCCESS)
 
     for metric in [
-        'fp8_e4m3_train_step_time', 'fp8_e4m3_train_throughput', 'fp8_e4m3_inference_step_time', 'fp8_e4m3_inference_throughput'
+        'fp8_e4m3_train_step_time', 'fp8_e4m3_train_throughput', 'fp8_e4m3_inference_step_time',
+        'fp8_e4m3_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)

From 297a2293a099cdc1df2254cab2f617b8ce98eddf Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 23:41:29 -0800
Subject: [PATCH 15/26] remove deprecated NaN usage for numpy>2.0

---
 tests/analyzer/test_summaryop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/analyzer/test_summaryop.py b/tests/analyzer/test_summaryop.py
index 3b1054444..889ebc1e8 100644
--- a/tests/analyzer/test_summaryop.py
+++ b/tests/analyzer/test_summaryop.py
@@ -4,7 +4,7 @@
 """Tests for SummaryOp module."""
 
 import unittest
-from numpy import NaN, float64
+from numpy import nan, float64
 
 import pandas as pd
 
@@ -55,7 +55,7 @@ def test_rule_op(self):
         # Test - std
         result = SummaryOp.std(raw_data_df)
         print(result)
-        expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, NaN], index=['a', 'b', 'c', 'd'], dtype=float64)
+        expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, nan], index=['a', 'b', 'c', 'd'], dtype=float64)
         pd.testing.assert_series_equal(result, expectedResult)
         # Test - count
         result = SummaryOp.count(raw_data_df)

From 5f72f51cd190d7b43633200a26aa80603dbc8460 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 01:47:21 -0800
Subject: [PATCH 16/26] fix argparse formatting related test cases failure for 3.10

---
 superbench/benchmarks/base.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 86c6b6d15..944099bb6 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -87,7 +87,12 @@ def get_configurable_settings(self):
 
         Return:
             All configurable settings in raw string.
         """
-        return self._parser.format_help().strip()
+        message = self._parser.format_help().strip()
+        # Ensure consistent header across diff python argparse format_help output.
+        if "options:" in message:
+            message = message.replace("options:", "optional arguments:")
+        return message
+
 
     def parse_args(self, ignore_invalid=False):
         """Parse the arguments.

From 8bbe32608e4e93f2fed002a94522bd791ba8559c Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 01:53:38 -0800
Subject: [PATCH 17/26] fix lint

---
 superbench/benchmarks/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 944099bb6..71ea96247 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -93,7 +93,6 @@ def get_configurable_settings(self):
             message = message.replace("options:", "optional arguments:")
         return message
-
 
     def parse_args(self, ignore_invalid=False):
         """Parse the arguments.

From 50452effc95d3576638da4a34f23e2f7fff5918c Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 02:03:18 -0800
Subject: [PATCH 18/26] fix lint

---
 superbench/benchmarks/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 71ea96247..af0443249 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -89,8 +89,8 @@ def get_configurable_settings(self):
         """
         message = self._parser.format_help().strip()
         # Ensure consistent header across diff python argparse format_help output.
-        if "options:" in message:
-            message = message.replace("options:", "optional arguments:")
+        if 'options:' in message:
+            message = message.replace('options:', 'optional arguments:')
         return message
 
     def parse_args(self, ignore_invalid=False):

From 0b1da4f67dbe39739e0ebe4b9342622537365fbe Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 10:27:48 -0800
Subject: [PATCH 19/26] add llama2 to tensorrt

---
 .../micro_benchmarks/_export_torch_to_onnx.py | 31 ++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index abb75676d..36a50f3bd 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -9,11 +9,12 @@
 import torch.hub
 import torch.onnx
 import torchvision.models
-from transformers import BertConfig, GPT2Config
+from transformers import BertConfig, GPT2Config, LlamaConfig
 
 from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel
 from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel
 from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel
+from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel
 
 
 class torch2onnxExporter():
@@ -87,6 +88,34 @@ def __init__(self):
                 ),
                 self.num_classes,
             ),
+            'llama2-7b':
+            lambda: LlamaBenchmarkModel(
+                LlamaConfig(
+                    hidden_size=4096,
+                    num_hidden_layers=32,
+                    num_attention_heads=32,
+                ),
+                self.num_classes,
+            ),
+            'llama2-13b':
+            lambda: LlamaBenchmarkModel(
+                LlamaConfig(
+                    hidden_size=5120,
+                    num_hidden_layers=40,
+                    num_attention_heads=40,
+                ),
+                self.num_classes,
+            ),
+            'llama2-70b':
+            lambda: LlamaBenchmarkModel(
+                LlamaConfig(
+                    hidden_size=8192,
+                    num_hidden_layers=80,
+                    num_attention_heads=64,
+                    num_key_value_heads=8,
+                ),
+                self.num_classes,
+ ), } self._onnx_model_path = Path(torch.hub.get_dir()) / 'onnx' self._onnx_model_path.mkdir(parents=True, exist_ok=True) From e423aecabdbc325930181e30593c690e7917b658 Mon Sep 17 00:00:00 2001 From: dpatlolla Date: Fri, 22 Nov 2024 03:00:51 +0000 Subject: [PATCH 20/26] add more params to llama config --- .../micro_benchmarks/_export_torch_to_onnx.py | 5 ++++ .../model_benchmarks/pytorch_llama.py | 26 ++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index 36a50f3bd..0f28f4f6a 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -94,6 +94,8 @@ def __init__(self): hidden_size=4096, num_hidden_layers=32, num_attention_heads=32, + num_key_value_heads=32, + intermediate_size=11008, ), self.num_classes, ), @@ -103,6 +105,8 @@ def __init__(self): hidden_size=5120, num_hidden_layers=40, num_attention_heads=40, + num_key_value_heads=40, + intermediate_size=13824, ), self.num_classes, ), @@ -113,6 +117,7 @@ def __init__(self): num_hidden_layers=80, num_attention_heads=64, num_key_value_heads=8, + intermediate_size=28672, ), self.num_classes, ), diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py index 26fad9bb2..e292105cf 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py @@ -83,6 +83,13 @@ def add_parser_arguments(self): self._parser.add_argument( '--num_attention_heads', type=int, default=20, required=False, help='The number of attention heads.' ) + self._parser.add_argument( + '--intermediate_size', + type=int, + default=11008, + required=False, + help='Dimension of the MLP representations.' + ) self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.') self._parser.add_argument( '--num_key_value_heads', @@ -113,11 +120,15 @@ def _create_model(self, precision): Args: precision (Precision): precision of model and input data, such as float32, float16. """ + self._config = LlamaConfig( hidden_size=self._args.hidden_size, num_hidden_layers=self._args.num_hidden_layers, num_attention_heads=self._args.num_attention_heads, - num_key_value_heads=self._args.num_key_value_heads + num_key_value_heads=self._args.num_key_value_heads, + intermediate_size=self._args.intermediate_size, + max_position_embeddings=4096, # Maximum sequence length that llama2 supports + rms_norm_eps=1e-05, # Llama2 default for epsilon used by the rms normalization layers ) enable_fp8 = precision.name.startswith('FP8_') @@ -232,17 +243,24 @@ def _inference_step(self, precision): # Register Llama2 benchmark with 7b parameters. BenchmarkRegistry.register_benchmark( - 'pytorch-llama2-7b', PytorchLlama, parameters='--hidden_size=4096 --num_hidden_layers=32 --num_attention_heads=32' + 'pytorch-llama2-7b', + PytorchLlama, + parameters='--hidden_size=4096 --num_hidden_layers=32 --num_attention_heads=32 --num_key_value_heads=32 \ + --intermediate_size=11008' ) # Register Llama2 benchmark with 13b parameters. 
 BenchmarkRegistry.register_benchmark(
-    'pytorch-llama2-13b', PytorchLlama, parameters='--hidden_size=5120 --num_hidden_layers=40 --num_attention_heads=40'
+    'pytorch-llama2-13b',
+    PytorchLlama,
+    parameters='--hidden_size=5120 --num_hidden_layers=40 --num_attention_heads=40 --num_key_value_heads=40 \
+        --intermediate_size=13824'
 )
 
 # Register Llama2 benchmark with 70b parameters.
 BenchmarkRegistry.register_benchmark(
     'pytorch-llama2-70b',
     PytorchLlama,
-    parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8'
+    parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8 \
+        --intermediate_size=28672'
 )

From d850210a9ac0fd6cd36762a13a01198ca61b69a0 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 19:11:39 -0800
Subject: [PATCH 21/26] fix lint

---
 superbench/benchmarks/model_benchmarks/pytorch_llama.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
index e292105cf..7161aeb83 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -120,7 +120,6 @@ def _create_model(self, precision):
         Args:
             precision (Precision): precision of model and input data, such as float32, float16.
         """
-
         self._config = LlamaConfig(
             hidden_size=self._args.hidden_size,
             num_hidden_layers=self._args.num_hidden_layers,

From 6a410875507c8a978106490b04f1db63bcaf8ec7 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 26 Nov 2024 22:39:34 -0800
Subject: [PATCH 22/26] llama test: use fp16 instead of fp8 to relax cuda CC req.

---
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 7ab6fcffe..d0f779797 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -11,11 +11,11 @@
 @decorator.cuda_test
 @decorator.pytorch_test
 def test_pytorch_llama_7b():
-    """Test pytorch-llama2-7b benchmark for fp8 inference."""
+    """Test pytorch-llama2-7b benchmark for fp16 inference."""
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp8_e4m3 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp16 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )
@@ -50,8 +50,8 @@ def test_pytorch_llama_7b():
     assert (benchmark.return_code == ReturnCode.SUCCESS)
 
     for metric in [
-        'fp8_e4m3_train_step_time', 'fp8_e4m3_train_throughput', 'fp8_e4m3_inference_step_time',
-        'fp8_e4m3_inference_throughput'
+        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time',
+        'fp16_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)

From 27c788c920e344133c2f47b4c90d4f2b4a7fbd6b Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 26 Nov 2024 22:47:02 -0800
Subject: [PATCH 23/26] fix comment and lint

---
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index d0f779797..92bd1c89d 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -11,7 +11,7 @@
 @decorator.cuda_test
 @decorator.pytorch_test
 def test_pytorch_llama_7b():
-    """Test pytorch-llama2-7b benchmark for fp16 inference."""
+    """Test pytorch-llama2-7b benchmark for fp16 train and inference."""
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
@@ -50,8 +50,7 @@ def test_pytorch_llama_7b():
     assert (benchmark.return_code == ReturnCode.SUCCESS)
 
     for metric in [
-        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time',
-        'fp16_inference_throughput'
+        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)

From 00d09ba36e2afb406df38391c010613caef271cd Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 26 Nov 2024 23:00:55 -0800
Subject: [PATCH 24/26] fix precision arg as float16

---
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 92bd1c89d..a9a03d7b9 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -15,7 +15,7 @@ def test_pytorch_llama_7b():
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp16 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision float16 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )

From bd47fc3e17c8adb410a75070e48d675d2e1c8423 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 27 Nov 2024 00:06:28 -0800
Subject: [PATCH 25/26] limit tokenizers version to <= 0.20.3 as 0.20.4 doesn't support py3.8

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 2474dcbc1..58e2beb7e 100644
--- a/setup.py
+++ b/setup.py
@@ -211,6 +211,7 @@ def run(self):
         'torch': [
             'torch>=1.7.0a0',
             'torchvision>=0.8.0a0',
+            'tokenizers <= 0.20.3',
             'transformers>=4.28.0',
         ],
         'ort': [

From bed3e01b53e729a3445b4fcd96e238f4571801fb Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 27 Nov 2024 16:47:17 -0800
Subject: [PATCH 26/26] address review comments

---
 setup.py                      | 2 +-
 superbench/benchmarks/base.py | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 58e2beb7e..c6e2d1fe3 100644
--- a/setup.py
+++ b/setup.py
@@ -209,9 +209,9 @@ def run(self):
         'yapf==0.31.0',
     ],
     'torch': [
+        'tokenizers<=0.20.3',
         'torch>=1.7.0a0',
         'torchvision>=0.8.0a0',
-        'tokenizers <= 0.20.3',
         'transformers>=4.28.0',
     ],
     'ort': [
diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 649ade99d..8e6e58bfe 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -90,9 +90,6 @@ def get_configurable_settings(self):
             All configurable settings in raw string.
         """
         message = self._parser.format_help().strip()
-        # Ensure consistent header across diff python argparse format_help output.
-        if 'options:' in message:
-            message = message.replace('options:', 'optional arguments:')
         return message
 
     def parse_args(self, ignore_invalid=False):
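
After the final patch, the series leaves three registered benchmarks, pytorch-llama2-7b, pytorch-llama2-13b, and pytorch-llama2-70b, reachable through the model names llama2-7b, llama2-13b, and llama2-70b. A minimal usage sketch for driving one of them end to end follows; it assumes the same BenchmarkRegistry, Platform, and Framework API used in examples/benchmarks/pytorch_llama2.py above, and the batch size, sequence length, and step counts are illustrative values, not values mandated by the patches.

    # Minimal sketch: launching the llama2-13b benchmark registered by this series.
    # Parameter names come from add_parser_arguments() in pytorch_llama.py;
    # the values chosen here are examples only.
    from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
    from superbench.common.utils import logger

    context = BenchmarkRegistry.create_benchmark_context(
        'llama2-13b',
        platform=Platform.CUDA,
        parameters='--batch_size 1 --seq_len 512 --num_warmup 8 --num_steps 64 --precision float16',
        framework=Framework.PYTORCH,
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(benchmark.name, benchmark.return_code, benchmark.result)
        )

With transformer_engine installed and a GPU available, the same entry point also accepts --precision fp8_hybrid or fp8_e4m3, matching the FP8 guard added to _create_model in PATCH 01; otherwise model creation fails with the "Cannot find transformer_engine" error logged there.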