From 36bbf10d8e2d603dc757c2ff106258282f24c2f9 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 11:48:58 -0500
Subject: [PATCH 01/26] add llama init template

---
 examples/benchmarks/pytorch_llama2.py         |  41 +++
 .../benchmarks/model_benchmarks/__init__.py   |   2 +-
 .../model_benchmarks/pytorch_llama.py         | 239 ++++++++++++++++++
 3 files changed, 281 insertions(+), 1 deletion(-)
 create mode 100644 examples/benchmarks/pytorch_llama2.py
 create mode 100644 superbench/benchmarks/model_benchmarks/pytorch_llama.py

diff --git a/examples/benchmarks/pytorch_llama2.py b/examples/benchmarks/pytorch_llama2.py
new file mode 100644
index 000000000..04db22574
--- /dev/null
+++ b/examples/benchmarks/pytorch_llama2.py
@@ -0,0 +1,41 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Model benchmark example for Llama2-7b (24-layer, 3072-hidden, 24-heads, 7B parameters).
+
+Commands to run:
+  python3 examples/benchmarks/pytorch_llama2.py (Single GPU)
+  python3 -m torch.distributed.launch --use_env --nproc_per_node=8 examples/benchmarks/pytorch_llama2.py \
+      --distributed (Distributed)
+"""
+
+import argparse
+
+from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--distributed', action='store_true', default=False, help='Whether to enable distributed training.'
+    )
+    args = parser.parse_args()
+
+    # Specify the model name and benchmark parameters.
+    model_name = 'llama2-7b'
+    parameters = '--batch_size 1 --duration 120 --seq_len 512 --precision float16'
+    if args.distributed:
+        parameters += ' --distributed_impl ddp --distributed_backend nccl'
+
+    # Create context for Llama2 benchmark and run it for 120 seconds.
+    context = BenchmarkRegistry.create_benchmark_context(
+        model_name, platform=Platform.CUDA, parameters=parameters, framework=Framework.PYTORCH
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/superbench/benchmarks/model_benchmarks/__init__.py b/superbench/benchmarks/model_benchmarks/__init__.py
index eda0c4985..0829c4d33 100644
--- a/superbench/benchmarks/model_benchmarks/__init__.py
+++ b/superbench/benchmarks/model_benchmarks/__init__.py
@@ -10,4 +10,4 @@
 from superbench.benchmarks.model_benchmarks.pytorch_lstm import PytorchLSTM
 from superbench.benchmarks.model_benchmarks.megatron_gpt3 import MegatronGPT
 
-__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM', 'MegatronGPT']
+__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM', 'MegatronGPT', 'PytorchLlama']
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
new file mode 100644
index 000000000..73e1b3200
--- /dev/null
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -0,0 +1,239 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the Pytorch Llama2 model."""
+
+import torch
+from transformers import LlamaModel, LlamaConfig
+try:
+    import transformer_engine.pytorch as te
+    from transformer_engine.common.recipe import Format, DelayedScaling
+except ImportError:
+    te = None
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Precision
+from superbench.benchmarks.model_benchmarks.model_base import Optimizer
+from superbench.benchmarks.model_benchmarks.pytorch_base import PytorchBase
+from superbench.benchmarks.model_benchmarks.random_dataset import TorchRandomDataset
+
+
+class LlamaBenchmarkModel(torch.nn.Module):
+    """The Llama model for benchmarking."""
+    def __init__(self, config, num_classes):
+        """Constructor.
+
+        Args:
+            config (LlamaConfig): Configurations of Llama model.
+            num_classes (int): The number of objects for classification.
+        """
+        super().__init__()
+        self._llama = LlamaModel(config)
+        self._linear = torch.nn.Linear(config.hidden_size, num_classes)
+
+    def forward(self, input):
+        """Forward propagation function.
+
+        Args:
+            input (torch.LongTensor): Indices of input sequence tokens in the vocabulary,
+                shape (batch_size, sequence_length).
+
+        Return:
+            result (torch.FloatTensor): Last layer hidden-state of the first token of the sequence
+                (classification token) further processed by a Linear layer, shape (batch_size, hidden_size).
+        """
+        outputs = self._llama(input)
+        result = self._linear(outputs[0])
+        return result
+
+
+class PytorchLlama(PytorchBase):
+    """The Llama benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._config = None
+        self._fp8_recipe = None
+        self._supported_precision = [
+            Precision.FLOAT32,
+            Precision.FLOAT16,
+            Precision.FP8_HYBRID,
+            Precision.FP8_E4M3,
+        ]
+        self._optimizer_type = Optimizer.ADAMW
+        self._loss_fn = torch.nn.CrossEntropyLoss()
+
+    def add_parser_arguments(self):
+        """Add the Llama-specified arguments.
+
+        Llama2 model reference: https://huggingface.co/docs/transformers/model_doc/llama2
+        """
+        super().add_parser_arguments()
+
+        self._parser.add_argument('--num_classes', type=int, default=100, required=False, help='Num of class.')
+        self._parser.add_argument('--hidden_size', type=int, default=4096, required=False, help='Hidden size.')
+        self._parser.add_argument(
+            '--num_hidden_layers', type=int, default=32, required=False, help='The number of hidden layers.'
+        )
+        self._parser.add_argument(
+            '--num_attention_heads', type=int, default=32, required=False, help='The number of attention heads.'
+        )
+        self._parser.add_argument(
+            '--num_key_value_heads', type=int, default=None, required=False, help='The number of key_value heads that should be used to implement Grouped Query Attention.'
+        )
+        self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')
+
+    def _generate_dataset(self):
+        """Generate dataset for benchmarking according to shape info.
+
+        Return:
+            True if dataset is created successfully.
+        """
+        self._dataset = TorchRandomDataset(
+            [self._args.sample_count, self._args.seq_len], self._world_size, dtype=torch.long
+        )
+        if len(self._dataset) == 0:
+            logger.error('Generate random dataset failed - model: {}'.format(self._name))
+            return False
+
+        return True
+
+    def _create_model(self, precision):
+        """Construct the model for benchmarking.
+
+        Args:
+            precision (Precision): precision of model and input data, such as float32, float16.
+        """
+        self._config = LlamaConfig(
+            hidden_size=self._args.hidden_size, num_hidden_layers=self._args.num_hidden_layers, num_attention_heads=self._args.num_attention_heads, num_key_value_heads=self._args.num_key_value_heads
+        )
+
+        enable_fp8 = precision.name.startswith('FP8_')
+        if enable_fp8 and te is None:
+            logger.error(
+                f'Create model with fp8 failed - model: {self._name}, precision: {precision},'
+                ' message: Cannot find transformer_engine.'
+            )
+            return False
+        if enable_fp8 and not self._gpu_available:
+            logger.error(
+                f'Create model with fp8 failed - model: {self._name}, precision: {precision},'
+                ' message: FP8 is only supported on GPU.'
+            )
+            return False
+
+        try:
+            self._model = LlamaBenchmarkModel(self._config, self._args.num_classes)
+            if enable_fp8:
+                self._fp8_recipe = DelayedScaling(
+                    fp8_format=Format[precision.name.strip('FP8_')],
+                    amax_history_len=16,
+                    amax_compute_algo='max',
+                )
+                self._to_te_model(self._model.to(dtype=torch.float16))
+            else:
+                self._model = self._model.to(dtype=getattr(torch, precision.value))
+            if self._gpu_available:
+                self._model = self._model.cuda()
+        except BaseException as e:
+            logger.error(
+                'Create model with specified precision failed - model: {}, precision: {}, message: {}.'.format(
+                    self._name, precision, str(e)
+                )
+            )
+            return False
+
+        self._target = torch.LongTensor(self._args.batch_size).random_(self._args.num_classes)
+        if self._gpu_available:
+            self._target = self._target.cuda()
+
+        return True
+
+    def _train_step(self, precision):
+        """Define the training process.
+
+        Args:
+            precision (Precision): precision of model and input data, such as float32, float16.
+
+        Return:
+            The step-time list of every training step.
+        """
+        duration = []
+        curr_step = 0
+        check_frequency = 100
+        while True:
+            for idx, sample in enumerate(self._dataloader):
+                start = self._timer()
+                if self._gpu_available:
+                    sample = sample.cuda()
+                self._optimizer.zero_grad()
+                if self._fp8_recipe is not None:
+                    with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
+                        output = self._model(sample)
+                else:
+                    output = self._model(sample)
+                loss = self._loss_fn(output[range(self._args.batch_size), -1], self._target)
+                loss.backward()
+                self._optimizer.step()
+                end = self._timer()
+                curr_step += 1
+                if curr_step > self._args.num_warmup:
+                    # Save the step time of every training/inference step, unit is millisecond.
+                    duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
+                if self._is_finished(curr_step, end, check_frequency):
+                    return duration
+
+    def _inference_step(self, precision):
+        """Define the inference process.
+
+        Args:
+            precision (Precision): precision of model and input data,
+                such as float32, float16.
+
+        Return:
+            The latency list of every inference operation.
+        """
+        duration = []
+        curr_step = 0
+        with torch.no_grad():
+            self._model.eval()
+            while True:
+                for idx, sample in enumerate(self._dataloader):
+                    start = self._timer()
+                    if self._gpu_available:
+                        sample = sample.cuda()
+                    if self._fp8_recipe is not None:
+                        with te.fp8_autocast(enabled=True, fp8_recipe=self._fp8_recipe):
+                            self._model(sample)
+                    else:
+                        self._model(sample)
+                    end = self._timer()
+                    curr_step += 1
+                    if curr_step > self._args.num_warmup:
+                        # Save the step time of every training/inference step, unit is millisecond.
+                        duration.append((end - start) * 1000)
+                        self._log_step_time(curr_step, precision, duration)
+                    if self._is_finished(curr_step, end):
+                        return duration
+
+
+# Register Llama2 benchmark with 7b parameters.
+BenchmarkRegistry.register_benchmark(
+    'pytorch-llama2-7b', PytorchLlama, parameters='--hidden_size=4096 --num_hidden_layers=32 --num_attention_heads=32'
+)
+
+# Register Llama2 benchmark with 13b parameters.
+BenchmarkRegistry.register_benchmark(
+    'pytorch-llama2-13b', PytorchLlama, parameters='--hidden_size=5120 --num_hidden_layers=40 --num_attention_heads=40'
+)
+
+# Register Llama2 benchmark with 70b parameters.
+BenchmarkRegistry.register_benchmark(
+    'pytorch-llama2-70b', PytorchLlama, parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8'
+)

From 697138a2471052c357eb2356a7045d21add7ec8f Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 12:18:19 -0500
Subject: [PATCH 02/26] add llama2 unit test

---
 .../model_benchmarks/test_pytorch_llama.py    | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 tests/benchmarks/model_benchmarks/test_pytorch_llama.py

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
new file mode 100644
index 000000000..c01d8555d
--- /dev/null
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -0,0 +1,56 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for Llama model benchmarks."""
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, Platform, Framework, BenchmarkType, ReturnCode
+from superbench.benchmarks.model_benchmarks.pytorch_llama import PytorchLlama
+
+@decorator.cuda_test
+@decorator.pytorch_test
+def test_pytorch_llama_7b():
+    """Test pytorch-llama2-7b benchmark."""
+    context = BenchmarkRegistry.create_benchmark_context(
+        'pytorch-llama2-7b',
+        platform=Platform.CUDA,
+        parameters='--batch_size 1 --num_classes 100 --seq_len 512 --num_warmup 2 --num_steps 4 \
+            --model_action train inference',
+        framework=Framework.PYTORCH
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (isinstance(benchmark, PytorchLlama))
+    assert (benchmark.name == 'pytorch-llama2-7b')
+    assert (benchmark.type == BenchmarkType.MODEL)
+
+    # Check predefined parameters of llama2 7b model.
+    assert (benchmark._args.hidden_size == 4096)
+    assert (benchmark._args.num_hidden_layers == 32)
+    assert (benchmark._args.num_attention_heads == 32)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.batch_size == 1)
+    assert (benchmark._args.num_classes == 100)
+    assert (benchmark._args.seq_len == 521)
+    assert (benchmark._args.num_warmup == 2)
+    assert (benchmark._args.num_steps == 4)
+
+    # Test Dataset.
+    assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
+
+    # Check results and metrics.
+    assert (benchmark.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+    for metric in [
+        'fp32_train_step_time', 'fp32_train_throughput', 'fp16_train_step_time', 'fp16_train_throughput',
+        'fp32_inference_step_time', 'fp32_inference_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
+    ]:
+        assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
+        assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
+        assert (len(benchmark.result[metric]) == benchmark.run_count)

From 9355e22b7cd1ac62137227ddd16b0c1adeffc32f Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 15:27:38 -0500
Subject: [PATCH 03/26] fix dims for llama2 unit test

---
 superbench/benchmarks/model_benchmarks/pytorch_llama.py | 8 ++++----
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 9 ++++-----
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
index 73e1b3200..13257c204 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -76,17 +76,17 @@ def add_parser_arguments(self):
         super().add_parser_arguments()
 
         self._parser.add_argument('--num_classes', type=int, default=100, required=False, help='Num of class.')
-        self._parser.add_argument('--hidden_size', type=int, default=4096, required=False, help='Hidden size.')
+        self._parser.add_argument('--hidden_size', type=int, default=1280, required=False, help='Hidden size.')
         self._parser.add_argument(
-            '--num_hidden_layers', type=int, default=32, required=False, help='The number of hidden layers.'
+            '--num_hidden_layers', type=int, default=36, required=False, help='The number of hidden layers.'
         )
         self._parser.add_argument(
-            '--num_attention_heads', type=int, default=32, required=False, help='The number of attention heads.'
+            '--num_attention_heads', type=int, default=20, required=False, help='The number of attention heads.'
        )
+        self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')
         self._parser.add_argument(
             '--num_key_value_heads', type=int, default=None, required=False, help='The number of key_value heads that should be used to implement Grouped Query Attention.'
         )
-        self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index c01d8555d..9e2793853 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -12,9 +12,9 @@
 def test_pytorch_llama_7b():
     """Test pytorch-llama2-7b benchmark."""
     context = BenchmarkRegistry.create_benchmark_context(
-        'pytorch-llama2-7b',
+        'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --num_classes 100 --seq_len 512 --num_warmup 2 --num_steps 4 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 2 --num_steps 4 --precision float16 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )
@@ -37,7 +37,7 @@
     # Check parameters specified in BenchmarkContext.
     assert (benchmark._args.batch_size == 1)
     assert (benchmark._args.num_classes == 100)
-    assert (benchmark._args.seq_len == 521)
+    assert (benchmark._args.seq_len == 32)
     assert (benchmark._args.num_warmup == 2)
     assert (benchmark._args.num_steps == 4)
 
@@ -48,8 +48,7 @@
     assert (benchmark.run_count == 1)
     assert (benchmark.return_code == ReturnCode.SUCCESS)
     for metric in [
-        'fp32_train_step_time', 'fp32_train_throughput', 'fp16_train_step_time', 'fp16_train_throughput',
-        'fp32_inference_step_time', 'fp32_inference_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
+        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)

From 60eae3610b40a184d66fc71ef32fedd4a88cfefe Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 15:29:57 -0500
Subject: [PATCH 04/26] update transformers version for LlamaConfig

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 738095889..584aed22e 100644
--- a/setup.py
+++ b/setup.py
@@ -211,7 +211,7 @@ def run(self):
         'torch': [
             'torch>=1.7.0a0',
             'torchvision>=0.8.0a0',
-            'transformers>=4.3.3, <4.23.0',
+            'transformers>=4.28.0',
         ],
         'ort': [
             'onnx>=1.10.2',

From dadb56adb7754c4931d47cb5d200ee5895c7582a Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 15:45:13 -0500
Subject: [PATCH 05/26] update docs

---
 docs/superbench-config.mdx                        | 3 ++-
 docs/user-tutorial/benchmarks/model-benchmarks.md | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx
index 102b8d69f..051abeda3 100644
--- a/docs/superbench-config.mdx
+++ b/docs/superbench-config.mdx
@@ -328,7 +328,8 @@ A list of models to run, only supported in model-benchmark.
     shufflenet_v2_x0_5 | shufflenet_v2_x1_0 | shufflenet_v2_x1_5 | shufflenet_v2_x2_0 |
     squeezenet1_0 | squeezenet1_1 |
     vgg11 | vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn | vgg19_bn | vgg19 |
-    bert-base | bert-large | gpt2-small | gpt2-medium | gpt2-large | gpt2-xl ]
+    bert-base | bert-large | gpt2-small | gpt2-medium | gpt2-large | gpt2-xl |
+    llama2-7b | llama2-13b | llama2-70b ]
   ```
 
 * default value: `[ ]`
diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index 34fdf4c70..71e8832cf 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -13,6 +13,7 @@ id: model-benchmarks
 
 Run training or inference tasks with single or half precision for deep learning models, including the following categories:
 * GPT: gpt2-small, gpt2-medium, gpt2-large and gpt2-xl
+* LLAMA: llama2-7b, llama2-13b, llama2-70b
 * BERT: bert-base and bert-large
 * LSTM
 * CNN, listed in [`torchvision.models`](https://pytorch.org/vision/0.8/models.html), including:

From d2731a8af997c9dc38ad8ab819ea558b62200533 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Mon, 18 Nov 2024 23:02:52 -0500
Subject: [PATCH 06/26] update opset for torch onnx conversion

---
 .../benchmarks/micro_benchmarks/_export_torch_to_onnx.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index 1e37b793d..f9c9a4bed 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -9,7 +9,7 @@
 import torch.hub
 import torch.onnx
 import torchvision.models
-from transformers import BertConfig, GPT2Config
+from transformers import BertConfig, GPT2Config, ll
 
 from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel
 from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel
@@ -138,7 +138,7 @@ def export_torchvision_model(self, model_name, batch_size=1):
             model,
             dummy_input,
             file_name,
-            opset_version=10,
+            opset_version=14,
             operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
             input_names=['input'],
             output_names=['output'],
@@ -179,7 +179,7 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
             model,
             dummy_input,
             file_name,
-            opset_version=10,
+            opset_version=14,
             do_constant_folding=True,
             input_names=['input'],
             output_names=['output'],

From 3644985e32b782e6a230352e555328882e14c4f4 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 19 Nov 2024 00:27:11 -0500
Subject: [PATCH 07/26] format and lint

---
 examples/benchmarks/pytorch_llama2.py         |  2 +-
 .../model_benchmarks/pytorch_llama.py         | 17 ++++++++++++++---
 .../model_benchmarks/test_pytorch_llama.py    |  1 +
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/examples/benchmarks/pytorch_llama2.py b/examples/benchmarks/pytorch_llama2.py
index 04db22574..2290ba1a5 100644
--- a/examples/benchmarks/pytorch_llama2.py
+++ b/examples/benchmarks/pytorch_llama2.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-"""Model benchmark example for Llama2-7b (24-layer, 3072-hidden, 24-heads, 7B parameters).
+"""Model benchmark example for Llama2-7b (32-layer, 4096-hidden, 32-heads, 7B parameters).
 
 Commands to run:
   python3 examples/benchmarks/pytorch_llama2.py (Single GPU)
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
index 13257c204..9701eb59f 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -20,6 +20,7 @@
 
 class LlamaBenchmarkModel(torch.nn.Module):
     """The Llama model for benchmarking."""
+
     def __init__(self, config, num_classes):
         """Constructor.
 
@@ -49,6 +50,7 @@
 
 class PytorchLlama(PytorchBase):
     """The Llama benchmark class."""
+
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -85,7 +87,11 @@
         )
         self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.')
         self._parser.add_argument(
-            '--num_key_value_heads', type=int, default=None, required=False, help='The number of key_value heads that should be used to implement Grouped Query Attention.'
+            '--num_key_value_heads',
+            type=int,
+            default=None,
+            required=False,
+            help='The number of key_value heads that should be used to implement Grouped Query Attention.'
         )
 
     def _generate_dataset(self):
@@ -110,7 +116,10 @@
             precision (Precision): precision of model and input data, such as float32, float16.
         """
         self._config = LlamaConfig(
-            hidden_size=self._args.hidden_size, num_hidden_layers=self._args.num_hidden_layers, num_attention_heads=self._args.num_attention_heads, num_key_value_heads=self._args.num_key_value_heads
+            hidden_size=self._args.hidden_size,
+            num_hidden_layers=self._args.num_hidden_layers,
+            num_attention_heads=self._args.num_attention_heads,
+            num_key_value_heads=self._args.num_key_value_heads
         )
 
         enable_fp8 = precision.name.startswith('FP8_')
@@ -235,5 +244,7 @@
 
 # Register Llama2 benchmark with 70b parameters.
 BenchmarkRegistry.register_benchmark(
-    'pytorch-llama2-70b', PytorchLlama, parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8'
+    'pytorch-llama2-70b',
+    PytorchLlama,
+    parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8'
 )
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 9e2793853..0a17170d4 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -7,6 +7,7 @@
 from superbench.benchmarks import BenchmarkRegistry, Platform, Framework, BenchmarkType, ReturnCode
 from superbench.benchmarks.model_benchmarks.pytorch_llama import PytorchLlama
 
+
 @decorator.cuda_test
 @decorator.pytorch_test
 def test_pytorch_llama_7b():

From 52f4900e02a0a4b0c1156b02a34a96f77547248e Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 19 Nov 2024 01:13:29 -0500
Subject: [PATCH 08/26] remove remnant

---
 superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index f9c9a4bed..abb75676d 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -9,7 +9,7 @@
 import torch.hub
 import torch.onnx
 import torchvision.models
-from transformers import BertConfig, GPT2Config, ll
+from transformers import BertConfig, GPT2Config
 
 from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel
 from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel

From f826676e970b54e25d35c122f6b6e8f4fb1ece42 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 19 Nov 2024 21:30:26 -0500
Subject: [PATCH 09/26] lint fix

---
 superbench/benchmarks/model_benchmarks/pytorch_llama.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
index 9701eb59f..26fad9bb2 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -20,7 +20,6 @@
 
 class LlamaBenchmarkModel(torch.nn.Module):
     """The Llama model for benchmarking."""
-
     def __init__(self, config, num_classes):
        """Constructor.
 
@@ -50,7 +49,6 @@
 
 class PytorchLlama(PytorchBase):
     """The Llama benchmark class."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 

From 670dc769dc6c07ba3c75ed313572713760e55573 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 00:34:43 -0500
Subject: [PATCH 10/26] replace py 3.6 with 3.10 and update cuda to 12.4 for unit test

---
 .azure-pipelines/cpu-unit-test.yml  | 4 ++--
 .azure-pipelines/cuda-unit-test.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.azure-pipelines/cpu-unit-test.yml b/.azure-pipelines/cpu-unit-test.yml
index 7fc698f4f..1de67824f 100644
--- a/.azure-pipelines/cpu-unit-test.yml
+++ b/.azure-pipelines/cpu-unit-test.yml
@@ -7,12 +7,12 @@ trigger:
 
 strategy:
   matrix:
-    python-3.6:
-      imageTag: '3.6'
     python-3.7:
       imageTag: '3.7'
     python-3.8:
       imageTag: '3.8'
+    python-3.10:
+      imageTag: '3.10'
     # TODO
     #python-latest:
    #  imageTag: '3'
diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml
index e0a69fc0d..056917184 100644
--- a/.azure-pipelines/cuda-unit-test.yml
+++ b/.azure-pipelines/cuda-unit-test.yml
@@ -10,7 +10,7 @@ pool:
   vmImage: ubuntu-latest
 
 container:
-  image: nvcr.io/nvidia/pytorch:20.12-py3
+  image: nvcr.io/nvidia/pytorch:24.03-py3
   options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/'
 
 steps:

From 5b816b472da403b6780ad5baaab4918e70fddf09 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 00:43:14 -0500
Subject: [PATCH 11/26] remove 3.6 from setup, codecov and docs

---
 .codecov.yml                          | 2 --
 docs/getting-started/installation.mdx | 2 +-
 setup.py                              | 4 ++--
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/.codecov.yml b/.codecov.yml
index 81d50f8bc..50a778c4e 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -14,7 +14,6 @@ coverage:
       target: 80%
       threshold: 1%
       flags:
-        - cpu-python3.6-unit-test
         - cpu-python3.7-unit-test
         - cuda-unit-test
         - directx-unit-test
@@ -23,7 +22,6 @@
       target: 80%
       threshold: 1%
       flags:
-        - cpu-python3.6-unit-test
         - cpu-python3.7-unit-test
         - cuda-unit-test
         - directx-unit-test
diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx
index 30fdee829..0a582e92f 100644
--- a/docs/getting-started/installation.mdx
+++ b/docs/getting-started/installation.mdx
@@ -26,7 +26,7 @@ Here're the system requirements for control node.
 ### Requirements
 
 * Latest version of Linux, you're highly encouraged to use Ubuntu 18.04 or later.
-* [Python](https://www.python.org/) version 3.6 or later (which can be checked by running `python3 --version`).
+* [Python](https://www.python.org/) version 3.7 or later (which can be checked by running `python3 --version`).
 * [Pip](https://pip.pypa.io/en/stable/installing/) version 18.0 or later (which can be checked by running `python3 -m pip --version`).
 :::note
diff --git a/setup.py b/setup.py
index 584aed22e..2474dcbc1 100644
--- a/setup.py
+++ b/setup.py
@@ -131,17 +131,17 @@ def run(self):
         'Operating System :: POSIX',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3 :: Only',
-        'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
         'Topic :: System :: Benchmark',
         'Topic :: System :: Clustering',
         'Topic :: System :: Hardware',
     ],
     keywords='benchmark, AI systems',
     packages=find_packages(exclude=['tests']),
-    python_requires='>=3.6, <4',
+    python_requires='>=3.7, <4',
     use_scm_version={
         'local_scheme': 'node-and-date',
         'version_scheme': lambda _: superbench.__version__,

From f322c98d31c56a3413e035f10a0fe6dbe0f895eb Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 02:14:26 -0500
Subject: [PATCH 12/26] add llama fp8 unit test for better code coverage

---
 .../model_benchmarks/test_pytorch_llama.py    | 55 ++++++++++++++++++-
 1 file changed, 52 insertions(+), 3 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 0a17170d4..69eed1545 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -15,7 +15,7 @@ def test_pytorch_llama_7b():
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --seq_len 32 --num_warmup 2 --num_steps 4 --precision float16 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision float16 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )
@@ -39,8 +39,8 @@ def test_pytorch_llama_7b():
     assert (benchmark._args.batch_size == 1)
     assert (benchmark._args.num_classes == 100)
     assert (benchmark._args.seq_len == 32)
-    assert (benchmark._args.num_warmup == 2)
-    assert (benchmark._args.num_steps == 4)
+    assert (benchmark._args.num_warmup == 1)
+    assert (benchmark._args.num_steps == 2)
 
     # Test Dataset.
     assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
@@ -54,3 +54,52 @@ def test_pytorch_llama_7b():
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
         assert (len(benchmark.result[metric]) == benchmark.run_count)
+
+
+@decorator.cuda_test
+@decorator.pytorch_test
+def test_pytorch_llama_7b_fp8_inference():
+    """Test pytorch-llama2-7b benchmark for fp8 inference."""
+    context = BenchmarkRegistry.create_benchmark_context(
+        'llama2-7b',
+        platform=Platform.CUDA,
+        parameters='--batch_size 8 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp8_e4m3 \
+            --model_action inference',
+        framework=Framework.PYTORCH
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (isinstance(benchmark, PytorchLlama))
+    assert (benchmark.name == 'pytorch-llama2-7b')
+    assert (benchmark.type == BenchmarkType.MODEL)
+
+    # Check predefined parameters of llama2 7b model.
+    assert (benchmark._args.hidden_size == 4096)
+    assert (benchmark._args.num_hidden_layers == 32)
+    assert (benchmark._args.num_attention_heads == 32)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.batch_size == 8)
+    assert (benchmark._args.num_classes == 100)
+    assert (benchmark._args.seq_len == 32)
+    assert (benchmark._args.num_warmup == 1)
+    assert (benchmark._args.num_steps == 2)
+
+    # Test Dataset.
+    assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
+
+    # Check results and metrics.
+    assert (benchmark.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+
+    for metric in [
+        'fp8_e4m3_inference_step_time', 'fp8_e4m3_inference_throughput'
+    ]:
+        assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
+        assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
+        assert (len(benchmark.result[metric]) == benchmark.run_count)

From b28ee17f2d7eb3d4781bfbd117100f25f786d9e5 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 08:40:04 -0500
Subject: [PATCH 13/26] llama fp8 precision test only, to reduce memory required

---
 .../model_benchmarks/test_pytorch_llama.py    | 54 ++-----------------
 1 file changed, 3 insertions(+), 51 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 69eed1545..0dfb390b1 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -11,11 +11,11 @@
 @decorator.cuda_test
 @decorator.pytorch_test
 def test_pytorch_llama_7b():
-    """Test pytorch-llama2-7b benchmark."""
+    """Test pytorch-llama2-7b benchmark for fp8 inference."""
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision float16 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp8_e4m3 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )
@@ -45,60 +45,12 @@ def test_pytorch_llama_7b():
     # Test Dataset.
     assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
 
     # Check results and metrics.
     assert (benchmark.run_count == 1)
     assert (benchmark.return_code == ReturnCode.SUCCESS)
 
     for metric in [
-        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
+        'fp8_e4m3_train_step_time', 'fp8_e4m3_train_throughput', 'fp8_e4m3_inference_step_time', 'fp8_e4m3_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
         assert (len(benchmark.result[metric]) == benchmark.run_count)
-
-
-@decorator.cuda_test
-@decorator.pytorch_test
-def test_pytorch_llama_7b_fp8_inference():
-    """Test pytorch-llama2-7b benchmark for fp8 inference."""
-    context = BenchmarkRegistry.create_benchmark_context(
-        'llama2-7b',
-        platform=Platform.CUDA,
-        parameters='--batch_size 8 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp8_e4m3 \
-            --model_action inference',
-        framework=Framework.PYTORCH
-    )
-
-    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
-
-    benchmark = BenchmarkRegistry.launch_benchmark(context)
-
-    # Check basic information.
-    assert (benchmark)
-    assert (isinstance(benchmark, PytorchLlama))
-    assert (benchmark.name == 'pytorch-llama2-7b')
-    assert (benchmark.type == BenchmarkType.MODEL)
-
-    # Check predefined parameters of llama2 7b model.
-    assert (benchmark._args.hidden_size == 4096)
-    assert (benchmark._args.num_hidden_layers == 32)
-    assert (benchmark._args.num_attention_heads == 32)
-
-    # Check parameters specified in BenchmarkContext.
-    assert (benchmark._args.batch_size == 8)
-    assert (benchmark._args.num_classes == 100)
-    assert (benchmark._args.seq_len == 32)
-    assert (benchmark._args.num_warmup == 1)
-    assert (benchmark._args.num_steps == 2)
-
-    # Test Dataset.
-    assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)
-
-    # Check results and metrics.
-    assert (benchmark.run_count == 1)
-    assert (benchmark.return_code == ReturnCode.SUCCESS)
-
-    for metric in [
-        'fp8_e4m3_inference_step_time', 'fp8_e4m3_inference_throughput'
-    ]:
-        assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
-        assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
-        assert (len(benchmark.result[metric]) == benchmark.run_count)

From e6f6be3ba3af93507ae7c752f24bb9805ae81c35 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 14:00:36 -0500
Subject: [PATCH 14/26] lint fix

---
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 0dfb390b1..7ab6fcffe 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -50,7 +50,8 @@ def test_pytorch_llama_7b():
     assert (benchmark.return_code == ReturnCode.SUCCESS)
 
     for metric in [
-        'fp8_e4m3_train_step_time', 'fp8_e4m3_train_throughput', 'fp8_e4m3_inference_step_time', 'fp8_e4m3_inference_throughput'
+        'fp8_e4m3_train_step_time', 'fp8_e4m3_train_throughput', 'fp8_e4m3_inference_step_time',
+        'fp8_e4m3_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)

From 297a2293a099cdc1df2254cab2f617b8ce98eddf Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 20 Nov 2024 23:41:29 -0800
Subject: [PATCH 15/26] remove deprecated NaN usage for numpy>2.0

---
 tests/analyzer/test_summaryop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/analyzer/test_summaryop.py b/tests/analyzer/test_summaryop.py
index 3b1054444..889ebc1e8 100644
--- a/tests/analyzer/test_summaryop.py
+++ b/tests/analyzer/test_summaryop.py
@@ -4,7 +4,7 @@
 """Tests for SummaryOp module."""
 
 import unittest
-from numpy import NaN, float64
+from numpy import nan, float64
 
 import pandas as pd
 
@@ -55,7 +55,7 @@ def test_rule_op(self):
         # Test - std
         result = SummaryOp.std(raw_data_df)
         print(result)
-        expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, NaN], index=['a', 'b', 'c', 'd'], dtype=float64)
+        expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, nan], index=['a', 'b', 'c', 'd'], dtype=float64)
         pd.testing.assert_series_equal(result, expectedResult)
         # Test - count
         result = SummaryOp.count(raw_data_df)

From 5f72f51cd190d7b43633200a26aa80603dbc8460 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 01:47:21 -0800
Subject: [PATCH 16/26] fix argparse formatting related test cases failure for 3.10

---
 superbench/benchmarks/base.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 86c6b6d15..944099bb6 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -87,7 +87,12 @@ def get_configurable_settings(self):
 
         Return:
             All configurable settings in raw string.
         """
-        return self._parser.format_help().strip()
+        message = self._parser.format_help().strip()
+        # Ensure consistent header across diff python argparse format_help output.
+        if "options:" in message:
+            message = message.replace("options:", "optional arguments:")
+        return message
+
 
     def parse_args(self, ignore_invalid=False):
         """Parse the arguments.

From 8bbe32608e4e93f2fed002a94522bd791ba8559c Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 01:53:38 -0800
Subject: [PATCH 17/26] fix lint

---
 superbench/benchmarks/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 944099bb6..71ea96247 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -93,7 +93,6 @@ def get_configurable_settings(self):
             message = message.replace("options:", "optional arguments:")
         return message
-
 
     def parse_args(self, ignore_invalid=False):
         """Parse the arguments.

From 50452effc95d3576638da4a34f23e2f7fff5918c Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 02:03:18 -0800
Subject: [PATCH 18/26] fix lint

---
 superbench/benchmarks/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 71ea96247..af0443249 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -89,8 +89,8 @@ def get_configurable_settings(self):
         """
         message = self._parser.format_help().strip()
         # Ensure consistent header across diff python argparse format_help output.
-        if "options:" in message:
-            message = message.replace("options:", "optional arguments:")
+        if 'options:' in message:
+            message = message.replace('options:', 'optional arguments:')
         return message
 
     def parse_args(self, ignore_invalid=False):

From 0b1da4f67dbe39739e0ebe4b9342622537365fbe Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 10:27:48 -0800
Subject: [PATCH 19/26] add llama2 to tensorrt

---
 .../micro_benchmarks/_export_torch_to_onnx.py | 31 ++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index abb75676d..36a50f3bd 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -9,11 +9,12 @@
 import torch.hub
 import torch.onnx
 import torchvision.models
-from transformers import BertConfig, GPT2Config
+from transformers import BertConfig, GPT2Config, LlamaConfig
 
 from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel
 from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel
 from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel
+from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel
 
 
 class torch2onnxExporter():
@@ -87,6 +88,34 @@ def __init__(self):
                 ),
                 self.num_classes,
             ),
+            'llama2-7b':
+            lambda: LlamaBenchmarkModel(
+                LlamaConfig(
+                    hidden_size=4096,
+                    num_hidden_layers=32,
+                    num_attention_heads=32,
+                ),
+                self.num_classes,
+            ),
+            'llama2-13b':
+            lambda: LlamaBenchmarkModel(
+                LlamaConfig(
+                    hidden_size=5120,
+                    num_hidden_layers=40,
+                    num_attention_heads=40,
+                ),
+                self.num_classes,
+            ),
+            'llama2-70b':
+            lambda: LlamaBenchmarkModel(
+                LlamaConfig(
+                    hidden_size=8192,
+                    num_hidden_layers=80,
+                    num_attention_heads=64,
+                    num_key_value_heads=8,
+                ),
+                self.num_classes,
+ ), } self._onnx_model_path = Path(torch.hub.get_dir()) / 'onnx' self._onnx_model_path.mkdir(parents=True, exist_ok=True) From e423aecabdbc325930181e30593c690e7917b658 Mon Sep 17 00:00:00 2001 From: dpatlolla Date: Fri, 22 Nov 2024 03:00:51 +0000 Subject: [PATCH 20/26] add more params to llama config --- .../micro_benchmarks/_export_torch_to_onnx.py | 5 ++++ .../model_benchmarks/pytorch_llama.py | 26 ++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index 36a50f3bd..0f28f4f6a 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -94,6 +94,8 @@ def __init__(self): hidden_size=4096, num_hidden_layers=32, num_attention_heads=32, + num_key_value_heads=32, + intermediate_size=11008, ), self.num_classes, ), @@ -103,6 +105,8 @@ def __init__(self): hidden_size=5120, num_hidden_layers=40, num_attention_heads=40, + num_key_value_heads=40, + intermediate_size=13824, ), self.num_classes, ), @@ -113,6 +117,7 @@ def __init__(self): num_hidden_layers=80, num_attention_heads=64, num_key_value_heads=8, + intermediate_size=28672, ), self.num_classes, ), diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py index 26fad9bb2..e292105cf 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py @@ -83,6 +83,13 @@ def add_parser_arguments(self): self._parser.add_argument( '--num_attention_heads', type=int, default=20, required=False, help='The number of attention heads.' ) + self._parser.add_argument( + '--intermediate_size', + type=int, + default=11008, + required=False, + help='Dimension of the MLP representations.' + ) self._parser.add_argument('--seq_len', type=int, default=512, required=False, help='Sequence length.') self._parser.add_argument( '--num_key_value_heads', @@ -113,11 +120,15 @@ def _create_model(self, precision): Args: precision (Precision): precision of model and input data, such as float32, float16. """ + self._config = LlamaConfig( hidden_size=self._args.hidden_size, num_hidden_layers=self._args.num_hidden_layers, num_attention_heads=self._args.num_attention_heads, - num_key_value_heads=self._args.num_key_value_heads + num_key_value_heads=self._args.num_key_value_heads, + intermediate_size=self._args.intermediate_size, + max_position_embeddings=4096, # Maximum sequence length that llama2 supports + rms_norm_eps=1e-05, # Llama2 default for epsilon used by the rms normalization layers ) enable_fp8 = precision.name.startswith('FP8_') @@ -232,17 +243,24 @@ def _inference_step(self, precision): # Register Llama2 benchmark with 7b parameters. BenchmarkRegistry.register_benchmark( - 'pytorch-llama2-7b', PytorchLlama, parameters='--hidden_size=4096 --num_hidden_layers=32 --num_attention_heads=32' + 'pytorch-llama2-7b', + PytorchLlama, + parameters='--hidden_size=4096 --num_hidden_layers=32 --num_attention_heads=32 --num_key_value_heads=32 \ + --intermediate_size=11008' ) # Register Llama2 benchmark with 13b parameters. 
 BenchmarkRegistry.register_benchmark(
-    'pytorch-llama2-13b', PytorchLlama, parameters='--hidden_size=5120 --num_hidden_layers=40 --num_attention_heads=40'
+    'pytorch-llama2-13b',
+    PytorchLlama,
+    parameters='--hidden_size=5120 --num_hidden_layers=40 --num_attention_heads=40 --num_key_value_heads=40 \
+        --intermediate_size=13824'
 )
 
 # Register Llama2 benchmark with 70b parameters.
 BenchmarkRegistry.register_benchmark(
     'pytorch-llama2-70b',
     PytorchLlama,
-    parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8'
+    parameters='--hidden_size=8192 --num_hidden_layers=80 --num_attention_heads=64 --num_key_value_heads=8 \
+        --intermediate_size=28672'
 )

From d850210a9ac0fd6cd36762a13a01198ca61b69a0 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Thu, 21 Nov 2024 19:11:39 -0800
Subject: [PATCH 21/26] fix lint

---
 superbench/benchmarks/model_benchmarks/pytorch_llama.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
index e292105cf..7161aeb83 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py
@@ -120,7 +120,6 @@ def _create_model(self, precision):
         Args:
             precision (Precision): precision of model and input data, such as float32, float16.
         """
-
         self._config = LlamaConfig(
             hidden_size=self._args.hidden_size,
             num_hidden_layers=self._args.num_hidden_layers,

From 6a410875507c8a978106490b04f1db63bcaf8ec7 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 26 Nov 2024 22:39:34 -0800
Subject: [PATCH 22/26] llama test: use fp16 instead of fp8 to relax cuda CC req.

---
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 7ab6fcffe..d0f779797 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -11,11 +11,11 @@
 @decorator.cuda_test
 @decorator.pytorch_test
 def test_pytorch_llama_7b():
-    """Test pytorch-llama2-7b benchmark for fp8 inference."""
+    """Test pytorch-llama2-7b benchmark for fp16 inference."""
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp8_e4m3 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp16 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )
@@ -50,8 +50,8 @@ def test_pytorch_llama_7b():
     assert (benchmark.return_code == ReturnCode.SUCCESS)
 
     for metric in [
-        'fp8_e4m3_train_step_time', 'fp8_e4m3_train_throughput', 'fp8_e4m3_inference_step_time',
-        'fp8_e4m3_inference_throughput'
+        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time',
+        'fp16_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)

From 27c788c920e344133c2f47b4c90d4f2b4a7fbd6b Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 26 Nov 2024 22:47:02 -0800
Subject: [PATCH 23/26] fix comment and lint

---
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index d0f779797..92bd1c89d 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -11,7 +11,7 @@
 @decorator.cuda_test
 @decorator.pytorch_test
 def test_pytorch_llama_7b():
-    """Test pytorch-llama2-7b benchmark for fp16 inference."""
+    """Test pytorch-llama2-7b benchmark for fp16 train and inference."""
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
@@ -50,8 +50,7 @@ def test_pytorch_llama_7b():
     assert (benchmark.return_code == ReturnCode.SUCCESS)
 
     for metric in [
-        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time',
-        'fp16_inference_throughput'
+        'fp16_train_step_time', 'fp16_train_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
     ]:
         assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
         assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)

From 00d09ba36e2afb406df38391c010613caef271cd Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Tue, 26 Nov 2024 23:00:55 -0800
Subject: [PATCH 24/26] fix precision arg as float16

---
 tests/benchmarks/model_benchmarks/test_pytorch_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
index 92bd1c89d..a9a03d7b9 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_llama.py
@@ -15,7 +15,7 @@ def test_pytorch_llama_7b():
     context = BenchmarkRegistry.create_benchmark_context(
         'llama2-7b',
         platform=Platform.CUDA,
-        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision fp16 \
+        parameters='--batch_size 1 --seq_len 32 --num_warmup 1 --num_steps 2 --precision float16 \
             --model_action train inference',
         framework=Framework.PYTORCH
     )

From bd47fc3e17c8adb410a75070e48d675d2e1c8423 Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 27 Nov 2024 00:06:28 -0800
Subject: [PATCH 25/26] limit tokenizers version to <= 0.20.3 as 0.20.4 doesn't support py3.8

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 2474dcbc1..58e2beb7e 100644
--- a/setup.py
+++ b/setup.py
@@ -211,6 +211,7 @@ def run(self):
         'torch': [
             'torch>=1.7.0a0',
             'torchvision>=0.8.0a0',
+            'tokenizers <= 0.20.3',
             'transformers>=4.28.0',
         ],
         'ort': [

From bed3e01b53e729a3445b4fcd96e238f4571801fb Mon Sep 17 00:00:00 2001
From: dilip patlolla
Date: Wed, 27 Nov 2024 16:47:17 -0800
Subject: [PATCH 26/26] address review comments

---
 setup.py                      | 2 +-
 superbench/benchmarks/base.py | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 58e2beb7e..c6e2d1fe3 100644
--- a/setup.py
+++ b/setup.py
@@ -209,9 +209,9 @@ def run(self):
         'yapf==0.31.0',
     ],
     'torch': [
+        'tokenizers<=0.20.3',
         'torch>=1.7.0a0',
         'torchvision>=0.8.0a0',
-        'tokenizers <= 0.20.3',
         'transformers>=4.28.0',
     ],
     'ort': [
diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 649ade99d..8e6e58bfe 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -90,9 +90,6 @@ def get_configurable_settings(self):
             All configurable settings in raw string.
         """
         message = self._parser.format_help().strip()
-        # Ensure consistent header across diff python argparse format_help output.
-        if 'options:' in message:
-            message = message.replace('options:', 'optional arguments:')
         return message
 
     def parse_args(self, ignore_invalid=False):
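
After the final patch, the series leaves three registered benchmarks, pytorch-llama2-7b, pytorch-llama2-13b, and pytorch-llama2-70b, reachable through the model names llama2-7b, llama2-13b, and llama2-70b. A minimal usage sketch for driving one of them end to end follows; it assumes the same BenchmarkRegistry, Platform, and Framework API used in examples/benchmarks/pytorch_llama2.py above, and the batch size, sequence length, and step counts are illustrative values, not values mandated by the patches.

    # Minimal sketch: launching the llama2-13b benchmark registered by this series.
    # Parameter names come from add_parser_arguments() in pytorch_llama.py;
    # the values chosen here are examples only.
    from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
    from superbench.common.utils import logger

    context = BenchmarkRegistry.create_benchmark_context(
        'llama2-13b',
        platform=Platform.CUDA,
        parameters='--batch_size 1 --seq_len 512 --num_warmup 8 --num_steps 64 --precision float16',
        framework=Framework.PYTORCH,
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(benchmark.name, benchmark.return_code, benchmark.result)
        )

With transformer_engine installed and a GPU available, the same entry point also accepts --precision fp8_hybrid or fp8_e4m3, matching the FP8 guard added to _create_model in PATCH 01; otherwise model creation fails with the "Cannot find transformer_engine" error logged there.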