From 07130b87266c142b81eb1fbf7c71c6db8600a4f6 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Tue, 22 Oct 2024 12:01:26 -0500 Subject: [PATCH 01/51] Get baseline_perplexity_scores from azure sharkpublic blob --- .github/workflows/ci_eval.yaml | 6 -- sharktank/conftest.py | 10 ++-- sharktank/tests/evaluate/perplexity_test.py | 64 +++++++++++++++++---- 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index d3681d95a..be60e8f5e 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -59,9 +59,3 @@ jobs: - name: Run perplexity test run: pytest sharktank/tests/evaluate/perplexity_test.py --longrun - - - name: Update Perplexity baseline numbers - uses: actions/upload-artifact@v4 - with: - name: current_perplexity_scores_json - path: ${{ env.SHARK_PLATFORM_REPO_ROOT }}/sharktank/sharktank/evaluate/ diff --git a/sharktank/conftest.py b/sharktank/conftest.py index 040775409..b57af92a1 100644 --- a/sharktank/conftest.py +++ b/sharktank/conftest.py @@ -121,11 +121,11 @@ def pytest_addoption(parser): ) parser.addoption( - "--baseline-perplexity-score-json", + "--baseline-perplexity-scores", type=Path, action="store", - default="sharktank/tests/evaluate/baseline_perplexity_scores.json", - help="Llama3.1 8B & 405B model baseline perplexity scores json", + default="https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/evaluation/baseline_perplexity_scores.npy", + help="Llama3.1 8B & 405B model baseline perplexity scores", ) @@ -189,7 +189,7 @@ def get_model_path(request: FixtureRequest): model_path["llama3_405b_fp8_model_path"] = set_fixture_from_cli_option( request, "--llama3-405b-fp8-model-path", "llama3_405b_fp8_model" ) - model_path["baseline_perplexity_score_json"] = set_fixture_from_cli_option( - request, "--baseline-perplexity-score-json", "baseline_perplexity_score_json" + model_path["baseline_perplexity_scores"] = set_fixture_from_cli_option( + request, "--baseline-perplexity-scores", "baseline_perplexity_scores" ) return model_path diff --git a/sharktank/tests/evaluate/perplexity_test.py b/sharktank/tests/evaluate/perplexity_test.py index faf3a263f..23b56efc8 100644 --- a/sharktank/tests/evaluate/perplexity_test.py +++ b/sharktank/tests/evaluate/perplexity_test.py @@ -6,7 +6,7 @@ import unittest import pytest -import json +import numpy as np from sharktank.evaluate import perplexity @@ -19,9 +19,9 @@ def setUp(self): self.current_perplexity_all = {} self.delta = 5e-1 self.tensor_parallelism_size = 8 - - with open(self.baseline_perplexity_score_json, "r") as f: - self.baseline_perplexity = json.load(f) + self.baseline_perplexity = np.load( + self.baseline_perplexity_scores, allow_pickle=True + ).item() @longrun def test_llama3_8B_f16_decomposed(self): @@ -38,11 +38,16 @@ def test_llama3_8B_f16_decomposed(self): ] ) + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + self.assertAlmostEqual( baseline_perplexity["mean_perplexity"], current_perplexity["mean_perplexity"], delta=self.delta, - msg=f"Perplexity is deviating more than {self.delta}", + msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) @pytest.mark.xfail( @@ -64,11 +69,16 @@ def test_llama3_8B_f16_non_decomposed(self): ] ) + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + self.assertAlmostEqual( baseline_perplexity["mean_perplexity"], 
current_perplexity["mean_perplexity"], delta=self.delta, - msg=f"Perplexity is deviating more than {self.delta}", + msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) @pytest.mark.xfail( @@ -89,11 +99,16 @@ def test_llama3_8B_fp8_decomposed(self): ] ) + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + self.assertAlmostEqual( baseline_perplexity["mean_perplexity"], current_perplexity["mean_perplexity"], delta=self.delta, - msg=f"Perplexity is deviating more than {self.delta}", + msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) @pytest.mark.xfail( @@ -115,11 +130,16 @@ def test_llama3_8B_fp8_non_decomposed(self): ] ) + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + self.assertAlmostEqual( baseline_perplexity["mean_perplexity"], current_perplexity["mean_perplexity"], delta=self.delta, - msg=f"Perplexity is deviating more than {self.delta}", + msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) @longrun @@ -138,11 +158,16 @@ def test_llama3_405B_f16_decomposed(self): ] ) + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + self.assertAlmostEqual( baseline_perplexity["mean_perplexity"], current_perplexity["mean_perplexity"], delta=self.delta, - msg=f"Perplexity is deviating more than {self.delta}", + msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) @pytest.mark.xfail( @@ -165,11 +190,16 @@ def test_llama3_405B_f16_non_decomposed(self): ] ) + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + self.assertAlmostEqual( baseline_perplexity["mean_perplexity"], current_perplexity["mean_perplexity"], delta=self.delta, - msg=f"Perplexity is deviating more than {self.delta}", + msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) @pytest.mark.xfail( @@ -191,11 +221,16 @@ def test_llama3_405B_fp8_decomposed(self): ] ) + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + self.assertAlmostEqual( baseline_perplexity["mean_perplexity"], current_perplexity["mean_perplexity"], delta=self.delta, - msg=f"Perplexity is deviating more than {self.delta}", + msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) @pytest.mark.xfail( @@ -218,11 +253,16 @@ def test_llama3_405B_fp8_non_decomposed(self): ] ) + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + self.assertAlmostEqual( baseline_perplexity["mean_perplexity"], current_perplexity["mean_perplexity"], delta=self.delta, - msg=f"Perplexity is deviating more than {self.delta}", + msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) From ebe1e69d13a931799058686f2d9380da3bd73bda Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Wed, 23 Oct 2024 02:34:14 -0500 Subject: [PATCH 02/51] Add perplexity for vmfb --- .../{perplexity.py => perplexity_torch.py} | 16 +- .../sharktank/evaluate/perplexity_vmfb.py | 329 ++++++++++++++++++ 2 files changed, 336 insertions(+), 9 deletions(-) rename sharktank/sharktank/evaluate/{perplexity.py => perplexity_torch.py} (97%) create mode 100644 sharktank/sharktank/evaluate/perplexity_vmfb.py diff --git a/sharktank/sharktank/evaluate/perplexity.py b/sharktank/sharktank/evaluate/perplexity_torch.py 
similarity index 97% rename from sharktank/sharktank/evaluate/perplexity.py rename to sharktank/sharktank/evaluate/perplexity_torch.py index 2c76a76ad..41dfaba17 100644 --- a/sharktank/sharktank/evaluate/perplexity.py +++ b/sharktank/sharktank/evaluate/perplexity_torch.py @@ -42,10 +42,10 @@ logging.Formatter(fmt="\n%(levelname)s:%(name)-8s %(message)s") ) -__all__ = ["Perplexity", "run_perplexity"] +__all__ = ["Perplexity_torch", "run_perplexity_torch"] -class Perplexity: +class Perplexity_torch: """ Perplexity (PPL) is one of the most common metrics for evaluating language models. It is defined as the exponentiated average negative log-likelihood of a sequence, @@ -59,8 +59,6 @@ def __init__( device, kv_cache_type, ): - self.batch_size = 16 - self.device = device self.kv_cache_type = kv_cache_type self.activation_dtype = torch.float32 @@ -173,6 +171,8 @@ def get_logits(self): (self.token_ids != 0).int().detach().clone().to(self.device) ) + self.bs = len(self.test_prompts) + is_first_token = True start = 0 for i in tqdm( @@ -262,8 +262,6 @@ def compute_perplexity(self): def get_perplexity(self, test_prompts): self.test_prompts = test_prompts - self.bs = len(self.test_prompts) - self.get_logits() self.out_logits = self.out_logits[..., :-1, :].contiguous() @@ -281,7 +279,7 @@ def get_perplexity(self, test_prompts): return self.compute_perplexity() -def run_perplexity( +def run_perplexity_torch( dataset, tokenizer, device, @@ -289,7 +287,7 @@ def run_perplexity( tensor_parallelism_size, attention_kernel, ): - perplexity = Perplexity(device=device, kv_cache_type=kv_cache_type) + perplexity = Perplexity_torch(device=device, kv_cache_type=kv_cache_type) perplexity.load_model(dataset, tokenizer, tensor_parallelism_size, attention_kernel) test_prompts = perplexity.get_prompts() @@ -325,7 +323,7 @@ def main(argv): dataset = cli.get_input_dataset(args) tokenizer = cli.get_tokenizer(args) - ppl = run_perplexity( + ppl = run_perplexity_torch( dataset=dataset, tokenizer=tokenizer, device=device, diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_vmfb.py new file mode 100644 index 000000000..d42f77ebd --- /dev/null +++ b/sharktank/sharktank/evaluate/perplexity_vmfb.py @@ -0,0 +1,329 @@ +# Copyright 2024 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import sys +import logging +import json +import time +import random +from datetime import timedelta +from tqdm import tqdm + +import numpy as np + +from datasets import load_dataset + +import torch +from torch.nn import CrossEntropyLoss + +from sharktank.layers import * +from sharktank.types import * + +from sharktank.utils.vmfb_runner import * +from sharktank.utils import cli +from sharktank.utils.load_llm import * + +import iree.runtime as ireert + +log_levels = { + "info": logging.INFO, + "debug": logging.DEBUG, +} +logger = logging.getLogger("eval") + +logger.setLevel(log_levels["info"]) + +logger.root.handlers[0].setFormatter( + logging.Formatter(fmt="\n%(levelname)s:%(name)-8s %(message)s") +) + +__all__ = ["Perplexity", "run_perplexity"] + + +class Perplexity: + """ + Perplexity (PPL) is one of the most common metrics for evaluating language models. + It is defined as the exponentiated average negative log-likelihood of a sequence, + calculated with exponent base `e`. 
+ + For more information, see https://huggingface.co/docs/transformers/perplexity + """ + + def __init__( + self, + device, + tokenizer, + ): + self.device = device + self.tokenizer = tokenizer + self.pad_sequence_stride = 16 + self.block_seq_stride = 16 + self.free_pages = list(range(1, 8192)) + # TODO: investigate cache + self.cache_state = model.cache.paged.allocate(page_cache_size) + + def timeit(func): + def wrapper(*args, **kwargs): + start = time.time() + result = func(*args, **kwargs) + end = time.time() + seconds = end - start + time_taken = abs(timedelta(seconds=round(seconds))) + + if seconds < 1: + time_taken = f" {seconds * 1000} ms" + + func_name = func.__name__ + if func_name == "get_perplexity": + func_name = "Total time" + logger.info(f" {func_name}: {time_taken}") + return result + + return wrapper + + def print_token_comparison(self, i): + if i <= self.max_prompt_length: + batch_predicted_token_id = [[i[-1]] for i in self.batch.results] + batch_predicted_token = self.tokenizer.decode(batch_predicted_token_id) + logger.debug(f"Predicted:") + logger.debug(f"{batch_predicted_token}") + logger.debug(f"{batch_predicted_token_id}") + + expected_token_id = self.token_ids[:, i + 1 : i + 2].tolist() + expected_token = self.tokenizer.decode(expected_token_id) + logger.debug(f"Expected:") + logger.debug(f"{expected_token}") + logger.debug(f"{expected_token_id}") + + def alloc_page(self) -> int: + # Only applies for paged attention + return self.free_pages.pop() + + def pad_block_ids(self, seq_block_ids) -> torch.Tensor: + max_length = max(len(r) for r in seq_block_ids) + rows = [r + (max_length - len(r)) * [0] for r in seq_block_ids] + return torch.tensor(rows) + + @timeit + def load_model(self, vmfb_path, gguf_weight_path): + return vmfbRunner( + device=self.device, + vmfb_path=vmfb_path, + external_weight_path=gguf_weight_path, + ) + + def get_args(self, seq_lens_batch): + # Assemble the batch. + seq_stride = self.block_seq_stride + seq_block_ids: list[list[int]] = [] + for seq_len in seq_lens_batch: + blocks_needed = ( + int(math.ceil(seq_len / seq_stride)) if seq_stride > 0 else 0 + ) + row = [] + for _ in range(blocks_needed): + row.append(self.alloc_page()) + seq_block_ids.append(row) + + return seq_block_ids + + @timeit + def get_prompts(self): + test_prompts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")[ + "text" + ] + + num_test_prompts = 219 + + random.seed(0) + test_prompts = random.sample(test_prompts, num_test_prompts) + + # Ignore prompts that are: empty, less than 20 tokens or a title. 
+ test_prompts = [ + s.replace("\n", "").rstrip() + for s in test_prompts + if s != "" and len(s.split()) >= 20 and s.count("=") < 2 + ] + + logger.info(f" num_test_prompts: {len(test_prompts)}") + + return test_prompts + + @timeit + def get_logits( + self, + ): + + token_ids, seq_lens = self.tokenizer.encode( + self.test_prompts, + pad_to_multiple_of=self.pad_sequence_stride, + ) + + logger.info(f" Prompts for Evaluation:") + for idx, prompt in enumerate(self.test_prompts): + logger.info( + f" Prompt {idx}: \nTokens: {prompt.encode()}\nToken ids: {token_ids[idx]}\n" + ) + + self.max_prompt_length = max(seq_lens) + self.token_ids = torch.tensor(token_ids) + self.attention_mask = (self.token_ids != 0).int().detach().clone() + + self.bs = len(self.test_prompts) + + is_first_token = True + start = 0 + for i in tqdm( + range(start, self.max_prompt_length - 1), + desc="eval: Calculating logits", + ): + logger.debug(f"Iteration: {i}") + + if is_first_token: + + token_batch = self.token_ids[:, : i + 1] + logger.debug(f"Prefill:") + + logger.debug("Input:") + logger.debug(f"{self.tokenizer.decode(token_batch)}") + + token_batch, seq_lens_batch = self.tokenizer.pad_tokens( + token_ids=token_batch.tolist(), + pad_to_multiple_of=self.pad_sequence_stride, + ) + + logger.debug(f"{token_batch}") + + token_batch = torch.tensor(token_batch, device=self.device) + seq_lens_batch = torch.tensor(seq_lens_batch, device=self.device) + + seq_block_ids = self.get_args(seq_lens_batch) + seq_block_ids = self.pad_block_ids(seq_block_ids) + prefill_logits = self.runner.ctx.modules.module.prefill_bs4( + token_batch, seq_lens_batch, seq_block_ids, self.cache_state + ) + + self.out_logits = prefill_logits[:, -1, :] + is_first_token = False + + self.print_token_comparison(i) + + else: + token_batch = self.token_ids[:, i : i + 1] + + logger.debug("Decode:") + + logger.debug("Input:") + logger.debug(f"{self.tokenizer.decode(token_batch)}") + logger.debug(f"{token_batch.tolist()}") + + start_positions = seq_lens_batch.clone() + seq_lens_batch.add_(1) + + seq_block_ids = self.get_args(seq_lens_batch) + seq_block_ids = self.pad_block_ids(seq_block_ids) + decode_logits = self.runner.ctx.modules.module.decode_bs4( + token_batch, start_positions, seq_block_ids, self.cache_state + ) + + self.out_logits = torch.cat((self.out_logits, decode_logits), 1) + + self.print_token_comparison(i) + + pad_logits_shape = self.token_ids.shape[1] - self.out_logits.shape[1] + + self.pad_logits = torch.zeros( + self.out_logits.shape[0], pad_logits_shape, self.out_logits.shape[2] + ) + + self.out_logits = torch.cat((self.out_logits, self.pad_logits), 1).to( + self.device + ) + + @timeit + def compute_perplexity(self): + loss_fct = CrossEntropyLoss(reduction="none") + + ## perplexity = e ^ (sum(losses) / num_tokenized_tokens) + crossentropy_loss = ( + loss_fct(self.out_logits.transpose(1, 2), self.token_ids) + * self.attention_mask + ).sum(1) + crossentropy_loss = torch.tensor(crossentropy_loss.tolist()) + perplexity_batch = torch.exp( + crossentropy_loss / self.attention_mask.sum(1) + ).tolist() + + perplexity_batch = [round(ppl, 6) for ppl in perplexity_batch] + + return { + "perplexities": perplexity_batch, + "mean_perplexity": round(np.mean(perplexity_batch), 6), + } + + @timeit + def get_perplexity(self, test_prompts): + + self.test_prompts = test_prompts + self.get_logits() + + self.out_logits = self.out_logits[..., :-1, :].contiguous() + self.token_ids = self.token_ids[..., 1:].contiguous() + self.attention_mask = self.attention_mask[..., 
1:].contiguous() + + logger.debug(f"Final Logits shape: {self.out_logits.shape}") + logger.debug(f"Token ids: {self.token_ids}, \n{self.token_ids.shape}") + logger.debug( + f"Mask shape: {self.attention_mask}, \n{self.attention_mask.shape}" + ) + + assert self.token_ids.shape == self.out_logits.shape[0:2] + + return self.compute_perplexity() + + +def run_perplexity( + vmfb_path, + gguf_weight_path, + tokenizer, + device, +): + perplexity = Perplexity(device=device, tokenizer=tokenizer) + perplexity.load_model(tokenizer, vmfb_path, gguf_weight_path) + test_prompts = perplexity.get_prompts() + ppl = perplexity.get_perplexity(test_prompts=test_prompts) + + return ppl + + +def main(argv): + parser = cli.create_parser() + parser.add_argument("--device", help="Torch device (or default)") + + cli.add_tokenizer_options(parser) + args = cli.parse(parser, args=argv) + + device = torch.device(args.device) if args.device else None + tokenizer = cli.get_tokenizer(args) + + # device could be local-sync:// local-task:// + device = "hip://GPU-34346462-3466-6333-3231-353561336563" + vmfb_path = "/home/aramalin/SHARK-Platform/artifacts/llama70b_q4_1.vmfb" + gguf_weight_path = "/data/extra/models/llama70b_q4_1.gguf" + + ppl = run_perplexity( + vmfb_path=vmfb_path, + gguf_weight_path=gguf_weight_path, + tokenizer=tokenizer, + device=device, + ) + + logger.info(f"\n{json.dumps(ppl, indent=2)}") + return ppl + + +if __name__ == "__main__": + main(sys.argv[1:]) From aa47d6797d24c3e48c215824bd95f13e32d2268b Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Wed, 23 Oct 2024 02:39:24 -0500 Subject: [PATCH 03/51] Add vmfb runner script --- sharktank/sharktank/utils/vmfb_runner.py | 82 ++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 sharktank/sharktank/utils/vmfb_runner.py diff --git a/sharktank/sharktank/utils/vmfb_runner.py b/sharktank/sharktank/utils/vmfb_runner.py new file mode 100644 index 000000000..cdbf96c9d --- /dev/null +++ b/sharktank/sharktank/utils/vmfb_runner.py @@ -0,0 +1,82 @@ +from iree import runtime as ireert +from iree.runtime._binding import create_hal_driver + + +class vmfbRunner: + def __init__(self, device, vmfb_path, external_weight_path=None, extra_plugin=None): + + # If an extra plugin is requested, add a global flag to load the plugin + # and create the driver using the non-caching creation function, as + # the caching creation function may ignore the flag. + if extra_plugin: + ireert.flags.parse_flags(f"--executable_plugin={extra_plugin}") + haldriver = create_hal_driver(device) + + # No plugin requested: create the driver with the caching create + # function. 
+ else: + haldriver = ireert.get_driver(device) + if "://" in device: + try: + device_idx = int(device.split("://")[-1]) + device_uri = None + except: + device_idx = None + device_uri = device.split("://")[-1] + else: + device_idx = 0 + device_uri = None + if device_uri: + if not any(x in device for x in ["cpu", "task"]): + allocators = ["caching"] + haldevice = haldriver.create_device_by_uri( + device_uri, allocators=allocators + ) + else: + haldevice = haldriver.create_device_by_uri(device_uri) + else: + hal_device_id = haldriver.query_available_devices()[device_idx]["device_id"] + if not any(x in device for x in ["cpu", "task"]): + allocators = ["caching"] + haldevice = haldriver.create_device( + hal_device_id, allocators=allocators + ) + else: + haldevice = haldriver.create_device(hal_device_id) + + self.config = ireert.Config(device=haldevice) + mods = [] + if not isinstance(vmfb_path, list): + vmfb_path = [vmfb_path] + for path in vmfb_path: + mods.append(ireert.VmModule.mmap(self.config.vm_instance, path)) + vm_modules = [ + *mods, + ireert.create_hal_module(self.config.vm_instance, self.config.device), + ] + + # TODO: Enable multiple weight files + if external_weight_path: + index = ireert.ParameterIndex() + if not isinstance(external_weight_path, list): + external_weight_path = [external_weight_path] + for i, path in enumerate(external_weight_path): + if path in ["", None]: + continue + index.load(path) + # TODO: extend scope + param_module = ireert.create_io_parameters_module( + self.config.vm_instance, index.create_provider(scope="model") + ) + vm_modules.insert(i, param_module) + del param_module + del index + + self.ctx = ireert.SystemContext( + vm_modules=vm_modules, + config=self.config, + ) + + def unload(self): + self.ctx = None + self.config = None From 1a7933a6494dcff09c25645844394c12297c7d52 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Wed, 23 Oct 2024 02:48:44 -0500 Subject: [PATCH 04/51] Update test --- sharktank/tests/evaluate/perplexity_test.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sharktank/tests/evaluate/perplexity_test.py b/sharktank/tests/evaluate/perplexity_test.py index 23b56efc8..bf5ede529 100644 --- a/sharktank/tests/evaluate/perplexity_test.py +++ b/sharktank/tests/evaluate/perplexity_test.py @@ -8,7 +8,7 @@ import pytest import numpy as np -from sharktank.evaluate import perplexity +from sharktank.evaluate import perplexity_torch longrun = pytest.mark.skipif("not config.getoption('longrun')") @@ -31,7 +31,7 @@ def test_llama3_8B_f16_decomposed(self): model_name = "llama3_8B_f16_decomposed" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity.main( + current_perplexity = perplexity_torch.main( [ f"--gguf-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", @@ -61,7 +61,7 @@ def test_llama3_8B_f16_non_decomposed(self): model_name = "llama3_8B_f16_non_decomposed" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity.main( + current_perplexity = perplexity_torch.main( [ f"--gguf-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", @@ -92,7 +92,7 @@ def test_llama3_8B_fp8_decomposed(self): model_name = "llama3_8B_fp8_decomposed" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity.main( + current_perplexity = perplexity_torch.main( [ f"--gguf-file={self.llama3_8b_fp8_model}", 
f"--tokenizer-config-json={self.llama3_8b_tokenizer}", @@ -122,7 +122,7 @@ def test_llama3_8B_fp8_non_decomposed(self): model_name = "llama3_8B_fp8_non_decomposed" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity.main( + current_perplexity = perplexity_torch.main( [ f"--gguf-file={self.llama3_8b_fp8_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", @@ -150,7 +150,7 @@ def test_llama3_405B_f16_decomposed(self): model_name = "llama3_405B_f16_decomposed" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity.main( + current_perplexity = perplexity_torch.main( [ f"--gguf-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", @@ -181,7 +181,7 @@ def test_llama3_405B_f16_non_decomposed(self): model_name = "llama3_405B_f16_non_decomposed" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity.main( + current_perplexity = perplexity_torch.main( [ f"--gguf-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", @@ -213,7 +213,7 @@ def test_llama3_405B_fp8_decomposed(self): model_name = "llama3_405B_fp8_decomposed" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity.main( + current_perplexity = perplexity_torch.main( [ f"--gguf-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", @@ -244,7 +244,7 @@ def test_llama3_405B_fp8_non_decomposed(self): model_name = "llama3_405B_fp8_non_decomposed" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity.main( + current_perplexity = perplexity_torch.main( [ f"--gguf-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", From 026318ad7a6db34b396aadb829cc3657bde2b6de Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Wed, 23 Oct 2024 03:02:22 -0500 Subject: [PATCH 05/51] Rename perplexity torch test --- .github/workflows/ci_eval.yaml | 2 +- .../evaluate/{perplexity_test.py => perplexity_torch_test.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename sharktank/tests/evaluate/{perplexity_test.py => perplexity_torch_test.py} (100%) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 9181d5b72..66d264758 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -59,4 +59,4 @@ jobs: pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - name: Run perplexity test - run: pytest sharktank/tests/evaluate/perplexity_test.py --longrun + run: pytest sharktank/tests/evaluate/perplexity_torch_test.py --longrun diff --git a/sharktank/tests/evaluate/perplexity_test.py b/sharktank/tests/evaluate/perplexity_torch_test.py similarity index 100% rename from sharktank/tests/evaluate/perplexity_test.py rename to sharktank/tests/evaluate/perplexity_torch_test.py From 089f590cea2d3cb3547f23195f5ef12762a49e88 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 02:54:30 -0500 Subject: [PATCH 06/51] Revert npy to json --- sharktank/tests/evaluate/perplexity_torch_test.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sharktank/tests/evaluate/perplexity_torch_test.py b/sharktank/tests/evaluate/perplexity_torch_test.py index bf5ede529..61e5f8f32 100644 --- a/sharktank/tests/evaluate/perplexity_torch_test.py +++ b/sharktank/tests/evaluate/perplexity_torch_test.py @@ -6,7 +6,7 
@@ import unittest import pytest -import numpy as np +import json from sharktank.evaluate import perplexity_torch @@ -19,9 +19,8 @@ def setUp(self): self.current_perplexity_all = {} self.delta = 5e-1 self.tensor_parallelism_size = 8 - self.baseline_perplexity = np.load( - self.baseline_perplexity_scores, allow_pickle=True - ).item() + with open(self.baseline_perplexity_scores, "r") as f: + self.baseline_perplexity = json.load(f) @longrun def test_llama3_8B_f16_decomposed(self): From 1711f85683cb495f757867de3349db8763b6090a Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 04:03:52 -0500 Subject: [PATCH 07/51] Update gguf to irpa --- .../tests/evaluate/perplexity_torch_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sharktank/tests/evaluate/perplexity_torch_test.py b/sharktank/tests/evaluate/perplexity_torch_test.py index 61e5f8f32..3c5ff0cda 100644 --- a/sharktank/tests/evaluate/perplexity_torch_test.py +++ b/sharktank/tests/evaluate/perplexity_torch_test.py @@ -32,7 +32,7 @@ def test_llama3_8B_f16_decomposed(self): current_perplexity = perplexity_torch.main( [ - f"--gguf-file={self.llama3_8b_f16_model}", + f"--irpa-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", ] ) @@ -62,7 +62,7 @@ def test_llama3_8B_f16_non_decomposed(self): current_perplexity = perplexity_torch.main( [ - f"--gguf-file={self.llama3_8b_f16_model}", + f"--irpa-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--attention-kernel=torch_sdpa", ] @@ -93,7 +93,7 @@ def test_llama3_8B_fp8_decomposed(self): current_perplexity = perplexity_torch.main( [ - f"--gguf-file={self.llama3_8b_fp8_model}", + f"--irpa-file={self.llama3_8b_fp8_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", ] ) @@ -123,7 +123,7 @@ def test_llama3_8B_fp8_non_decomposed(self): current_perplexity = perplexity_torch.main( [ - f"--gguf-file={self.llama3_8b_fp8_model}", + f"--irpa-file={self.llama3_8b_fp8_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--attention-kernel=torch_sdpa", ] @@ -151,7 +151,7 @@ def test_llama3_405B_f16_decomposed(self): current_perplexity = perplexity_torch.main( [ - f"--gguf-file={self.llama3_405b_f16_model}", + f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", ] @@ -182,7 +182,7 @@ def test_llama3_405B_f16_non_decomposed(self): current_perplexity = perplexity_torch.main( [ - f"--gguf-file={self.llama3_405b_f16_model}", + f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", f"--attention-kernel=torch_sdpa", @@ -214,7 +214,7 @@ def test_llama3_405B_fp8_decomposed(self): current_perplexity = perplexity_torch.main( [ - f"--gguf-file={self.llama3_405b_fp8_model}", + f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", ] @@ -245,7 +245,7 @@ def test_llama3_405B_fp8_non_decomposed(self): current_perplexity = perplexity_torch.main( [ - f"--gguf-file={self.llama3_405b_fp8_model}", + f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", f"--attention-kernel=torch_sdpa", From 74b376fc457bf1c7bac57dbdfdb7e4620e27679e Mon Sep 17 
00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 04:09:01 -0500 Subject: [PATCH 08/51] Add vmfb test --- .github/workflows/ci_eval.yaml | 6 +- sharktank/conftest.py | 54 +++- .../tests/evaluate/perplexity_vmfb_test.py | 281 ++++++++++++++++++ 3 files changed, 328 insertions(+), 13 deletions(-) create mode 100644 sharktank/tests/evaluate/perplexity_vmfb_test.py diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 66d264758..597fd0b99 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -58,5 +58,7 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - - name: Run perplexity test - run: pytest sharktank/tests/evaluate/perplexity_torch_test.py --longrun + - name: Run perplexity test in eager mode + run: pytest -n 4 -v sharktank/tests/evaluate/perplexity_torch_test.py --longrun + - name: Run perplexity test with vmfb + run: pytest -n 4 -v sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun diff --git a/sharktank/conftest.py b/sharktank/conftest.py index b57af92a1..842d97e73 100644 --- a/sharktank/conftest.py +++ b/sharktank/conftest.py @@ -81,11 +81,11 @@ def pytest_addoption(parser): ) parser.addoption( - "--llama3-8b-f16-gguf-path", + "--llama3-8b-f16-model-path", type=Path, action="store", - default="/data/extra/models/llama3.1_8B/llama8b_f16.gguf", - help="Llama3.1 8b gguf model path, defaults to 30F CI system path", + default="/data/extra/models/llama3.1_8B/llama8b_f16.irpa", + help="Llama3.1 8b model path, defaults to 30F CI system path", ) parser.addoption( @@ -105,11 +105,11 @@ def pytest_addoption(parser): ) parser.addoption( - "--llama3-405b-f16-gguf-path", + "--llama3-405b-f16-model-path", type=Path, action="store", - default="/data/extra/models/llama3.1_405B/llama405b_fp16.gguf", - help="Llama3.1 405b gguf model path, defaults to 30F CI system path", + default="/data/extra/models/llama3.1_405B/llama405b_fp16.irpa", + help="Llama3.1 405b model path, defaults to 30F CI system path", ) parser.addoption( @@ -124,10 +124,33 @@ def pytest_addoption(parser): "--baseline-perplexity-scores", type=Path, action="store", - default="https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/evaluation/baseline_perplexity_scores.npy", + default="/home/aramalin/SHARK-Platform/sharktank/tests/evaluate/baseline_perplexity_scores.json", help="Llama3.1 8B & 405B model baseline perplexity scores", ) + parser.addoption( + "--llama3-8b-f16-vmfb-path", + type=Path, + action="store", + default="/data/extra/models/llama3.1_8B/llama8b_f16.vmfb", + help="Llama3.1 8b fp16 vmfb path, defaults to 30F CI system path", + ) + + parser.addoption( + "--llama3-405b-f16-vmfb-path", + type=Path, + action="store", + default="/data/extra/models/llama3.1_405B/llama405b_fp16.vmfb", + help="Llama3.1 405b fp16 vmfb path, defaults to 30F CI system path", + ) + + parser.addoption( + "--iree-device", + type=str, + action="store", + help="List an IREE device from iree-run-module --list_devices", + ) + def set_fixture_from_cli_option( request: FixtureRequest, @@ -174,8 +197,8 @@ def get_model_path(request: FixtureRequest): model_path["llama3_8b_tokenizer_path"] = set_fixture_from_cli_option( request, "--llama3-8b-tokenizer-path", "llama3_8b_tokenizer" ) - model_path["llama3_8b_f16_gguf_path"] = set_fixture_from_cli_option( - request, "--llama3-8b-f16-gguf-path", "llama3_8b_f16_model" + model_path["llama3_8b_f16_model_path"] 
= set_fixture_from_cli_option( + request, "--llama3-8b-f16-model-path", "llama3_8b_f16_model" ) model_path["llama3_8b_fp8_model_path"] = set_fixture_from_cli_option( request, "--llama3-8b-fp8-model-path", "llama3_8b_fp8_model" @@ -183,8 +206,8 @@ def get_model_path(request: FixtureRequest): model_path["llama3_405b_tokenizer_path"] = set_fixture_from_cli_option( request, "--llama3-405b-tokenizer-path", "llama3_405b_tokenizer" ) - model_path["llama3_405b_f16_gguf_path"] = set_fixture_from_cli_option( - request, "--llama3-405b-f16-gguf-path", "llama3_405b_f16_model" + model_path["llama3_405b_f16_model_path"] = set_fixture_from_cli_option( + request, "--llama3-405b-f16-model-path", "llama3_405b_f16_model" ) model_path["llama3_405b_fp8_model_path"] = set_fixture_from_cli_option( request, "--llama3-405b-fp8-model-path", "llama3_405b_fp8_model" @@ -192,4 +215,13 @@ def get_model_path(request: FixtureRequest): model_path["baseline_perplexity_scores"] = set_fixture_from_cli_option( request, "--baseline-perplexity-scores", "baseline_perplexity_scores" ) + model_path["llama3_8b_f16_vmfb"] = set_fixture_from_cli_option( + request, "--llama3-8b-f16-vmfb-path", "llama3_8b_f16_vmfb" + ) + model_path["llama3_405b_f16_vmfb"] = set_fixture_from_cli_option( + request, "--llama3-405b-f16-vmfb-path", "llama3_405b_f16_vmfb" + ) + model_path["iree_device"] = set_fixture_from_cli_option( + request, "--iree-device", "iree_device" + ) return model_path diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_vmfb_test.py new file mode 100644 index 000000000..a52c9d63e --- /dev/null +++ b/sharktank/tests/evaluate/perplexity_vmfb_test.py @@ -0,0 +1,281 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import unittest +import pytest +import json + +from sharktank.evaluate import perplexity_vmfb + +longrun = pytest.mark.skipif("not config.getoption('longrun')") + + +@pytest.mark.usefixtures("get_model_path") +class PerplexityTest(unittest.TestCase): + def setUp(self): + self.current_perplexity_all = {} + self.delta = 5e-1 + self.tensor_parallelism_size = 8 + with open(self.baseline_perplexity_scores, "r") as f: + self.baseline_perplexity = json.load(f) + + @longrun + def test_llama3_8B_f16_decomposed(self): + + # Llama 3.1 8B decomposed + + model_name = "llama3_8B_f16_decomposed_vmfb" + baseline_perplexity = self.baseline_perplexity[model_name] + + current_perplexity = perplexity_vmfb.main( + [ + f"--vmfb-path={self.llama3_8b_f16_vmfb}", + f"--irpa-file={self.llama3_8b_f16_model}", + f"--tokenizer-config-json={self.llama3_8b_tokenizer}", + f"--iree-device={self.iree_device}", + ] + ) + + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + + self.assertAlmostEqual( + baseline_perplexity["mean_perplexity"], + current_perplexity["mean_perplexity"], + delta=self.delta, + msg=f"Current perplexity deviates baseline by {perplexity_difference}", + ) + + @pytest.mark.xfail( + reason="Non-decomposed attention is not supported yet", + ) + @longrun + def test_llama3_8B_f16_non_decomposed(self): + + # Llama 3.1 8B non-decomposed + + model_name = "llama3_8B_f16_non_decomposed_vmfb" + baseline_perplexity = self.baseline_perplexity[model_name] + + current_perplexity = perplexity_vmfb.main( + [ + f"--vmfb-path={self.llama3_8b_f16_vmfb}", + f"--irpa-file={self.llama3_8b_f16_model}", + f"--tokenizer-config-json={self.llama3_8b_tokenizer}", + f"--iree-device={self.iree_device}", + ] + ) + + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + + self.assertAlmostEqual( + baseline_perplexity["mean_perplexity"], + current_perplexity["mean_perplexity"], + delta=self.delta, + msg=f"Current perplexity deviates baseline by {perplexity_difference}", + ) + + @pytest.mark.xfail( + reason="FP8 model is unsupported", + ) + @longrun + def test_llama3_8B_fp8_decomposed(self): + + # Llama 3.1 8B decomposed + + model_name = "llama3_8B_fp8_decomposed_vmfb" + baseline_perplexity = self.baseline_perplexity[model_name] + + current_perplexity = perplexity_vmfb.main( + [ + f"--vmfb-path={self.llama3_8b_fp8_vmfb}", + f"--irpa-file={self.llama3_8b_fp8_model}", + f"--tokenizer-config-json={self.llama3_8b_tokenizer}", + f"--iree-device={self.iree_device}", + ] + ) + + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + + self.assertAlmostEqual( + baseline_perplexity["mean_perplexity"], + current_perplexity["mean_perplexity"], + delta=self.delta, + msg=f"Current perplexity deviates baseline by {perplexity_difference}", + ) + + @pytest.mark.xfail( + reason="Non-decomposed attention is not supported yet", + ) + @longrun + def test_llama3_8B_fp8_non_decomposed(self): + + # Llama 3.1 8B non-decomposed + + model_name = "llama3_8B_fp8_non_decomposed_vmfb" + baseline_perplexity = self.baseline_perplexity[model_name] + + current_perplexity = perplexity_vmfb.main( + [ + f"--vmfb-path={self.llama3_8b_fp8_vmfb}", + f"--irpa-file={self.llama3_8b_fp8_model}", + f"--tokenizer-config-json={self.llama3_8b_tokenizer}", + f"--iree-device={self.iree_device}", + ] + ) + + perplexity_difference = ( + 
current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + + self.assertAlmostEqual( + baseline_perplexity["mean_perplexity"], + current_perplexity["mean_perplexity"], + delta=self.delta, + msg=f"Current perplexity deviates baseline by {perplexity_difference}", + ) + + @longrun + def test_llama3_405B_f16_decomposed(self): + + # Llama 3.1 405B decomposed + + model_name = "llama3_405B_f16_decomposed_vmfb" + baseline_perplexity = self.baseline_perplexity[model_name] + + current_perplexity = perplexity_vmfb.main( + [ + f"--vmfb-path={self.llama3_405b_f16_vmfb}", + f"--irpa-file={self.llama3_405b_f16_model}", + f"--tokenizer-config-json={self.llama3_405b_tokenizer}", + f"--iree-device={self.iree_device}", + f"--tensor-parallelism-size={self.tensor_parallelism_size}", + ] + ) + + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + + self.assertAlmostEqual( + baseline_perplexity["mean_perplexity"], + current_perplexity["mean_perplexity"], + delta=self.delta, + msg=f"Current perplexity deviates baseline by {perplexity_difference}", + ) + + @pytest.mark.xfail( + reason="Non-decomposed attention is not supported yet", + ) + @longrun + def test_llama3_405B_f16_non_decomposed(self): + + # Llama 3.1 405B non-decomposed + + model_name = "llama3_405B_f16_non_decomposed_vmfb" + baseline_perplexity = self.baseline_perplexity[model_name] + + current_perplexity = perplexity_vmfb.main( + [ + f"--vmfb-path={self.llama3_405b_f16_vmfb}", + f"--irpa-file={self.llama3_405b_f16_model}", + f"--tokenizer-config-json={self.llama3_405b_tokenizer}", + f"--iree-device={self.iree_device}", + f"--tensor-parallelism-size={self.tensor_parallelism_size}", + ] + ) + + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + + self.assertAlmostEqual( + baseline_perplexity["mean_perplexity"], + current_perplexity["mean_perplexity"], + delta=self.delta, + msg=f"Current perplexity deviates baseline by {perplexity_difference}", + ) + + @pytest.mark.xfail( + reason="FP8 model is unsupported", + ) + @longrun + def test_llama3_405B_fp8_decomposed(self): + + # Llama 3.1 405B decomposed + + model_name = "llama3_405B_fp8_decomposed_vmfb" + baseline_perplexity = self.baseline_perplexity[model_name] + + current_perplexity = perplexity_vmfb.main( + [ + f"--vmfb-path={self.llama3_405b_fp8_vmfb}", + f"--irpa-file={self.llama3_405b_fp8_model}", + f"--tokenizer-config-json={self.llama3_405b_tokenizer}", + f"--iree-device={self.iree_device}", + f"--tensor-parallelism-size={self.tensor_parallelism_size}", + ] + ) + + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + + self.assertAlmostEqual( + baseline_perplexity["mean_perplexity"], + current_perplexity["mean_perplexity"], + delta=self.delta, + msg=f"Current perplexity deviates baseline by {perplexity_difference}", + ) + + @pytest.mark.xfail( + reason="Non-decomposed attention is not supported yet", + ) + @longrun + def test_llama3_405B_fp8_non_decomposed(self): + + # Llama 3.1 405B non-decomposed + + model_name = "llama3_405B_fp8_non_decomposed_vmfb" + baseline_perplexity = self.baseline_perplexity[model_name] + + current_perplexity = perplexity_vmfb.main( + [ + f"--vmfb-path={self.llama3_405b_fp8_vmfb}", + f"--irpa-file={self.llama3_405b_fp8_model}", + f"--tokenizer-config-json={self.llama3_405b_tokenizer}", + f"--iree-device={self.iree_device}", + 
"--tensor-parallelism-size={self.tensor_parallelism_size}", + ] + ) + + perplexity_difference = ( + current_perplexity["mean_perplexity"] + - baseline_perplexity["mean_perplexity"] + ) + + self.assertAlmostEqual( + baseline_perplexity["mean_perplexity"], + current_perplexity["mean_perplexity"], + delta=self.delta, + msg=f"Current perplexity deviates baseline by {perplexity_difference}", + ) + + +if __name__ == "__main__": + unittest.main() From 6a9b5b324df1073122f46d8d8927777ad0b70759 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 04:11:16 -0500 Subject: [PATCH 09/51] Reduce tqdm progress print frequency --- sharktank/sharktank/evaluate/perplexity_torch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sharktank/sharktank/evaluate/perplexity_torch.py b/sharktank/sharktank/evaluate/perplexity_torch.py index 41dfaba17..768b6b7ce 100644 --- a/sharktank/sharktank/evaluate/perplexity_torch.py +++ b/sharktank/sharktank/evaluate/perplexity_torch.py @@ -177,6 +177,7 @@ def get_logits(self): start = 0 for i in tqdm( range(start, self.max_prompt_length - 1), + miniters=50, desc="eval: Calculating logits", ): logger.debug(f"Iteration: {i}") From dfa32183b245677ea0ba0a0521d5600a6fd0705d Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 04:18:01 -0500 Subject: [PATCH 10/51] Add -s flag for pytest to display test progress --- .github/workflows/ci_eval.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 597fd0b99..af78d7413 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -59,6 +59,6 @@ jobs: pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - name: Run perplexity test in eager mode - run: pytest -n 4 -v sharktank/tests/evaluate/perplexity_torch_test.py --longrun + run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun - name: Run perplexity test with vmfb - run: pytest -n 4 -v sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun + run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun From 7c85d0dbe7a4aeaaecaa22f3f5a8b8fe77815560 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 04:41:26 -0500 Subject: [PATCH 11/51] Update vmfb perplexity --- .../sharktank/evaluate/perplexity_vmfb.py | 279 +++++++++++------- 1 file changed, 176 insertions(+), 103 deletions(-) diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_vmfb.py index d42f77ebd..5232bcc40 100644 --- a/sharktank/sharktank/evaluate/perplexity_vmfb.py +++ b/sharktank/sharktank/evaluate/perplexity_vmfb.py @@ -19,14 +19,19 @@ import torch from torch.nn import CrossEntropyLoss +from sharktank.models.llama.llama import * +from sharktank.models.mixtral.mixtral import * +from sharktank.models.grok.grok import * + +from ..models.llama.sharding import shard_theta + from sharktank.layers import * from sharktank.types import * -from sharktank.utils.vmfb_runner import * from sharktank.utils import cli +from sharktank.utils.vmfb_runner import * from sharktank.utils.load_llm import * - -import iree.runtime as ireert +from sharktank.utils.create_cache import * log_levels = { "info": logging.INFO, @@ -34,7 +39,7 @@ } logger = logging.getLogger("eval") -logger.setLevel(log_levels["info"]) +logger.setLevel(log_levels["debug"]) logger.root.handlers[0].setFormatter( logging.Formatter(fmt="\n%(levelname)s:%(name)-8s 
%(message)s") @@ -53,17 +58,14 @@ class Perplexity: """ def __init__( - self, - device, - tokenizer, + self, torch_device, iree_device, kv_cache_type, tensor_parallelism_size ): - self.device = device - self.tokenizer = tokenizer - self.pad_sequence_stride = 16 - self.block_seq_stride = 16 - self.free_pages = list(range(1, 8192)) - # TODO: investigate cache - self.cache_state = model.cache.paged.allocate(page_cache_size) + self.torch_device = torch_device + self.iree_device = iree_device + self.kv_cache_type = kv_cache_type + self.activation_dtype = torch.float32 + self.attention_dtype = torch.float32 + self.tensor_parallelism_size = tensor_parallelism_size def timeit(func): def wrapper(*args, **kwargs): @@ -87,55 +89,58 @@ def wrapper(*args, **kwargs): def print_token_comparison(self, i): if i <= self.max_prompt_length: batch_predicted_token_id = [[i[-1]] for i in self.batch.results] - batch_predicted_token = self.tokenizer.decode(batch_predicted_token_id) + batch_predicted_token = self.generator.tokenizer.decode( + batch_predicted_token_id + ) logger.debug(f"Predicted:") logger.debug(f"{batch_predicted_token}") logger.debug(f"{batch_predicted_token_id}") expected_token_id = self.token_ids[:, i + 1 : i + 2].tolist() - expected_token = self.tokenizer.decode(expected_token_id) + expected_token = self.generator.tokenizer.decode(expected_token_id) logger.debug(f"Expected:") logger.debug(f"{expected_token}") logger.debug(f"{expected_token_id}") - def alloc_page(self) -> int: - # Only applies for paged attention - return self.free_pages.pop() - - def pad_block_ids(self, seq_block_ids) -> torch.Tensor: - max_length = max(len(r) for r in seq_block_ids) - rows = [r + (max_length - len(r)) * [0] for r in seq_block_ids] - return torch.tensor(rows) - @timeit - def load_model(self, vmfb_path, gguf_weight_path): - return vmfbRunner( - device=self.device, - vmfb_path=vmfb_path, - external_weight_path=gguf_weight_path, + def load_model(self, weight_path, tokenizer, vmfb_path, weight_path_str): + + config = LlamaModelConfig( + hp=configs.LlamaHParams.from_gguf_props(weight_path.properties), + block_seq_stride=16, + kv_cache_type=self.kv_cache_type, + device=self.torch_device, + activation_dtype=self.activation_dtype, + attention_dtype=self.attention_dtype, + tensor_parallelism_size=self.tensor_parallelism_size, ) - def get_args(self, seq_lens_batch): - # Assemble the batch. 
- seq_stride = self.block_seq_stride - seq_block_ids: list[list[int]] = [] - for seq_len in seq_lens_batch: - blocks_needed = ( - int(math.ceil(seq_len / seq_stride)) if seq_stride > 0 else 0 - ) - row = [] - for _ in range(blocks_needed): - row.append(self.alloc_page()) - seq_block_ids.append(row) + if config.tensor_parallelism_size > 1: + weight_path.root_theta = shard_theta(weight_path.root_theta, config) + + theta = weight_path.root_theta + + if config.hp.expert_count: + if config.hp.model_arch == "grok": + model = PagedGrokModelV1(theta, config) + else: + model = PagedMixtralModelV1(theta, config) + else: + model = PagedLlamaModelV1(theta, config) - return seq_block_ids + self.generator = TorchGenerator(model, tokenizer) + + self.runner = vmfbRunner( + device=self.iree_device, + vmfb_path=vmfb_path, + external_weight_path=weight_path_str, + ) @timeit def get_prompts(self): test_prompts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")[ "text" ] - num_test_prompts = 219 random.seed(0) @@ -152,14 +157,87 @@ def get_prompts(self): return test_prompts + def prefill_vmfb(self, token_batch, i): + + logger.debug(f"Prefill:") + + logger.debug("Input:") + logger.debug(f"{self.generator.tokenizer.decode(token_batch)}") + + token_batch, seq_lens_batch = self.generator.tokenizer.pad_tokens( + token_ids=token_batch.tolist(), + pad_to_multiple_of=self.generator.model.cache.pad_sequence_stride, + ) + + logger.debug(f"{token_batch}") + + token_batch = torch.tensor(token_batch, device=self.torch_device) + self.seq_lens_batch = torch.tensor(seq_lens_batch, device=self.torch_device) + + self.batch = self.generator.begin_eval_batch( + token_batch=token_batch, + seq_lens_batch=self.seq_lens_batch, + bs=self.bs, + ) + + seq_block_ids = self.batch.pad_block_ids() + prefill_logits = self.runner.ctx.modules.module.prefill_bs4( + token_batch, + self.seq_lens_batch, + seq_block_ids, + self.batch.cache_state[0].to(torch.float16), + ) + + prefill_logits = torch.tensor(prefill_logits[:, 0:1, :]) + + tokens = torch.tensor( + self.generator.model.extract_tokens_from_logits( + prefill_logits, seq_lens_batch + ) + ).unsqueeze(1) + self.batch.add_result_token(tokens) + + self.print_token_comparison(i) + return prefill_logits + + def decode_vmfb(self, token_batch, i): + logger.debug("Decode:") + + logger.debug("Input:") + logger.debug(f"{self.generator.tokenizer.decode(token_batch)}") + logger.debug(f"{token_batch.tolist()}") + + start_positions = self.seq_lens_batch.clone() + self.seq_lens_batch.add_(1) + self.batch.allocate_seq_block_ids() + seq_block_ids = self.batch.pad_block_ids() + + decode_logits = self.runner.ctx.modules.module.decode_bs4( + token_batch, + self.seq_lens_batch, + start_positions, + seq_block_ids, + self.batch.cache_state[0].to(torch.float16), + ) + + decode_logits = torch.tensor(decode_logits[:, :, :]) + + tokens = torch.tensor( + self.generator.model.extract_tokens_from_logits( + decode_logits, [1] * self.bs + ), + device=self.generator.model.device, + ).unsqueeze(1) + self.batch.add_result_token(tokens) + self.print_token_comparison(i) + return decode_logits + @timeit - def get_logits( - self, - ): + def get_logits(self): - token_ids, seq_lens = self.tokenizer.encode( + token_ids, seq_lens = self.generator.tokenizer.encode( self.test_prompts, - pad_to_multiple_of=self.pad_sequence_stride, + pad_to_multiple_of=self.generator.model.cache.pad_sequence_stride, ) logger.info(f" Prompts for Evaluation:") @@ -169,8 +247,11 @@ def get_logits( ) self.max_prompt_length = max(seq_lens) - 
self.token_ids = torch.tensor(token_ids) - self.attention_mask = (self.token_ids != 0).int().detach().clone() + + self.token_ids = torch.tensor(token_ids, device=self.torch_device) + self.attention_mask = ( + (self.token_ids != 0).int().detach().clone().to(self.torch_device) + ) self.bs = len(self.test_prompts) @@ -185,54 +266,18 @@ def get_logits( if is_first_token: token_batch = self.token_ids[:, : i + 1] - logger.debug(f"Prefill:") - - logger.debug("Input:") - logger.debug(f"{self.tokenizer.decode(token_batch)}") - - token_batch, seq_lens_batch = self.tokenizer.pad_tokens( - token_ids=token_batch.tolist(), - pad_to_multiple_of=self.pad_sequence_stride, - ) - logger.debug(f"{token_batch}") + prefill_logits = self.prefill_vmfb(token_batch, i) + self.out_logits = prefill_logits - token_batch = torch.tensor(token_batch, device=self.device) - seq_lens_batch = torch.tensor(seq_lens_batch, device=self.device) - - seq_block_ids = self.get_args(seq_lens_batch) - seq_block_ids = self.pad_block_ids(seq_block_ids) - prefill_logits = self.runner.ctx.modules.module.prefill_bs4( - token_batch, seq_lens_batch, seq_block_ids, self.cache_state - ) - - self.out_logits = prefill_logits[:, -1, :] is_first_token = False - self.print_token_comparison(i) - else: token_batch = self.token_ids[:, i : i + 1] - logger.debug("Decode:") - - logger.debug("Input:") - logger.debug(f"{self.tokenizer.decode(token_batch)}") - logger.debug(f"{token_batch.tolist()}") - - start_positions = seq_lens_batch.clone() - seq_lens_batch.add_(1) - - seq_block_ids = self.get_args(seq_lens_batch) - seq_block_ids = self.pad_block_ids(seq_block_ids) - decode_logits = self.runner.ctx.modules.module.decode_bs4( - token_batch, start_positions, seq_block_ids, self.cache_state - ) - + decode_logits = self.decode_vmfb(token_batch, i) self.out_logits = torch.cat((self.out_logits, decode_logits), 1) - self.print_token_comparison(i) - pad_logits_shape = self.token_ids.shape[1] - self.out_logits.shape[1] self.pad_logits = torch.zeros( @@ -240,7 +285,7 @@ def get_logits( ) self.out_logits = torch.cat((self.out_logits, self.pad_logits), 1).to( - self.device + self.torch_device ) @timeit @@ -287,12 +332,23 @@ def get_perplexity(self, test_prompts): def run_perplexity( vmfb_path, - gguf_weight_path, + weight_path, + weight_path_str, tokenizer, - device, + torch_device, + iree_device, + kv_cache_type, + tensor_parallelism_size, ): - perplexity = Perplexity(device=device, tokenizer=tokenizer) - perplexity.load_model(tokenizer, vmfb_path, gguf_weight_path) + perplexity = Perplexity( + torch_device=torch_device, + iree_device=iree_device, + kv_cache_type=kv_cache_type, + tensor_parallelism_size=tensor_parallelism_size, + ) + + # perplexity.load_model(tokenizer, vmfb_path, weight_path) + perplexity.load_model(weight_path, tokenizer, vmfb_path, weight_path_str) test_prompts = perplexity.get_prompts() ppl = perplexity.get_perplexity(test_prompts=test_prompts) @@ -301,24 +357,41 @@ def run_perplexity( def main(argv): parser = cli.create_parser() - parser.add_argument("--device", help="Torch device (or default)") + parser.add_argument("--kv-cache-type", default="paged", help="KV cache type") + parser.add_argument("--torch-device", help="Torch device (or default)") + parser.add_argument( + "--iree-device", help="List an IREE device from iree-run-module --list_devices" + ) + parser.add_argument("--vmfb-path", help="Path to vmfb file") + parser.add_argument( + "--tensor-parallelism-size", + type=int, + default=1, + help="Number of devices for tensor parallel 
sharding.", + ) cli.add_tokenizer_options(parser) + cli.add_input_dataset_options(parser) args = cli.parse(parser, args=argv) - device = torch.device(args.device) if args.device else None + torch_device = torch.device(args.torch_device) if args.torch_device else None + iree_device = args.iree_device + kv_cache_type = args.kv_cache_type + weight_path = cli.get_input_dataset(args) tokenizer = cli.get_tokenizer(args) - # device could be local-sync:// local-task:// - device = "hip://GPU-34346462-3466-6333-3231-353561336563" - vmfb_path = "/home/aramalin/SHARK-Platform/artifacts/llama70b_q4_1.vmfb" - gguf_weight_path = "/data/extra/models/llama70b_q4_1.gguf" + vmfb_path = args.vmfb_path + weight_path_str = str(args.irpa_file) ppl = run_perplexity( vmfb_path=vmfb_path, - gguf_weight_path=gguf_weight_path, + weight_path=weight_path, + weight_path_str=weight_path_str, tokenizer=tokenizer, - device=device, + torch_device=torch_device, + iree_device=iree_device, + kv_cache_type=kv_cache_type, + tensor_parallelism_size=args.tensor_parallelism_size, ) logger.info(f"\n{json.dumps(ppl, indent=2)}") From 26b48de3126f3d93b97b63c928edf4219d4556a3 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 12:48:55 -0500 Subject: [PATCH 12/51] Address review comments --- sharktank/conftest.py | 2 +- sharktank/sharktank/evaluate/perplexity_vmfb.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sharktank/conftest.py b/sharktank/conftest.py index cd0b1918d..a32571db8 100644 --- a/sharktank/conftest.py +++ b/sharktank/conftest.py @@ -124,7 +124,7 @@ def pytest_addoption(parser): "--baseline-perplexity-scores", type=Path, action="store", - default="/home/aramalin/SHARK-Platform/sharktank/tests/evaluate/baseline_perplexity_scores.json", + default="sharktank/tests/evaluate/baseline_perplexity_scores.json", help="Llama3.1 8B & 405B model baseline perplexity scores", ) diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_vmfb.py index 5232bcc40..d20038d6f 100644 --- a/sharktank/sharktank/evaluate/perplexity_vmfb.py +++ b/sharktank/sharktank/evaluate/perplexity_vmfb.py @@ -39,7 +39,7 @@ } logger = logging.getLogger("eval") -logger.setLevel(log_levels["debug"]) +logger.setLevel(log_levels["info"]) logger.root.handlers[0].setFormatter( logging.Formatter(fmt="\n%(levelname)s:%(name)-8s %(message)s") @@ -188,7 +188,7 @@ def prefill_vmfb(self, token_batch, i): self.batch.cache_state[0].to(torch.float16), ) - prefill_logits = torch.tensor(prefill_logits[:, 0:1, :]) + prefill_logits = torch.tensor(prefill_logits[:, :, :]) tokens = torch.tensor( self.generator.model.extract_tokens_from_logits( @@ -268,7 +268,7 @@ def get_logits(self): token_batch = self.token_ids[:, : i + 1] prefill_logits = self.prefill_vmfb(token_batch, i) - self.out_logits = prefill_logits + self.out_logits = prefill_logits[:, 0:1, :] is_first_token = False @@ -347,7 +347,6 @@ def run_perplexity( tensor_parallelism_size=tensor_parallelism_size, ) - # perplexity.load_model(tokenizer, vmfb_path, weight_path) perplexity.load_model(weight_path, tokenizer, vmfb_path, weight_path_str) test_prompts = perplexity.get_prompts() ppl = perplexity.get_perplexity(test_prompts=test_prompts) From 3945f376eef3e84298d7f004c13d7d0f57496b58 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 20:57:33 -0500 Subject: [PATCH 13/51] Add export & compile tests --- .github/workflows/ci_eval.yaml | 53 ++++++- sharktank/conftest.py | 113 +++++++++++-- 
.../sharktank/evaluate/perplexity_vmfb.py | 6 +- .../tests/evaluate/export_artifacts_test.py | 149 ++++++++++++++++++ .../tests/evaluate/perplexity_torch_test.py | 4 +- .../tests/evaluate/perplexity_vmfb_test.py | 4 +- 6 files changed, 303 insertions(+), 26 deletions(-) create mode 100644 sharktank/tests/evaluate/export_artifacts_test.py diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index af78d7413..68504bdda 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -1,6 +1,7 @@ name: Evaluation Tests on: + pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. @@ -15,9 +16,9 @@ concurrency: cancel-in-progress: true jobs: - test_perplexity: + test_perplexity_torch: timeout-minutes: 600 - name: "Evaluation Tests - perplexity" + name: "Evaluation Tests - perplexity_torch" strategy: matrix: version: [3.11] @@ -60,5 +61,51 @@ jobs: - name: Run perplexity test in eager mode run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun + + test_perplexity_vmfb: + timeout-minutes: 600 + name: "Evaluation Tests - perplexity_vmfb" + strategy: + matrix: + version: [3.11] + runs-on: [llama-mi300] + fail-fast: false + runs-on: ${{matrix.runs-on}} + defaults: + run: + shell: bash + env: + PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache" + SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }} + steps: + - name: "Setting up Python" + id: setup_python + uses: actions/setup-python@v3 + with: + python-version: ${{matrix.version}} + + - name: "Checkout Code" + uses: actions/checkout@v3 + + - name: Cache Pip Packages + uses: actions/cache@v4 + id: cache-pip + with: + path: ${{ env.PIP_CACHE_DIR }} + key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }} + + - name: Install sharktank deps + run: | + python -m pip install --no-compile --upgrade pip + # Note: We install in three steps in order to satisfy requirements + # from non default locations first. Installing the PyTorch CPU + # wheels saves multiple minutes and a lot of bandwidth on runner setup. 
+ pip install --no-compile -r pytorch-cpu-requirements.txt + pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ + -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" + pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ + + - name: Export mlir and vmfb + run: pytest -n 4 -v -s sharktank/tests/evaluate/export_artifacts.py - name: Run perplexity test with vmfb - run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun + run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun diff --git a/sharktank/conftest.py b/sharktank/conftest.py index a32571db8..79d2d477b 100644 --- a/sharktank/conftest.py +++ b/sharktank/conftest.py @@ -80,6 +80,14 @@ def pytest_addoption(parser): help="Llama3.1 8b tokenizer path, defaults to 30F CI system path", ) + parser.addoption( + "--llama3-8b-json-path", + type=Path, + action="store", + default="/data/extra/models/llama3.1_8B/llama8b_test.json", + help="Llama3.1 8b fp8 parameters json path", + ) + parser.addoption( "--llama3-8b-f16-model-path", type=Path, @@ -96,6 +104,30 @@ def pytest_addoption(parser): help="Llama3.1 8b fp8 model path", ) + parser.addoption( + "--llama3-8b-f16-mlir-path", + type=Path, + action="store", + default="/data/extra/models/llama3.1_8B/llama8b_f16_test.mlir", + help="Llama3.1 8b mlir path, defaults to 30F CI system path", + ) + + parser.addoption( + "--llama3-8b-fp8-mlir-path", + type=Path, + action="store", + default=None, + help="Llama3.1 8b fp8 mlir path", + ) + + parser.addoption( + "--llama3-8b-f16-vmfb-path", + type=Path, + action="store", + default="/data/extra/models/llama3.1_8B/llama8b_f16.vmfb", + help="Llama3.1 8b fp16 vmfb path, defaults to 30F CI system path", + ) + parser.addoption( "--llama3-405b-tokenizer-path", type=Path, @@ -104,6 +136,14 @@ def pytest_addoption(parser): help="Llama3.1 405b tokenizer path, defaults to 30F CI system path", ) + parser.addoption( + "--llama3-405b-json-path", + type=Path, + action="store", + default="/data/extra/models/llama3.1_405B/llama405b_test.json", + help="Llama3.1 405b fp8 parameters json path", + ) + parser.addoption( "--llama3-405b-f16-model-path", type=Path, @@ -121,19 +161,19 @@ def pytest_addoption(parser): ) parser.addoption( - "--baseline-perplexity-scores", + "--llama3-405b-f16-mlir-path", type=Path, action="store", - default="sharktank/tests/evaluate/baseline_perplexity_scores.json", - help="Llama3.1 8B & 405B model baseline perplexity scores", + default="/data/extra/models/llama3.1_405B/llama405b_fp16_test.mlir", + help="Llama3.1 405b mlir path, defaults to 30F CI system path", ) parser.addoption( - "--llama3-8b-f16-vmfb-path", + "--llama3-405b-fp8-mlir-path", type=Path, action="store", - default="/data/extra/models/llama3.1_8B/llama8b_f16.vmfb", - help="Llama3.1 8b fp16 vmfb path, defaults to 30F CI system path", + default=None, + help="Llama3.1 405b fp8 mlir path", ) parser.addoption( @@ -144,6 +184,14 @@ def pytest_addoption(parser): help="Llama3.1 405b fp16 vmfb path, defaults to 30F CI system path", ) + parser.addoption( + "--baseline-perplexity-scores", + type=Path, + action="store", + default="sharktank/tests/evaluate/baseline_perplexity_scores.json", + help="Llama3.1 8B & 405B model baseline perplexity scores", + ) + parser.addoption( "--iree-device", type=str, @@ -158,6 +206,21 @@ def pytest_addoption(parser): help="Specify the iree-hip target version (e.g., gfx942)", ) + parser.addoption( + 
"--iree-hal-target-backends", + action="store", + default="rocm", + help="Specify the iree-hal target backend (e.g., rocm)", + ) + + parser.addoption( + "--tensor-parallelism-size", + action="store", + type=int, + default=1, + help="Number of devices for tensor parallel sharding", + ) + def set_fixture_from_cli_option( request: FixtureRequest, @@ -206,7 +269,21 @@ def iree_hip_target_type(request: FixtureRequest) -> Optional[str]: @pytest.fixture(scope="class") -def get_model_path(request: FixtureRequest): +def tensor_parallelism_size(request: FixtureRequest) -> Optional[str]: + return set_fixture_from_cli_option( + request, "tensor_parallelism_size", "tensor_parallelism_size" + ) + + +@pytest.fixture(scope="class") +def baseline_perplexity_scores(request: FixtureRequest) -> Optional[str]: + return set_fixture_from_cli_option( + request, "baseline_perplexity_scores", "baseline_perplexity_scores" + ) + + +@pytest.fixture(scope="class") +def get_model_artifacts(request: FixtureRequest): model_path = {} model_path["llama3_8b_tokenizer_path"] = set_fixture_from_cli_option( request, "--llama3-8b-tokenizer-path", "llama3_8b_tokenizer" @@ -226,16 +303,18 @@ def get_model_path(request: FixtureRequest): model_path["llama3_405b_fp8_model_path"] = set_fixture_from_cli_option( request, "--llama3-405b-fp8-model-path", "llama3_405b_fp8_model" ) - model_path["baseline_perplexity_scores"] = set_fixture_from_cli_option( - request, "--baseline-perplexity-scores", "baseline_perplexity_scores" - ) - model_path["llama3_8b_f16_vmfb"] = set_fixture_from_cli_option( - request, "--llama3-8b-f16-vmfb-path", "llama3_8b_f16_vmfb" - ) - model_path["llama3_405b_f16_vmfb"] = set_fixture_from_cli_option( - request, "--llama3-405b-f16-vmfb-path", "llama3_405b_f16_vmfb" - ) + return model_path + + +@pytest.fixture(scope="class") +def get_iree_flags(request: FixtureRequest): + model_path = {} model_path["iree_device"] = set_fixture_from_cli_option( request, "--iree-device", "iree_device" ) - return model_path + model_path["iree_hip_target"] = set_fixture_from_cli_option( + request, "--iree-hip-target", "iree_hip_target" + ) + model_path["iree_hal_target_backends"] = set_fixture_from_cli_option( + request, "--iree-hal-target-backends", "iree_hal_target_backends" + ) diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_vmfb.py index d20038d6f..75cf5ca63 100644 --- a/sharktank/sharktank/evaluate/perplexity_vmfb.py +++ b/sharktank/sharktank/evaluate/perplexity_vmfb.py @@ -358,15 +358,13 @@ def main(argv): parser = cli.create_parser() parser.add_argument("--kv-cache-type", default="paged", help="KV cache type") parser.add_argument("--torch-device", help="Torch device (or default)") - parser.add_argument( - "--iree-device", help="List an IREE device from iree-run-module --list_devices" - ) + parser.add_argument("--iree-device", help="List an IREE device, eg: 'hip://0'") parser.add_argument("--vmfb-path", help="Path to vmfb file") parser.add_argument( "--tensor-parallelism-size", type=int, default=1, - help="Number of devices for tensor parallel sharding.", + help="Number of devices for tensor parallel sharding", ) cli.add_tokenizer_options(parser) diff --git a/sharktank/tests/evaluate/export_artifacts_test.py b/sharktank/tests/evaluate/export_artifacts_test.py new file mode 100644 index 000000000..aa8a03582 --- /dev/null +++ b/sharktank/tests/evaluate/export_artifacts_test.py @@ -0,0 +1,149 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License 
v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +from pathlib import Path +import unittest +import pytest +import subprocess +import logging +import itertools + +import iree.compiler as ireec + +logger = logging.getLogger("eval") + +logger.setLevel(logging.INFO) + +# logger.root.handlers[0].setFormatter( +# logging.Formatter(fmt="\n%(levelname)s:%(name)-8s %(message)s") +# ) + +pytestmark = pytest.mark.usefixtures( + "get_model_artifacts", "get_iree_flags", "tensor_parallelism_size" +) + + +class ExportArtifacts(unittest.TestCase): + def setUp(self): + self.sharktank_dir = str( + Path(os.path.dirname(os.path.abspath(__file__))).parent.parent + ) + + def export_to_mlir( + self, + attention_kernel: str, + tensor_parallelism_size: int, + irpa_path: str, + mlir_path: str, + json_path: str, + ): + export_args = [ + "python3", + "-m", + "sharktank.examples.export_paged_llm_v1", + "--irpa-file", + irpa_path, + "--output-mlir", + mlir_path, + "--output-config", + json_path, + ] + if attention_kernel == "decomposed": + export_args.append("--attention-kernel") + export_args.append(attention_kernel) + elif self.attention_kernel == "torch_sdpa": + raise NotImplementedError("attention_kernel torch_sdpa not implemented yet") + if tensor_parallelism_size: + export_args.append("--tensor-parallelism-size") + export_args.append(str(tensor_parallelism_size)) + + cmd = subprocess.list2cmdline(export_args) + + logger.info(f"Exporting mlir:\n" f"cd {self.sharktank_dir} && {cmd}") + proc = subprocess.run( + cmd, shell=True, capture_output=True, cwd=self.sharktank_dir + ) + return_code = proc.returncode + if return_code != 0: + logger.error("Error exporting mlir: ", return_code) + + def compile_to_vmfb( + self, + mlir_path: str, + vmfb_path: str, + iree_hip_target: str, + iree_hal_target_backends: str, + ): + compile_flags = ["--iree-hip-target=" + iree_hip_target] + + try: + ireec.compile_file( + input_file=mlir_path, + target_backends=[iree_hal_target_backends], + extra_args=compile_flags, + output_file=vmfb_path, + ) + except Exception as error: + logger.error("Error invoking iree-compile: ", error) + + def create_file(self, suffix, prefix): + file_path = Path(prefix).with_suffix(suffix) + f = open(file_path, "w") + return file_path + + def test_export(self): + + model_paths = [ + self.llama3_8b_f16_model, + self.llama3_8b_fp8_model, + self.llama3_405b_f16_model, + self.llama3_405b_fp8_model, + ] + attention_kernels = ["decomposed", "torch_sdpa"] + + self.dir_path = self.sharktank_dir + "/" + "ppl_artifacts/" + temp_dir = Path(self.dir_path) + temp_dir.mkdir(parents=True, exist_ok=True) + + for model_path, attention_kernel in list( + itertools.product(model_paths, attention_kernels) + ): + model_name = ( + str(model_path).split("/")[-1].split(".")[0] + "_" + attention_kernel + ) + mlir_path = str( + self.create_file(suffix=".mlir", prefix=self.dir_path + model_name) + ) + json_path = str( + self.create_file(suffix=".json", prefix=self.dir_path + model_name) + ) + vmfb_path = str( + self.create_file(suffix=".vmfb", prefix=self.dir_path + model_name) + ) + logger.info( + f"*************************iree-compile: *************************\n {model_path}, {attention_kernel}, {self.dir_path}, {mlir_path}, {vmfb_path}, {self.iree_hal_target_backends}, {self.iree_hip_target}" + ) + + if attention_kernel == "decomposed": + self.export_to_mlir( + attention_kernel=attention_kernel, + 
tensor_parallelism_size=self.tensor_parallelism_size, + irpa_path=model_path, + mlir_path=mlir_path, + json_path=json_path, + ) + + self.compile_to_vmfb( + mlir_path=mlir_path, + vmfb_path=vmfb_path, + iree_hip_target=self.iree_hip_target, + iree_hal_target_backends=self.iree_hal_target_backends, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/sharktank/tests/evaluate/perplexity_torch_test.py b/sharktank/tests/evaluate/perplexity_torch_test.py index 3c5ff0cda..54af77a9e 100644 --- a/sharktank/tests/evaluate/perplexity_torch_test.py +++ b/sharktank/tests/evaluate/perplexity_torch_test.py @@ -13,7 +13,9 @@ longrun = pytest.mark.skipif("not config.getoption('longrun')") -@pytest.mark.usefixtures("get_model_path") +@pytest.mark.usefixtures( + "get_model_artifacts", "tensor_parallelism_size", "baseline_perplexity_scores" +) class PerplexityTest(unittest.TestCase): def setUp(self): self.current_perplexity_all = {} diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_vmfb_test.py index a52c9d63e..16bfda668 100644 --- a/sharktank/tests/evaluate/perplexity_vmfb_test.py +++ b/sharktank/tests/evaluate/perplexity_vmfb_test.py @@ -13,7 +13,9 @@ longrun = pytest.mark.skipif("not config.getoption('longrun')") -@pytest.mark.usefixtures("get_model_path") +@pytest.mark.usefixtures( + "get_model_artifacts", "tensor_parallelism_size", "baseline_perplexity_scores" +) class PerplexityTest(unittest.TestCase): def setUp(self): self.current_perplexity_all = {} From c9fa0724710d36a035ce0eed386211468c1a307f Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 21:26:37 -0500 Subject: [PATCH 14/51] Update export test script --- .github/workflows/ci_eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 68504bdda..f921fb09b 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -106,6 +106,6 @@ jobs: pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - name: Export mlir and vmfb - run: pytest -n 4 -v -s sharktank/tests/evaluate/export_artifacts.py + run: pytest -v -s sharktank/tests/evaluate/export_artifacts_test.py - name: Run perplexity test with vmfb run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun From 7f4de966e4b9d456a4d273a6c9d33f1b9fcfb516 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 21:33:29 -0500 Subject: [PATCH 15/51] Cleanup --- .github/workflows/ci_eval.yaml | 20 +++++++++---------- .../tests/evaluate/export_artifacts_test.py | 9 +++------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index f921fb09b..9a4a7df23 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -16,9 +16,9 @@ concurrency: cancel-in-progress: true jobs: - test_perplexity_torch: + test_perplexity_vmfb: timeout-minutes: 600 - name: "Evaluation Tests - perplexity_torch" + name: "Evaluation Tests - perplexity_vmfb" strategy: matrix: version: [3.11] @@ -59,12 +59,14 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - - name: Run perplexity test in eager mode - run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun + - name: Export mlir and vmfb + run: 
pytest -v -s sharktank/tests/evaluate/export_artifacts_test.py + - name: Run perplexity test with vmfb + run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun - test_perplexity_vmfb: + test_perplexity_torch: timeout-minutes: 600 - name: "Evaluation Tests - perplexity_vmfb" + name: "Evaluation Tests - perplexity_torch" strategy: matrix: version: [3.11] @@ -105,7 +107,5 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - - name: Export mlir and vmfb - run: pytest -v -s sharktank/tests/evaluate/export_artifacts_test.py - - name: Run perplexity test with vmfb - run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun + - name: Run perplexity test in eager mode + run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun diff --git a/sharktank/tests/evaluate/export_artifacts_test.py b/sharktank/tests/evaluate/export_artifacts_test.py index aa8a03582..52dde031c 100644 --- a/sharktank/tests/evaluate/export_artifacts_test.py +++ b/sharktank/tests/evaluate/export_artifacts_test.py @@ -18,9 +18,9 @@ logger.setLevel(logging.INFO) -# logger.root.handlers[0].setFormatter( -# logging.Formatter(fmt="\n%(levelname)s:%(name)-8s %(message)s") -# ) +logger.root.handlers[0].setFormatter( + logging.Formatter(fmt="\n%(levelname)s:%(name)-8s %(message)s") +) pytestmark = pytest.mark.usefixtures( "get_model_artifacts", "get_iree_flags", "tensor_parallelism_size" @@ -124,9 +124,6 @@ def test_export(self): vmfb_path = str( self.create_file(suffix=".vmfb", prefix=self.dir_path + model_name) ) - logger.info( - f"*************************iree-compile: *************************\n {model_path}, {attention_kernel}, {self.dir_path}, {mlir_path}, {vmfb_path}, {self.iree_hal_target_backends}, {self.iree_hip_target}" - ) if attention_kernel == "decomposed": self.export_to_mlir( From 1a26ed733069158f7283b00717cba5cdb9dfd0ad Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 22:26:17 -0500 Subject: [PATCH 16/51] Test export --- .github/workflows/ci_eval.yaml | 2 ++ sharktank/tests/evaluate/export_artifacts_test.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 9a4a7df23..51628bd9b 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -59,6 +59,8 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ + - name: test + run: cd /home/esaimana/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/extra/models/llama3.1_8B/llama8b_f16.irpa --output-mlir /home/esaimana/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank/ppl_artifacts/llama8b_f16_decomposed.mlir --output-config /home/esaimana/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank/ppl_artifacts/llama8b_f16_decomposed.json --attention-kernel decomposed --tensor-parallelism-size 1 - name: Export mlir and vmfb run: pytest -v -s sharktank/tests/evaluate/export_artifacts_test.py - name: Run perplexity test with vmfb diff --git a/sharktank/tests/evaluate/export_artifacts_test.py b/sharktank/tests/evaluate/export_artifacts_test.py index 52dde031c..a77ff8e31 100644 --- 
a/sharktank/tests/evaluate/export_artifacts_test.py +++ b/sharktank/tests/evaluate/export_artifacts_test.py @@ -63,6 +63,10 @@ def export_to_mlir( cmd = subprocess.list2cmdline(export_args) + logger.info( + f"export_args: {export_args}\n self.sharktank_dir: {self.sharktank_dir}" + ) + logger.info(f"Exporting mlir:\n" f"cd {self.sharktank_dir} && {cmd}") proc = subprocess.run( cmd, shell=True, capture_output=True, cwd=self.sharktank_dir From 27255126896d7bfb996eb2ad1e21b0849541709b Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 22:43:45 -0500 Subject: [PATCH 17/51] Update artifacts dir --- sharktank/tests/evaluate/export_artifacts_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sharktank/tests/evaluate/export_artifacts_test.py b/sharktank/tests/evaluate/export_artifacts_test.py index a77ff8e31..2ef651c8a 100644 --- a/sharktank/tests/evaluate/export_artifacts_test.py +++ b/sharktank/tests/evaluate/export_artifacts_test.py @@ -32,6 +32,7 @@ def setUp(self): self.sharktank_dir = str( Path(os.path.dirname(os.path.abspath(__file__))).parent.parent ) + self.artifacts_dir = "/data/extra/models/" def export_to_mlir( self, @@ -109,7 +110,7 @@ def test_export(self): ] attention_kernels = ["decomposed", "torch_sdpa"] - self.dir_path = self.sharktank_dir + "/" + "ppl_artifacts/" + self.dir_path = self.artifacts_dir + "/" + "tmp_perplexity_ci_artifacts/" temp_dir = Path(self.dir_path) temp_dir.mkdir(parents=True, exist_ok=True) From d4d1d1808703150632a175c3faffda191cc101f0 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 22:55:16 -0500 Subject: [PATCH 18/51] Add batch size --- .github/workflows/ci_eval.yaml | 2 +- sharktank/conftest.py | 13 +++++++++++++ sharktank/tests/evaluate/export_artifacts_test.py | 6 +++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 51628bd9b..a1bdb136b 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -62,7 +62,7 @@ jobs: - name: test run: cd /home/esaimana/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/extra/models/llama3.1_8B/llama8b_f16.irpa --output-mlir /home/esaimana/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank/ppl_artifacts/llama8b_f16_decomposed.mlir --output-config /home/esaimana/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank/ppl_artifacts/llama8b_f16_decomposed.json --attention-kernel decomposed --tensor-parallelism-size 1 - name: Export mlir and vmfb - run: pytest -v -s sharktank/tests/evaluate/export_artifacts_test.py + run: pytest -v -s sharktank/tests/evaluate/export_artifacts_test.py --bs 4 - name: Run perplexity test with vmfb run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun diff --git a/sharktank/conftest.py b/sharktank/conftest.py index 79d2d477b..026424693 100644 --- a/sharktank/conftest.py +++ b/sharktank/conftest.py @@ -221,6 +221,14 @@ def pytest_addoption(parser): help="Number of devices for tensor parallel sharding", ) + parser.addoption( + "--bs", + action="store", + type=int, + default=4, + help="Batch size for mlir export", + ) + def set_fixture_from_cli_option( request: FixtureRequest, @@ -282,6 +290,11 @@ def baseline_perplexity_scores(request: FixtureRequest) -> Optional[str]: ) +@pytest.fixture(scope="class") +def batch_size(request: FixtureRequest) -> 
Optional[str]: + return set_fixture_from_cli_option(request, "bs", "batch_size") + + @pytest.fixture(scope="class") def get_model_artifacts(request: FixtureRequest): model_path = {} diff --git a/sharktank/tests/evaluate/export_artifacts_test.py b/sharktank/tests/evaluate/export_artifacts_test.py index 2ef651c8a..118282b2a 100644 --- a/sharktank/tests/evaluate/export_artifacts_test.py +++ b/sharktank/tests/evaluate/export_artifacts_test.py @@ -23,7 +23,7 @@ ) pytestmark = pytest.mark.usefixtures( - "get_model_artifacts", "get_iree_flags", "tensor_parallelism_size" + "get_model_artifacts", "get_iree_flags", "tensor_parallelism_size", "batch_size" ) @@ -41,6 +41,7 @@ def export_to_mlir( irpa_path: str, mlir_path: str, json_path: str, + batch_size: int, ): export_args = [ "python3", @@ -52,6 +53,8 @@ def export_to_mlir( mlir_path, "--output-config", json_path, + "--bs", + str(batch_size), ] if attention_kernel == "decomposed": export_args.append("--attention-kernel") @@ -137,6 +140,7 @@ def test_export(self): irpa_path=model_path, mlir_path=mlir_path, json_path=json_path, + batch_size=self.batch_size, ) self.compile_to_vmfb( From 1f02051411e42ab8736e530481b9d84796ba77e0 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 23:06:04 -0500 Subject: [PATCH 19/51] Test export --- .github/workflows/ci_eval.yaml | 81 +++++++++++++++++----------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index dda1d40bb..4e2a9c362 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -64,47 +64,48 @@ jobs: - name: Run perplexity test with vmfb run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun - test_perplexity_torch: - name: "Evaluation Tests - perplexity_torch" - strategy: - matrix: - version: [3.11] - runs-on: [llama-mi300] - fail-fast: false - runs-on: ${{matrix.runs-on}} - defaults: - run: - shell: bash - env: - PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache" - SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }} - steps: - - name: "Setting up Python" - id: setup_python - uses: actions/setup-python@v3 - with: - python-version: ${{matrix.version}} + # test_perplexity_torch: + # timeout-minutes: 1000 + # name: "Evaluation Tests - perplexity_torch" + # strategy: + # matrix: + # version: [3.11] + # runs-on: [llama-mi300] + # fail-fast: false + # runs-on: ${{matrix.runs-on}} + # defaults: + # run: + # shell: bash + # env: + # PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache" + # SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }} + # steps: + # - name: "Setting up Python" + # id: setup_python + # uses: actions/setup-python@v3 + # with: + # python-version: ${{matrix.version}} - - name: "Checkout Code" - uses: actions/checkout@v3 + # - name: "Checkout Code" + # uses: actions/checkout@v3 - - name: Cache Pip Packages - uses: actions/cache@v4 - id: cache-pip - with: - path: ${{ env.PIP_CACHE_DIR }} - key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }} + # - name: Cache Pip Packages + # uses: actions/cache@v4 + # id: cache-pip + # with: + # path: ${{ env.PIP_CACHE_DIR }} + # key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }} - - name: Install sharktank deps - run: | - python -m pip install --no-compile --upgrade pip - # Note: We install in three steps in order to satisfy requirements - # from non default locations first. 
Installing the PyTorch CPU - # wheels saves multiple minutes and a lot of bandwidth on runner setup. - pip install --no-compile -r pytorch-cpu-requirements.txt - pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ - -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ + # - name: Install sharktank deps + # run: | + # python -m pip install --no-compile --upgrade pip + # # Note: We install in three steps in order to satisfy requirements + # # from non default locations first. Installing the PyTorch CPU + # # wheels saves multiple minutes and a lot of bandwidth on runner setup. + # pip install --no-compile -r pytorch-cpu-requirements.txt + # pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ + # -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" + # pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - - name: Run perplexity test in eager mode - run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun + # - name: Run perplexity test in eager mode + # run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun From 61901763066f3785216040ea976c3bfb79e3791b Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Thu, 24 Oct 2024 23:44:25 -0500 Subject: [PATCH 20/51] Remove artifacts dir --- sharktank/tests/evaluate/export_artifacts_test.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sharktank/tests/evaluate/export_artifacts_test.py b/sharktank/tests/evaluate/export_artifacts_test.py index 118282b2a..6a8ad4ce8 100644 --- a/sharktank/tests/evaluate/export_artifacts_test.py +++ b/sharktank/tests/evaluate/export_artifacts_test.py @@ -30,9 +30,8 @@ class ExportArtifacts(unittest.TestCase): def setUp(self): self.sharktank_dir = str( - Path(os.path.dirname(os.path.abspath(__file__))).parent.parent + Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent ) - self.artifacts_dir = "/data/extra/models/" def export_to_mlir( self, @@ -48,7 +47,7 @@ def export_to_mlir( "-m", "sharktank.examples.export_paged_llm_v1", "--irpa-file", - irpa_path, + str(irpa_path), "--output-mlir", mlir_path, "--output-config", @@ -71,10 +70,10 @@ def export_to_mlir( f"export_args: {export_args}\n self.sharktank_dir: {self.sharktank_dir}" ) - logger.info(f"Exporting mlir:\n" f"cd {self.sharktank_dir} && {cmd}") - proc = subprocess.run( - cmd, shell=True, capture_output=True, cwd=self.sharktank_dir - ) + cwd = self.sharktank_dir + "/sharktank" + + logger.info(f"Exporting mlir:\n" f"cd {cwd} && {cmd}") + proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd) return_code = proc.returncode if return_code != 0: logger.error("Error exporting mlir: ", return_code) @@ -113,7 +112,7 @@ def test_export(self): ] attention_kernels = ["decomposed", "torch_sdpa"] - self.dir_path = self.artifacts_dir + "/" + "tmp_perplexity_ci_artifacts/" + self.dir_path = self.sharktank_dir + "/" + "tmp_perplexity_ci_artifacts/" temp_dir = Path(self.dir_path) temp_dir.mkdir(parents=True, exist_ok=True) From 9fe2c406f56c412bc465a30430fc683b2b079cfd Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 25 Oct 2024 03:10:34 -0500 Subject: [PATCH 21/51] Remove export test and add as tool --- .github/workflows/ci_eval.yaml | 2 - .../sharktank/evaluate/perplexity_vmfb.py | 72 +++++- 
sharktank/sharktank/utils/export_artifacts.py | 135 +++++++++++ .../evaluate/baseline_perplexity_scores.json | 211 ++++++++++++++++++ .../tests/evaluate/export_artifacts_test.py | 154 ------------- .../tests/evaluate/perplexity_vmfb_test.py | 43 +++- 6 files changed, 440 insertions(+), 177 deletions(-) create mode 100644 sharktank/sharktank/utils/export_artifacts.py delete mode 100644 sharktank/tests/evaluate/export_artifacts_test.py diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 4e2a9c362..27b6c94e6 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -59,8 +59,6 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - - name: Export mlir and vmfb - run: pytest -v -s sharktank/tests/evaluate/export_artifacts_test.py --bs 4 - name: Run perplexity test with vmfb run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_vmfb.py index 75cf5ca63..92313b32d 100644 --- a/sharktank/sharktank/evaluate/perplexity_vmfb.py +++ b/sharktank/sharktank/evaluate/perplexity_vmfb.py @@ -32,6 +32,7 @@ from sharktank.utils.vmfb_runner import * from sharktank.utils.load_llm import * from sharktank.utils.create_cache import * +from sharktank.utils.export_artifacts import * log_levels = { "info": logging.INFO, @@ -58,14 +59,24 @@ class Perplexity: """ def __init__( - self, torch_device, iree_device, kv_cache_type, tensor_parallelism_size + self, + torch_device, + iree_device, + iree_hip_target, + iree_hal_target_backends, + kv_cache_type, + tensor_parallelism_size, + attention_kernel, ): self.torch_device = torch_device self.iree_device = iree_device + self.iree_hip_target = iree_hip_target + self.iree_hal_target_backends = iree_hal_target_backends self.kv_cache_type = kv_cache_type self.activation_dtype = torch.float32 self.attention_dtype = torch.float32 self.tensor_parallelism_size = tensor_parallelism_size + self.attention_kernel = attention_kernel def timeit(func): def wrapper(*args, **kwargs): @@ -102,6 +113,19 @@ def print_token_comparison(self, i): logger.debug(f"{expected_token}") logger.debug(f"{expected_token_id}") + @timeit + def compile_model(self, weight_path_str): + export_artifacts = ExportArtifacts( + irpa_path=weight_path_str, + batch_size=self.bs, + iree_hip_target=self.iree_hip_target, + iree_hal_target_backends=self.iree_hal_target_backends, + attention_kernel=self.attention_kernel, + tensor_parallelism_size=self.tensor_parallelism_size, + ) + vmfb_path = export_artifacts.get_artifacts() + return vmfb_path + @timeit def load_model(self, weight_path, tokenizer, vmfb_path, weight_path_str): @@ -130,6 +154,7 @@ def load_model(self, weight_path, tokenizer, vmfb_path, weight_path_str): self.generator = TorchGenerator(model, tokenizer) + self.weight_path_str = weight_path_str self.runner = vmfbRunner( device=self.iree_device, vmfb_path=vmfb_path, @@ -151,10 +176,12 @@ def get_prompts(self): s.replace("\n", "").rstrip() for s in test_prompts if s != "" and len(s.split()) >= 20 and s.count("=") < 2 - ] + ][0:4] logger.info(f" num_test_prompts: {len(test_prompts)}") + self.bs = len(test_prompts) + return test_prompts def prefill_vmfb(self, token_batch, i): @@ -253,8 +280,6 @@ def get_logits(self): (self.token_ids != 0).int().detach().clone().to(self.torch_device) ) - 
self.bs = len(self.test_prompts) - is_first_token = True start = 0 for i in tqdm( @@ -313,6 +338,7 @@ def compute_perplexity(self): def get_perplexity(self, test_prompts): self.test_prompts = test_prompts + self.get_logits() self.out_logits = self.out_logits[..., :-1, :].contiguous() @@ -331,25 +357,32 @@ def get_perplexity(self, test_prompts): def run_perplexity( - vmfb_path, weight_path, weight_path_str, tokenizer, torch_device, iree_device, + iree_hip_target, + iree_hal_target_backends, kv_cache_type, tensor_parallelism_size, + attention_kernel, ): perplexity = Perplexity( torch_device=torch_device, iree_device=iree_device, + iree_hip_target=iree_hip_target, + iree_hal_target_backends=iree_hal_target_backends, kv_cache_type=kv_cache_type, tensor_parallelism_size=tensor_parallelism_size, + attention_kernel=attention_kernel, ) - perplexity.load_model(weight_path, tokenizer, vmfb_path, weight_path_str) test_prompts = perplexity.get_prompts() - ppl = perplexity.get_perplexity(test_prompts=test_prompts) + + vmfb_path = perplexity.compile_model(weight_path_str) + perplexity.load_model(weight_path, tokenizer, vmfb_path, weight_path_str) + ppl = perplexity.get_perplexity(test_prompts) return ppl @@ -359,7 +392,24 @@ def main(argv): parser.add_argument("--kv-cache-type", default="paged", help="KV cache type") parser.add_argument("--torch-device", help="Torch device (or default)") parser.add_argument("--iree-device", help="List an IREE device, eg: 'hip://0'") - parser.add_argument("--vmfb-path", help="Path to vmfb file") + parser.add_argument( + "--iree-hip-target", + action="store", + default="gfx942", + help="Specify the iree-hip target version (e.g., gfx942)", + ) + parser.add_argument( + "--iree-hal-target-backends", + action="store", + default="rocm", + help="Specify the iree-hal target backends (e.g., rocm)", + ) + parser.add_argument( + "--attention-kernel", + type=str, + default="decomposed", + choices=["decomposed", "torch_sdpa"], + ) parser.add_argument( "--tensor-parallelism-size", type=int, @@ -376,19 +426,19 @@ def main(argv): kv_cache_type = args.kv_cache_type weight_path = cli.get_input_dataset(args) tokenizer = cli.get_tokenizer(args) - - vmfb_path = args.vmfb_path weight_path_str = str(args.irpa_file) ppl = run_perplexity( - vmfb_path=vmfb_path, weight_path=weight_path, weight_path_str=weight_path_str, tokenizer=tokenizer, torch_device=torch_device, iree_device=iree_device, + iree_hip_target=args.iree_hip_target, + iree_hal_target_backends=args.iree_hal_target_backends, kv_cache_type=kv_cache_type, tensor_parallelism_size=args.tensor_parallelism_size, + attention_kernel=args.attention_kernel, ) logger.info(f"\n{json.dumps(ppl, indent=2)}") diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py new file mode 100644 index 000000000..e8df396e9 --- /dev/null +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -0,0 +1,135 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +from pathlib import Path +import subprocess +import logging + +import iree.compiler as ireec + +logger = logging.getLogger("eval") + +logger.setLevel(logging.INFO) + +logger.root.handlers[0].setFormatter( + logging.Formatter(fmt="\n%(levelname)s:%(name)-8s %(message)s") +) + + +class ExportArtifacts: + def __init__( + self, + irpa_path: str, + batch_size: int, + iree_hip_target: str, + iree_hal_target_backends: str, + attention_kernel: str, + tensor_parallelism_size: int, + ): + self.sharktank_dir = str( + Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent + ) + self.irpa_path = irpa_path + self.batch_size = batch_size + self.iree_hip_target = iree_hip_target + self.iree_hal_target_backends = iree_hal_target_backends + self.attention_kernel = attention_kernel + self.tensor_parallelism_size = tensor_parallelism_size + + def export_to_mlir( + self, + mlir_path: str, + json_path: str, + ): + export_args = [ + "python3", + "-m", + "sharktank.examples.export_paged_llm_v1", + "--irpa-file", + str(self.irpa_path), + "--output-mlir", + mlir_path, + "--output-config", + json_path, + "--bs", + str(self.batch_size), + ] + if self.attention_kernel == "decomposed": + export_args.append("--attention-kernel") + export_args.append(self.attention_kernel) + elif self.attention_kernel == "torch_sdpa": + raise NotImplementedError("attention_kernel torch_sdpa not implemented yet") + if self.tensor_parallelism_size: + export_args.append("--tensor-parallelism-size") + export_args.append(str(self.tensor_parallelism_size)) + + cmd = subprocess.list2cmdline(export_args) + + logger.info( + f"export_args: {export_args}\n self.sharktank_dir: {self.sharktank_dir}" + ) + + cwd = self.sharktank_dir + "/sharktank" + + logger.info(f"Exporting mlir:\n" f"cd {cwd} && {cmd}") + proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd) + return_code = proc.returncode + if return_code != 0: + logger.error("Error exporting mlir: ", return_code) + + def compile_to_vmfb( + self, + mlir_path, + vmfb_path, + ): + compile_flags = ["--iree-hip-target=" + self.iree_hip_target] + + ireec.compile_file( + input_file=mlir_path, + target_backends=[self.iree_hal_target_backends], + extra_args=compile_flags, + output_file=vmfb_path, + ) + + def create_file(self, suffix, prefix): + file_path = Path(prefix).with_suffix(suffix) + f = open(file_path, "w") + return file_path + + def get_artifacts(self): + + self.dir_path = self.sharktank_dir + "/" + "tmp_perplexity_ci_artifacts/" + temp_dir = Path(self.dir_path) + temp_dir.mkdir(parents=True, exist_ok=True) + + model_name = ( + str(self.irpa_path).split("/")[-1].split(".")[0] + + "_" + + self.attention_kernel + ) + mlir_path = str( + self.create_file(suffix=".mlir", prefix=self.dir_path + model_name) + ) + json_path = str( + self.create_file(suffix=".json", prefix=self.dir_path + model_name) + ) + vmfb_path = str( + self.create_file(suffix=".vmfb", prefix=self.dir_path + model_name) + ) + + if self.attention_kernel == "decomposed": + self.export_to_mlir( + mlir_path=mlir_path, + json_path=json_path, + ) + + self.compile_to_vmfb( + mlir_path=mlir_path, + vmfb_path=vmfb_path, + ) + + return vmfb_path diff --git a/sharktank/tests/evaluate/baseline_perplexity_scores.json b/sharktank/tests/evaluate/baseline_perplexity_scores.json index 45515566e..fa353d136 100644 --- a/sharktank/tests/evaluate/baseline_perplexity_scores.json +++ b/sharktank/tests/evaluate/baseline_perplexity_scores.json @@ -209,5 
+209,216 @@ 1.915619 ], "mean_perplexity": 6.060831 + }, + "llama3_8B_f16_decomposed_vmfb": { + "perplexities": [ + 6.677369, + 21.807926, + 15.424338, + 17.332415, + 14.951956, + 7.913092, + 8.728321, + 22.425966, + 8.184698, + 20.977249, + 7.088408, + 14.574989, + 9.036912, + 7.277581, + 16.132208, + 6.685175, + 6.525683, + 7.080791, + 10.680925, + 9.034086, + 10.639015, + 41.102894, + 11.723896, + 64.305908, + 47.054577, + 19.9259, + 18.918842, + 13.842684, + 9.974381, + 5.919641, + 10.181265, + 23.609016, + 14.340417, + 9.712208, + 5.602878, + 14.088163, + 5.680599, + 17.377926, + 9.037231, + 8.305407, + 8.028031, + 17.744528, + 11.5076, + 3.936302, + 12.987297, + 10.371798, + 11.927772, + 21.387051, + 37.799526, + 25.67762, + 15.429109, + 13.923962, + 7.594806, + 10.983875, + 14.595965, + 11.022234, + 5.853358, + 15.609065, + 8.044486, + 14.389134, + 5.917565, + 6.892455, + 2.30309, + 15.974725, + 42.017342, + 8.022307, + 12.284297, + 10.018423, + 9.268936, + 10.680118, + 8.12535, + 21.550434, + 3.638689, + 15.345065, + 23.742884, + 14.288899, + 17.796623, + 16.515446, + 8.746647, + 12.922096, + 12.94269, + 13.574061, + 14.013302, + 10.76523, + 14.746032, + 28.208134, + 17.646687, + 9.848188, + 15.280471, + 15.621455, + 29.126505, + 12.302313, + 32.452534, + 31.192411, + 14.371797, + 17.490683, + 14.689407, + 15.284843, + 12.252508, + 16.460979 + ], + "mean_perplexity": 14.930181 + }, + + "llama3_405B_f16_decomposed_vmfb": { + "perplexities": [ + 2.170036, + 8.014498, + 3.743922, + 10.629776, + 8.965701, + 2.884743, + 2.886767, + 3.853816, + 2.73785, + 15.235562, + 2.65135, + 1.970936, + 5.08259, + 2.507602, + 7.571635, + 3.005182, + 1.904492, + 3.182651, + 6.249443, + 4.661795, + 12.68933, + 35.432453, + 5.50336, + 60.950359, + 18.433432, + 5.001391, + 4.814827, + 2.99482, + 2.697508, + 2.617349, + 2.359061, + 16.697233, + 2.145065, + 2.1207, + 2.496015, + 1.822896, + 4.671626, + 2.389186, + 2.701802, + 1.921128, + 2.236057, + 4.741998, + 4.946936, + 2.758695, + 2.446043, + 2.146302, + 8.72202, + 4.180647, + 11.449497, + 13.429152, + 3.72468, + 2.407385, + 3.592854, + 5.412414, + 3.189998, + 4.186216, + 1.642744, + 2.279058, + 1.855652, + 3.453852, + 1.436223, + 1.516955, + 1.716439, + 4.715765, + 21.48657, + 2.208737, + 6.420449, + 2.001433, + 2.400955, + 3.543744, + 3.054271, + 7.904545, + 1.950376, + 3.983746, + 6.28265, + 2.64157, + 5.473378, + 3.444444, + 1.926046, + 3.092915, + 3.996159, + 3.125222, + 1.718025, + 3.856093, + 3.041075, + 11.798485, + 14.881112, + 5.631516, + 4.407883, + 4.840533, + 21.351448, + 2.065821, + 6.658993, + 28.123312, + 1.673253, + 3.729975, + 5.336116, + 8.579758, + 2.979404, + 1.915619 + ], + "mean_perplexity": 6.060831 } } diff --git a/sharktank/tests/evaluate/export_artifacts_test.py b/sharktank/tests/evaluate/export_artifacts_test.py deleted file mode 100644 index 6a8ad4ce8..000000000 --- a/sharktank/tests/evaluate/export_artifacts_test.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import os -from pathlib import Path -import unittest -import pytest -import subprocess -import logging -import itertools - -import iree.compiler as ireec - -logger = logging.getLogger("eval") - -logger.setLevel(logging.INFO) - -logger.root.handlers[0].setFormatter( - logging.Formatter(fmt="\n%(levelname)s:%(name)-8s %(message)s") -) - -pytestmark = pytest.mark.usefixtures( - "get_model_artifacts", "get_iree_flags", "tensor_parallelism_size", "batch_size" -) - - -class ExportArtifacts(unittest.TestCase): - def setUp(self): - self.sharktank_dir = str( - Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent - ) - - def export_to_mlir( - self, - attention_kernel: str, - tensor_parallelism_size: int, - irpa_path: str, - mlir_path: str, - json_path: str, - batch_size: int, - ): - export_args = [ - "python3", - "-m", - "sharktank.examples.export_paged_llm_v1", - "--irpa-file", - str(irpa_path), - "--output-mlir", - mlir_path, - "--output-config", - json_path, - "--bs", - str(batch_size), - ] - if attention_kernel == "decomposed": - export_args.append("--attention-kernel") - export_args.append(attention_kernel) - elif self.attention_kernel == "torch_sdpa": - raise NotImplementedError("attention_kernel torch_sdpa not implemented yet") - if tensor_parallelism_size: - export_args.append("--tensor-parallelism-size") - export_args.append(str(tensor_parallelism_size)) - - cmd = subprocess.list2cmdline(export_args) - - logger.info( - f"export_args: {export_args}\n self.sharktank_dir: {self.sharktank_dir}" - ) - - cwd = self.sharktank_dir + "/sharktank" - - logger.info(f"Exporting mlir:\n" f"cd {cwd} && {cmd}") - proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd) - return_code = proc.returncode - if return_code != 0: - logger.error("Error exporting mlir: ", return_code) - - def compile_to_vmfb( - self, - mlir_path: str, - vmfb_path: str, - iree_hip_target: str, - iree_hal_target_backends: str, - ): - compile_flags = ["--iree-hip-target=" + iree_hip_target] - - try: - ireec.compile_file( - input_file=mlir_path, - target_backends=[iree_hal_target_backends], - extra_args=compile_flags, - output_file=vmfb_path, - ) - except Exception as error: - logger.error("Error invoking iree-compile: ", error) - - def create_file(self, suffix, prefix): - file_path = Path(prefix).with_suffix(suffix) - f = open(file_path, "w") - return file_path - - def test_export(self): - - model_paths = [ - self.llama3_8b_f16_model, - self.llama3_8b_fp8_model, - self.llama3_405b_f16_model, - self.llama3_405b_fp8_model, - ] - attention_kernels = ["decomposed", "torch_sdpa"] - - self.dir_path = self.sharktank_dir + "/" + "tmp_perplexity_ci_artifacts/" - temp_dir = Path(self.dir_path) - temp_dir.mkdir(parents=True, exist_ok=True) - - for model_path, attention_kernel in list( - itertools.product(model_paths, attention_kernels) - ): - model_name = ( - str(model_path).split("/")[-1].split(".")[0] + "_" + attention_kernel - ) - mlir_path = str( - self.create_file(suffix=".mlir", prefix=self.dir_path + model_name) - ) - json_path = str( - self.create_file(suffix=".json", prefix=self.dir_path + model_name) - ) - vmfb_path = str( - self.create_file(suffix=".vmfb", prefix=self.dir_path + model_name) - ) - - if attention_kernel == "decomposed": - self.export_to_mlir( - attention_kernel=attention_kernel, - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=model_path, - mlir_path=mlir_path, - json_path=json_path, - 
batch_size=self.batch_size, - ) - - self.compile_to_vmfb( - mlir_path=mlir_path, - vmfb_path=vmfb_path, - iree_hip_target=self.iree_hip_target, - iree_hal_target_backends=self.iree_hal_target_backends, - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_vmfb_test.py index 16bfda668..d5d9daa6d 100644 --- a/sharktank/tests/evaluate/perplexity_vmfb_test.py +++ b/sharktank/tests/evaluate/perplexity_vmfb_test.py @@ -14,7 +14,10 @@ @pytest.mark.usefixtures( - "get_model_artifacts", "tensor_parallelism_size", "baseline_perplexity_scores" + "get_model_artifacts", + "get_iree_flags", + "tensor_parallelism_size", + "baseline_perplexity_scores", ) class PerplexityTest(unittest.TestCase): def setUp(self): @@ -34,10 +37,13 @@ def test_llama3_8B_f16_decomposed(self): current_perplexity = perplexity_vmfb.main( [ - f"--vmfb-path={self.llama3_8b_f16_vmfb}", f"--irpa-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--iree-device={self.iree_device}", + f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hip-target={self.iree_hip_target}", + f"--tensor-parallelism-size=1", + f"--attention-kernel=decomposed", ] ) @@ -66,10 +72,13 @@ def test_llama3_8B_f16_non_decomposed(self): current_perplexity = perplexity_vmfb.main( [ - f"--vmfb-path={self.llama3_8b_f16_vmfb}", f"--irpa-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--iree-device={self.iree_device}", + f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hip-target={self.iree_hip_target}", + f"--tensor-parallelism-size=1", + f"--attention-kernel=torch_sdpa", ] ) @@ -98,10 +107,13 @@ def test_llama3_8B_fp8_decomposed(self): current_perplexity = perplexity_vmfb.main( [ - f"--vmfb-path={self.llama3_8b_fp8_vmfb}", f"--irpa-file={self.llama3_8b_fp8_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--iree-device={self.iree_device}", + f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hip-target={self.iree_hip_target}", + f"--tensor-parallelism-size=1", + f"--attention-kernel=decomposed", ] ) @@ -130,10 +142,13 @@ def test_llama3_8B_fp8_non_decomposed(self): current_perplexity = perplexity_vmfb.main( [ - f"--vmfb-path={self.llama3_8b_fp8_vmfb}", f"--irpa-file={self.llama3_8b_fp8_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", f"--iree-device={self.iree_device}", + f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hip-target={self.iree_hip_target}", + f"--tensor-parallelism-size=1", + f"--attention-kernel=torch_sdpa", ] ) @@ -159,11 +174,13 @@ def test_llama3_405B_f16_decomposed(self): current_perplexity = perplexity_vmfb.main( [ - f"--vmfb-path={self.llama3_405b_f16_vmfb}", f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--iree-device={self.iree_device}", + f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", + f"--attention-kernel=decomposed", ] ) @@ -192,11 +209,13 @@ def test_llama3_405B_f16_non_decomposed(self): current_perplexity = perplexity_vmfb.main( [ - f"--vmfb-path={self.llama3_405b_f16_vmfb}", f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--iree-device={self.iree_device}", + 
f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", + f"--attention-kernel=torch_sdpa", ] ) @@ -225,11 +244,13 @@ def test_llama3_405B_fp8_decomposed(self): current_perplexity = perplexity_vmfb.main( [ - f"--vmfb-path={self.llama3_405b_fp8_vmfb}", f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--iree-device={self.iree_device}", + f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hip-target={self.iree_hip_target}", f"--tensor-parallelism-size={self.tensor_parallelism_size}", + f"--attention-kernel=decomposed", ] ) @@ -258,11 +279,13 @@ def test_llama3_405B_fp8_non_decomposed(self): current_perplexity = perplexity_vmfb.main( [ - f"--vmfb-path={self.llama3_405b_fp8_vmfb}", f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", f"--iree-device={self.iree_device}", - "--tensor-parallelism-size={self.tensor_parallelism_size}", + f"--iree-hal-target-backends={self.iree_hal_target_backends}", + f"--iree-hip-target={self.iree_hip_target}", + f"--tensor-parallelism-size={self.tensor_parallelism_size}", + f"--attention-kernel=torch_sdpa", ] ) From cf6ee83bbd8512447c7c7e198720113bc288ccc2 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 25 Oct 2024 03:46:21 -0500 Subject: [PATCH 22/51] Add log messages --- sharktank/sharktank/utils/export_artifacts.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index e8df396e9..ca3adc42d 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -75,11 +75,13 @@ def export_to_mlir( cwd = self.sharktank_dir + "/sharktank" - logger.info(f"Exporting mlir:\n" f"cd {cwd} && {cmd}") + logger.debug(f"Exporting mlir:\n" f"cd {cwd} && {cmd}") proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd) return_code = proc.returncode if return_code != 0: logger.error("Error exporting mlir: ", return_code) + else: + logger.info(f"Exported to mlir successfully: {mlir_path}!") def compile_to_vmfb( self, @@ -88,12 +90,17 @@ def compile_to_vmfb( ): compile_flags = ["--iree-hip-target=" + self.iree_hip_target] - ireec.compile_file( - input_file=mlir_path, - target_backends=[self.iree_hal_target_backends], - extra_args=compile_flags, - output_file=vmfb_path, - ) + try: + ireec.compile_file( + input_file=mlir_path, + target_backends=[self.iree_hal_target_backends], + extra_args=compile_flags, + output_file=vmfb_path, + ) + except Exception as error: + logger.error("Error running iree-compile: ", error) + + logger.info(f"Compiled to vmfb successfully: {vmfb_path}") def create_file(self, suffix, prefix): file_path = Path(prefix).with_suffix(suffix) From 9dbc07a372b732b84ff656039ca51e62d6cb6255 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 25 Oct 2024 03:50:12 -0500 Subject: [PATCH 23/51] Add log messages --- sharktank/sharktank/utils/export_artifacts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index ca3adc42d..f466c6449 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -81,7 +81,7 @@ def export_to_mlir( if return_code != 0: logger.error("Error 
exporting mlir: ", return_code)
         else:
-            logger.info(f"Exported to mlir successfully: {mlir_path}!")
+            logger.info(f"Exported to mlir successfully: {mlir_path}")
 
     def compile_to_vmfb(
         self,

From f5c4fef1e3c3379cd93cce4e58cb00b846b01f16 Mon Sep 17 00:00:00 2001
From: archana-ramalingam
Date: Fri, 25 Oct 2024 04:22:02 -0500
Subject: [PATCH 24/51] Update vmfb runner module name dynamically

---
 sharktank/sharktank/evaluate/perplexity_vmfb.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_vmfb.py
index 92313b32d..f1900e6e3 100644
--- a/sharktank/sharktank/evaluate/perplexity_vmfb.py
+++ b/sharktank/sharktank/evaluate/perplexity_vmfb.py
@@ -176,7 +176,7 @@ def get_prompts(self):
             s.replace("\n", "").rstrip()
             for s in test_prompts
             if s != "" and len(s.split()) >= 20 and s.count("=") < 2
-        ][0:4]
+        ]
 
         logger.info(f" num_test_prompts: {len(test_prompts)}")
 
@@ -208,7 +208,7 @@ def prefill_vmfb(self, token_batch, i):
         )
 
         seq_block_ids = self.batch.pad_block_ids()
-        prefill_logits = self.runner.ctx.modules.module.prefill_bs4(
+        prefill_logits = self.runner.ctx.modules.module[f"prefill_bs{self.bs}"](
             token_batch,
             self.seq_lens_batch,
             seq_block_ids,
@@ -239,7 +239,7 @@ def decode_vmfb(self, token_batch, i):
         self.batch.allocate_seq_block_ids()
         seq_block_ids = self.batch.pad_block_ids()
 
-        decode_logits = self.runner.ctx.modules.module.decode_bs4(
+        decode_logits = self.runner.ctx.modules.module[f"decode_bs{self.bs}"](
             token_batch,
             self.seq_lens_batch,
             start_positions,

From 3a9105142b7701c25beb7b73d02f8ad5bb085103 Mon Sep 17 00:00:00 2001
From: archana-ramalingam
Date: Fri, 25 Oct 2024 11:16:46 -0500
Subject: [PATCH 25/51] Update llama3_8B_f16_decomposed_vmfb perplexities

---
 .../evaluate/baseline_perplexity_scores.json  | 202 +++++++++---------
 1 file changed, 101 insertions(+), 101 deletions(-)

diff --git a/sharktank/tests/evaluate/baseline_perplexity_scores.json b/sharktank/tests/evaluate/baseline_perplexity_scores.json
index fa353d136..b613809ed 100644
--- a/sharktank/tests/evaluate/baseline_perplexity_scores.json
+++ b/sharktank/tests/evaluate/baseline_perplexity_scores.json
@@ -212,108 +212,108 @@
   },
   "llama3_8B_f16_decomposed_vmfb": {
     "perplexities": [
-
21394.824219, + 21544.3125, + 14821.359375, + 16374.799805, + 8942.28125, + 9946.700195, + 16440.865234, + 10721.15332, + 9675.765625, + 14437.389648, + 27061.357422, + 8576.095703, + 22894.248047, + 8205.601562, + 4902.503906, + 14098.294922, + 11953.639648, + 9046.456055, + 7345.877441, + 14587.374023, + 20542.126953, + 14990.035156, + 15217.208984, + 22458.199219, + 17894.568359, + 11072.371094, + 11668.830078, + 11384.431641, + 7894.328125, + 7638.759277, + 10262.393555, + 16722.433594, + 5746.149902, + 7049.083984, + 7314.810547, + 7159.469238, + 8198.553711, + 5917.909668, + 12120.987305, + 13357.332031, + 6877.470215, + 7771.493164, + 13632.90625, + 7473.57959, + 8513.025391, + 5848.255371, + 21835.617188, + 13271.357422, + 45267.539062, + 13817.6875, + 14733.533203, + 14010.263672, + 27900.892578, + 8016.948242, + 6842.378418, + 10149.141602, + 7411.538574, + 17125.933594, + 4876.651855, + 8817.567383, + 13022.648438, + 10516.925781, + 6493.474609, + 6885.805176, + 13201.474609, + 9690.910156, + 2992.262695, + 12565.056641, + 13803.712891, + 12151.033203, + 10440.636719, + 16468.451172, + 13720.111328, + 9114.548828, + 14827.0, + 11495.735352, + 6366.992676, + 10188.37793, + 5519.487305, + 10712.731445, + 4132.742188, + 12887.806641, + 6262.628906, + 17117.361328, + 10427.929688, + 42412.0, + 21811.390625, + 6171.995605, + 17588.886719, + 6537.535156, + 8773.981445, + 14319.901367, + 35847.394531, + 10555.681641, + 5562.47998, + 8986.163086, + 6192.861328, + 13730.34668, + 10742.932617, + 12502.827148 ], - "mean_perplexity": 14.930181 + "mean_perplexity": 12545.168862 }, "llama3_405B_f16_decomposed_vmfb": { From 006c5d4a2e2d78f3264044bb00561cb555e5e6a9 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 25 Oct 2024 15:36:33 -0500 Subject: [PATCH 26/51] Move CI to mi300x-3 --- .github/workflows/ci_eval.yaml | 2 +- sharktank/conftest.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 27b6c94e6..5ac553052 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -22,7 +22,7 @@ jobs: strategy: matrix: version: [3.11] - runs-on: [llama-mi300] + runs-on: [llama-mi300x-3] fail-fast: false runs-on: ${{matrix.runs-on}} defaults: diff --git a/sharktank/conftest.py b/sharktank/conftest.py index 026424693..f9ba99b2f 100644 --- a/sharktank/conftest.py +++ b/sharktank/conftest.py @@ -76,7 +76,7 @@ def pytest_addoption(parser): "--llama3-8b-tokenizer-path", type=Path, action="store", - default="/data/extra/models/llama3.1_8B/tokenizer_config.json", + default="/data/llama-3.1/8b/tokenizer_config.json", help="Llama3.1 8b tokenizer path, defaults to 30F CI system path", ) @@ -92,7 +92,7 @@ def pytest_addoption(parser): "--llama3-8b-f16-model-path", type=Path, action="store", - default="/data/extra/models/llama3.1_8B/llama8b_f16.irpa", + default="/data/llama-3.1/8b/llama8b_f16.irpa", help="Llama3.1 8b model path, defaults to 30F CI system path", ) @@ -132,7 +132,7 @@ def pytest_addoption(parser): "--llama3-405b-tokenizer-path", type=Path, action="store", - default="/data/extra/models/llama3.1_405B/tokenizer_config.json", + default="/data/llama-3.1/405b/tokenizer_config.json", help="Llama3.1 405b tokenizer path, defaults to 30F CI system path", ) @@ -148,7 +148,7 @@ def pytest_addoption(parser): "--llama3-405b-f16-model-path", type=Path, action="store", - default="/data/extra/models/llama3.1_405B/llama405b_fp16.irpa", + default="/data/llama-3.1/405b/llama405b_fp16.irpa", 
help="Llama3.1 405b model path, defaults to 30F CI system path", ) From 7fe9594f9e0ef8eda2c34e47c43f5a71e4df43d0 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 25 Oct 2024 19:03:02 -0500 Subject: [PATCH 27/51] Address review comments --- .github/workflows/ci_eval.yaml | 4 ++-- sharktank/conftest.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 5ac553052..e20cbe050 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -60,7 +60,7 @@ jobs: pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - name: Run perplexity test with vmfb - run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun # test_perplexity_torch: # timeout-minutes: 1000 @@ -106,4 +106,4 @@ jobs: # pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ # - name: Run perplexity test in eager mode - # run: pytest -n 4 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun + # run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun diff --git a/sharktank/conftest.py b/sharktank/conftest.py index f9ba99b2f..b7415b427 100644 --- a/sharktank/conftest.py +++ b/sharktank/conftest.py @@ -72,6 +72,7 @@ def pytest_addoption(parser): help="Enable long and slow tests", ) + # TODO: Remove all hardcoded paths in CI tests parser.addoption( "--llama3-8b-tokenizer-path", type=Path, From 03baccbd5d87b28ad4d39fb981227727a68e3836 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 25 Oct 2024 19:04:11 -0500 Subject: [PATCH 28/51] Revert debug to info logging --- sharktank/sharktank/utils/export_artifacts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index f466c6449..6e603ae1c 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -75,7 +75,7 @@ def export_to_mlir( cwd = self.sharktank_dir + "/sharktank" - logger.debug(f"Exporting mlir:\n" f"cd {cwd} && {cmd}") + logger.info(f"Exporting mlir:\n" f"cd {cwd} && {cmd}") proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd) return_code = proc.returncode if return_code != 0: From 52a6fc1999d04225bcd2771553a950e29a47ff1c Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 25 Oct 2024 19:14:48 -0500 Subject: [PATCH 29/51] Test --- .github/workflows/ci_eval.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index e20cbe050..492628dcb 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -58,7 +58,10 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - + - name: test1 + run: cd /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/llama-3.1/8b/llama8b_f16.irpa --output-mlir 
/home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.mlir --output-config /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.json --bs 100 --attention-kernel decomposed --tensor-parallelism-size 1 + - name: test2 + run: cd /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/llama-3.1/8b/llama8b_f16.irpa --output-mlir /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.mlir --output-config /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.json --bs 100 --attention-kernel decomposed --tensor-parallelism-size 1 - name: Run perplexity test with vmfb run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun From d1ed9a2e294ebd74605e69c8693b272bbb081393 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 25 Oct 2024 19:33:04 -0500 Subject: [PATCH 30/51] Update export mlir to remove tensor_parallelism_size arg --- .github/workflows/ci_eval.yaml | 4 +--- sharktank/sharktank/utils/export_artifacts.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 492628dcb..994cbdbe1 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -59,9 +59,7 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - name: test1 - run: cd /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/llama-3.1/8b/llama8b_f16.irpa --output-mlir /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.mlir --output-config /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.json --bs 100 --attention-kernel decomposed --tensor-parallelism-size 1 - - name: test2 - run: cd /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/llama-3.1/8b/llama8b_f16.irpa --output-mlir /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.mlir --output-config /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.json --bs 100 --attention-kernel decomposed --tensor-parallelism-size 1 + run: cd /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/llama-3.1/8b/llama8b_f16.irpa --output-mlir /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.mlir --output-config /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.json --bs 100 --attention-kernel decomposed - name: Run perplexity test with vmfb run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun diff --git a/sharktank/sharktank/utils/export_artifacts.py 
b/sharktank/sharktank/utils/export_artifacts.py index 6e603ae1c..7d54c1e86 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -63,9 +63,9 @@ def export_to_mlir( export_args.append(self.attention_kernel) elif self.attention_kernel == "torch_sdpa": raise NotImplementedError("attention_kernel torch_sdpa not implemented yet") - if self.tensor_parallelism_size: - export_args.append("--tensor-parallelism-size") - export_args.append(str(self.tensor_parallelism_size)) + # if self.tensor_parallelism_size: + # export_args.append("--tensor-parallelism-size") + # export_args.append(str(self.tensor_parallelism_size)) cmd = subprocess.list2cmdline(export_args) From 1876f54b6c5d88781790158700bf46a915fbbe80 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 25 Oct 2024 19:38:02 -0500 Subject: [PATCH 31/51] Make non_decomposed version the default --- .../tests/evaluate/perplexity_torch_test.py | 16 ++++++++-------- sharktank/tests/evaluate/perplexity_vmfb_test.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sharktank/tests/evaluate/perplexity_torch_test.py b/sharktank/tests/evaluate/perplexity_torch_test.py index 54af77a9e..b2b098ae7 100644 --- a/sharktank/tests/evaluate/perplexity_torch_test.py +++ b/sharktank/tests/evaluate/perplexity_torch_test.py @@ -55,11 +55,11 @@ def test_llama3_8B_f16_decomposed(self): reason="Non-decomposed attention is not supported yet", ) @longrun - def test_llama3_8B_f16_non_decomposed(self): + def test_llama3_8B_f16(self): # Llama 3.1 8B non-decomposed - model_name = "llama3_8B_f16_non_decomposed" + model_name = "llama3_8B_f16" baseline_perplexity = self.baseline_perplexity[model_name] current_perplexity = perplexity_torch.main( @@ -116,11 +116,11 @@ def test_llama3_8B_fp8_decomposed(self): reason="Non-decomposed attention is not supported yet", ) @longrun - def test_llama3_8B_fp8_non_decomposed(self): + def test_llama3_8B_fp8(self): # Llama 3.1 8B non-decomposed - model_name = "llama3_8B_fp8_non_decomposed" + model_name = "llama3_8B_fp8" baseline_perplexity = self.baseline_perplexity[model_name] current_perplexity = perplexity_torch.main( @@ -175,11 +175,11 @@ def test_llama3_405B_f16_decomposed(self): reason="Non-decomposed attention is not supported yet", ) @longrun - def test_llama3_405B_f16_non_decomposed(self): + def test_llama3_405B_f16(self): # Llama 3.1 405B non-decomposed - model_name = "llama3_405B_f16_non_decomposed" + model_name = "llama3_405B_f16" baseline_perplexity = self.baseline_perplexity[model_name] current_perplexity = perplexity_torch.main( @@ -238,11 +238,11 @@ def test_llama3_405B_fp8_decomposed(self): reason="Non-decomposed attention is not supported yet", ) @longrun - def test_llama3_405B_fp8_non_decomposed(self): + def test_llama3_405B_fp8(self): # Llama 3.1 405B non-decomposed - model_name = "llama3_405B_fp8_non_decomposed" + model_name = "llama3_405B_fp8" baseline_perplexity = self.baseline_perplexity[model_name] current_perplexity = perplexity_torch.main( diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_vmfb_test.py index d5d9daa6d..6ad187d21 100644 --- a/sharktank/tests/evaluate/perplexity_vmfb_test.py +++ b/sharktank/tests/evaluate/perplexity_vmfb_test.py @@ -63,11 +63,11 @@ def test_llama3_8B_f16_decomposed(self): reason="Non-decomposed attention is not supported yet", ) @longrun - def test_llama3_8B_f16_non_decomposed(self): + def test_llama3_8B_f16(self): # Llama 3.1 8B 
non-decomposed - model_name = "llama3_8B_f16_non_decomposed_vmfb" + model_name = "llama3_8B_f16_vmfb" baseline_perplexity = self.baseline_perplexity[model_name] current_perplexity = perplexity_vmfb.main( @@ -133,11 +133,11 @@ def test_llama3_8B_fp8_decomposed(self): reason="Non-decomposed attention is not supported yet", ) @longrun - def test_llama3_8B_fp8_non_decomposed(self): + def test_llama3_8B_fp8(self): # Llama 3.1 8B non-decomposed - model_name = "llama3_8B_fp8_non_decomposed_vmfb" + model_name = "llama3_8B_fp8_vmfb" baseline_perplexity = self.baseline_perplexity[model_name] current_perplexity = perplexity_vmfb.main( @@ -200,11 +200,11 @@ def test_llama3_405B_f16_decomposed(self): reason="Non-decomposed attention is not supported yet", ) @longrun - def test_llama3_405B_f16_non_decomposed(self): + def test_llama3_405B_f16(self): # Llama 3.1 405B non-decomposed - model_name = "llama3_405B_f16_non_decomposed_vmfb" + model_name = "llama3_405B_f16_vmfb" baseline_perplexity = self.baseline_perplexity[model_name] current_perplexity = perplexity_vmfb.main( @@ -270,11 +270,11 @@ def test_llama3_405B_fp8_decomposed(self): reason="Non-decomposed attention is not supported yet", ) @longrun - def test_llama3_405B_fp8_non_decomposed(self): + def test_llama3_405B_fp8(self): # Llama 3.1 405B non-decomposed - model_name = "llama3_405B_fp8_non_decomposed_vmfb" + model_name = "llama3_405B_fp8_vmfb" baseline_perplexity = self.baseline_perplexity[model_name] current_perplexity = perplexity_vmfb.main( From 563f72e94d4714097773b2d4b4986277147f91ec Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Sat, 26 Oct 2024 00:31:32 -0500 Subject: [PATCH 32/51] Fix export cmd string parsing issues --- .../sharktank/evaluate/perplexity_vmfb.py | 20 ++++--- sharktank/sharktank/utils/export_artifacts.py | 55 ++++++++++++------- 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_vmfb.py index f1900e6e3..fedf7c1c9 100644 --- a/sharktank/sharktank/evaluate/perplexity_vmfb.py +++ b/sharktank/sharktank/evaluate/perplexity_vmfb.py @@ -91,7 +91,9 @@ def wrapper(*args, **kwargs): func_name = func.__name__ if func_name == "get_perplexity": - func_name = "Total time" + func_name = f"Total time to calculate perplexity" + elif func_name == "compile_model": + func_name = f"Total time to export and compile" logger.info(f" {func_name}: {time_taken}") return result @@ -115,8 +117,12 @@ def print_token_comparison(self, i): @timeit def compile_model(self, weight_path_str): + self.weight_path_str = weight_path_str + + logger.info(f"Compiling: {self.weight_path_str}") + export_artifacts = ExportArtifacts( - irpa_path=weight_path_str, + irpa_path=self.weight_path_str, batch_size=self.bs, iree_hip_target=self.iree_hip_target, iree_hal_target_backends=self.iree_hal_target_backends, @@ -127,7 +133,7 @@ def compile_model(self, weight_path_str): return vmfb_path @timeit - def load_model(self, weight_path, tokenizer, vmfb_path, weight_path_str): + def load_model(self, weight_path, tokenizer, vmfb_path): config = LlamaModelConfig( hp=configs.LlamaHParams.from_gguf_props(weight_path.properties), @@ -154,11 +160,10 @@ def load_model(self, weight_path, tokenizer, vmfb_path, weight_path_str): self.generator = TorchGenerator(model, tokenizer) - self.weight_path_str = weight_path_str self.runner = vmfbRunner( device=self.iree_device, vmfb_path=vmfb_path, - external_weight_path=weight_path_str, + external_weight_path=self.weight_path_str, ) 
@timeit @@ -178,8 +183,6 @@ def get_prompts(self): if s != "" and len(s.split()) >= 20 and s.count("=") < 2 ] - logger.info(f" num_test_prompts: {len(test_prompts)}") - self.bs = len(test_prompts) return test_prompts @@ -379,9 +382,10 @@ def run_perplexity( ) test_prompts = perplexity.get_prompts() + logger.info(f" Total test prompts: {len(test_prompts)}") vmfb_path = perplexity.compile_model(weight_path_str) - perplexity.load_model(weight_path, tokenizer, vmfb_path, weight_path_str) + perplexity.load_model(weight_path, tokenizer, vmfb_path) ppl = perplexity.get_perplexity(test_prompts) return ppl diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 7d54c1e86..5d9c6d205 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -5,9 +5,11 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import os -from pathlib import Path import subprocess import logging +import time +from pathlib import Path +from datetime import timedelta import iree.compiler as ireec @@ -40,6 +42,24 @@ def __init__( self.attention_kernel = attention_kernel self.tensor_parallelism_size = tensor_parallelism_size + def timeit(func): + def wrapper(*args, **kwargs): + start = time.time() + result = func(*args, **kwargs) + end = time.time() + seconds = end - start + time_taken = abs(timedelta(seconds=round(seconds))) + + if seconds < 1: + time_taken = f" {seconds * 1000} ms" + + func_name = func.__name__ + logger.info(f" {func_name}: {time_taken}") + return result + + return wrapper + + @timeit def export_to_mlir( self, mlir_path: str, @@ -50,7 +70,7 @@ def export_to_mlir( "-m", "sharktank.examples.export_paged_llm_v1", "--irpa-file", - str(self.irpa_path), + self.irpa_path, "--output-mlir", mlir_path, "--output-config", @@ -63,31 +83,28 @@ def export_to_mlir( export_args.append(self.attention_kernel) elif self.attention_kernel == "torch_sdpa": raise NotImplementedError("attention_kernel torch_sdpa not implemented yet") - # if self.tensor_parallelism_size: - # export_args.append("--tensor-parallelism-size") - # export_args.append(str(self.tensor_parallelism_size)) + cwd = self.sharktank_dir cmd = subprocess.list2cmdline(export_args) - logger.info( - f"export_args: {export_args}\n self.sharktank_dir: {self.sharktank_dir}" - ) - - cwd = self.sharktank_dir + "/sharktank" - logger.info(f"Exporting mlir:\n" f"cd {cwd} && {cmd}") - proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd) - return_code = proc.returncode - if return_code != 0: - logger.error("Error exporting mlir: ", return_code) + + proc = subprocess.run(export_args, capture_output=True, cwd=cwd, text=True) + if proc.returncode != 0: + logger.error( + f"Error exporting mlir with export_paged_llm_v1.py\n" + f"{proc.stdout+proc.stderr}" + ) else: - logger.info(f"Exported to mlir successfully: {mlir_path}") + logger.info(f"Exported to mlir successfully:\n" f"{proc.stdout}") + @timeit def compile_to_vmfb( self, mlir_path, vmfb_path, ): + # TODO: Control flag to enable multiple backends compile_flags = ["--iree-hip-target=" + self.iree_hip_target] try: @@ -98,9 +115,9 @@ def compile_to_vmfb( output_file=vmfb_path, ) except Exception as error: - logger.error("Error running iree-compile: ", error) - - logger.info(f"Compiled to vmfb successfully: {vmfb_path}") + logger.error(f"Error running iree-compile:\n" f"{error}") + else: + logger.info(f"Compiled to vmfb successfully:\n" f"{vmfb_path}") def create_file(self, suffix, prefix): 
file_path = Path(prefix).with_suffix(suffix) From 4607fb2f706234f01852c76d66b917ea75c53eed Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Sat, 26 Oct 2024 00:35:57 -0500 Subject: [PATCH 33/51] Upgrade to latest iree to resolve dynamo error --- .github/workflows/ci_eval.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 994cbdbe1..ebda57661 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -58,6 +58,14 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ + + # Try with the latest nightly releases, not what iree-turbine pins. + # We could also pin to a known working or stable version. + # This should eventually stabilize. Do the best we can for now. + pip install -f https://iree.dev/pip-release-links.html --upgrade \ + iree-compiler \ + iree-runtime \ + "numpy<2.0" - name: test1 run: cd /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/llama-3.1/8b/llama8b_f16.irpa --output-mlir /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.mlir --output-config /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.json --bs 100 --attention-kernel decomposed - name: Run perplexity test with vmfb From 19e29d9bdbbd9bd1c34abf88fe650de31b37111d Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 03:04:35 -0500 Subject: [PATCH 34/51] Add error handling if mlir export fails --- sharktank/sharktank/utils/export_artifacts.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 5d9c6d205..b7e7bb2d4 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -98,6 +98,8 @@ def export_to_mlir( else: logger.info(f"Exported to mlir successfully:\n" f"{proc.stdout}") + return proc.returncode + @timeit def compile_to_vmfb( self, @@ -146,14 +148,15 @@ def get_artifacts(self): ) if self.attention_kernel == "decomposed": - self.export_to_mlir( + returncode = self.export_to_mlir( mlir_path=mlir_path, json_path=json_path, ) - self.compile_to_vmfb( - mlir_path=mlir_path, - vmfb_path=vmfb_path, - ) + if returncode == 0: + self.compile_to_vmfb( + mlir_path=mlir_path, + vmfb_path=vmfb_path, + ) return vmfb_path From 493feeb15028e476ecd2a820e10b43bf4dc02c9f Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 03:04:52 -0500 Subject: [PATCH 35/51] Update perplexity scores --- .../evaluate/baseline_perplexity_scores.json | 202 +++++++++--------- 1 file changed, 101 insertions(+), 101 deletions(-) diff --git a/sharktank/tests/evaluate/baseline_perplexity_scores.json b/sharktank/tests/evaluate/baseline_perplexity_scores.json index b613809ed..e824be52c 100644 --- a/sharktank/tests/evaluate/baseline_perplexity_scores.json +++ b/sharktank/tests/evaluate/baseline_perplexity_scores.json @@ -212,108 +212,108 @@ }, "llama3_8B_f16_decomposed_vmfb": { "perplexities": [ - 21394.824219, - 21544.3125, - 14821.359375, - 16374.799805, - 8942.28125, - 9946.700195, - 16440.865234, - 
10721.15332, - 9675.765625, - 14437.389648, - 27061.357422, - 8576.095703, - 22894.248047, - 8205.601562, - 4902.503906, - 14098.294922, - 11953.639648, - 9046.456055, - 7345.877441, - 14587.374023, - 20542.126953, - 14990.035156, - 15217.208984, - 22458.199219, - 17894.568359, - 11072.371094, - 11668.830078, - 11384.431641, - 7894.328125, - 7638.759277, - 10262.393555, - 16722.433594, - 5746.149902, - 7049.083984, - 7314.810547, - 7159.469238, - 8198.553711, - 5917.909668, - 12120.987305, - 13357.332031, - 6877.470215, - 7771.493164, - 13632.90625, - 7473.57959, - 8513.025391, - 5848.255371, - 21835.617188, - 13271.357422, - 45267.539062, - 13817.6875, - 14733.533203, - 14010.263672, - 27900.892578, - 8016.948242, - 6842.378418, - 10149.141602, - 7411.538574, - 17125.933594, - 4876.651855, - 8817.567383, - 13022.648438, - 10516.925781, - 6493.474609, - 6885.805176, - 13201.474609, - 9690.910156, - 2992.262695, - 12565.056641, - 13803.712891, - 12151.033203, - 10440.636719, - 16468.451172, - 13720.111328, - 9114.548828, - 14827.0, - 11495.735352, - 6366.992676, - 10188.37793, - 5519.487305, - 10712.731445, - 4132.742188, - 12887.806641, - 6262.628906, - 17117.361328, - 10427.929688, - 42412.0, - 21811.390625, - 6171.995605, - 17588.886719, - 6537.535156, - 8773.981445, - 14319.901367, - 35847.394531, - 10555.681641, - 5562.47998, - 8986.163086, - 6192.861328, - 13730.34668, - 10742.932617, - 12502.827148 + 21419.466797, + 21546.818359, + 14827.014648, + 16375.65918, + 8945.300781, + 9944.508789, + 16438.810547, + 10728.957031, + 9669.796875, + 14450.475586, + 27094.927734, + 8578.132812, + 22942.267578, + 8198.905273, + 4902.405762, + 14073.242188, + 11952.408203, + 9045.265625, + 7347.615234, + 14579.709961, + 20511.626953, + 15005.15332, + 15205.226562, + 22462.205078, + 17937.900391, + 11057.017578, + 11663.111328, + 11390.241211, + 7898.138672, + 7637.557129, + 10265.848633, + 16729.228516, + 5744.851074, + 7046.032227, + 7316.122559, + 7153.626953, + 8192.285156, + 5918.197266, + 12119.681641, + 13367.679688, + 6873.890137, + 7742.501953, + 13619.378906, + 7469.197754, + 8517.003906, + 5852.495605, + 21839.90625, + 13266.838867, + 45137.652344, + 13815.619141, + 14725.118164, + 14006.322266, + 27869.220703, + 8008.710449, + 6843.859863, + 10156.393555, + 7417.569824, + 17133.203125, + 4873.34668, + 8810.631836, + 13012.022461, + 10515.050781, + 6490.756348, + 6884.498535, + 13199.611328, + 9676.604492, + 2992.313965, + 12557.617188, + 13808.018555, + 12141.337891, + 10426.229492, + 16427.511719, + 13736.017578, + 9114.052734, + 14844.96875, + 11502.46875, + 6369.100098, + 10188.533203, + 5520.150391, + 10693.388672, + 4136.566895, + 12878.518555, + 6268.281738, + 17126.113281, + 10425.692383, + 42463.15625, + 21795.568359, + 6170.659668, + 17573.275391, + 6537.691406, + 8774.048828, + 14328.767578, + 35863.398438, + 10549.089844, + 5560.846191, + 8987.045898, + 6189.242188, + 13732.914062, + 10735.333984, + 12495.99707 ], - "mean_perplexity": 12545.168862 + "mean_perplexity": 12543.547432 }, "llama3_405B_f16_decomposed_vmfb": { From b65c8825d3e0a2fa43db350a560e42c973f42eaf Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 12:56:30 -0500 Subject: [PATCH 36/51] test benchmark export --- .github/workflows/ci_eval.yaml | 3 +++ sharktank/tests/models/llama/benchmark_amdgpu_tests.py | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index ebda57661..d70af3004 100644 --- 
a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -66,6 +66,9 @@ jobs: iree-compiler \ iree-runtime \ "numpy<2.0" + + - name: test benchmark + run: pytest sharktank/tests/models/llama/benchmark_amdgpu_tests.py -v -s --longrun -k 'testBenchmark8B_f16_Decomposed' - name: test1 run: cd /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/llama-3.1/8b/llama8b_f16.irpa --output-mlir /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.mlir --output-config /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.json --bs 100 --attention-kernel decomposed - name: Run perplexity test with vmfb diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_tests.py b/sharktank/tests/models/llama/benchmark_amdgpu_tests.py index 174fcbe87..0d234466e 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_tests.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_tests.py @@ -137,9 +137,9 @@ def get_export_cmd( raise NotImplementedError( "attention_kernel torch_sdpa not yet plumbed through" ) - if tensor_parallelism_size: - export_args.append("--tensor-parallelism-size") - export_args.append(str(tensor_parallelism_size)) + # if tensor_parallelism_size: + # export_args.append("--tensor-parallelism-size") + # export_args.append(str(tensor_parallelism_size)) cmd = subprocess.list2cmdline(export_args) return cmd From ea311e8c1507b8abd163ba80018247c875b57283 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 13:01:54 -0500 Subject: [PATCH 37/51] test benchmark export --- .github/workflows/ci_eval.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index d70af3004..cff813f1f 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -67,6 +67,8 @@ jobs: iree-runtime \ "numpy<2.0" + - name: Fetch reqs + run: pip list - name: test benchmark run: pytest sharktank/tests/models/llama/benchmark_amdgpu_tests.py -v -s --longrun -k 'testBenchmark8B_f16_Decomposed' - name: test1 From b2206887d1d78cc68e7b43127b7dd35763f7ee87 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 13:41:06 -0500 Subject: [PATCH 38/51] Remove export tests --- .github/workflows/ci_eval.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index cff813f1f..37a24eda2 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -66,13 +66,6 @@ jobs: iree-compiler \ iree-runtime \ "numpy<2.0" - - - name: Fetch reqs - run: pip list - - name: test benchmark - run: pytest sharktank/tests/models/llama/benchmark_amdgpu_tests.py -v -s --longrun -k 'testBenchmark8B_f16_Decomposed' - - name: test1 - run: cd /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/sharktank && python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file /data/llama-3.1/8b/llama8b_f16.irpa --output-mlir /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.mlir --output-config /home/nod/actions-runner-llama/_work/SHARK-Platform/SHARK-Platform/tmp_perplexity_ci_artifacts/llama8b_f16_decomposed.json --bs 100 --attention-kernel decomposed - name: Run perplexity test with vmfb run: pytest -n 8 -v -s 
sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun From 09796b71f0578f24edac19161c5d3b1f2fe2eebf Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 14:47:00 -0500 Subject: [PATCH 39/51] Remove hardcoded paths --- .github/workflows/ci_eval.yaml | 12 +++++- sharktank/conftest.py | 69 ---------------------------------- 2 files changed, 10 insertions(+), 71 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 1b1d153aa..db056b6cd 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -67,7 +67,12 @@ jobs: iree-runtime \ "numpy<2.0" - name: Run perplexity test with vmfb - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --iree-device='hip://7' --longrun + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py \ + --longrun \ + --iree-device='hip://7' \ + --iree-hip-target='gfx942' \ + --llama3-8b-f16-model-path=/data/extra/models/llama3.1_8B/llama8b_f16.irpa \ + --llama3-8b-tokenizer-path=/data/extra/models/llama3.1_8B/tokenizer_config.json # test_perplexity_torch: # timeout-minutes: 1000 @@ -113,4 +118,7 @@ jobs: # pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ # - name: Run perplexity test in eager mode - # run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun + # run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py \ + # --longrun \ + # --llama3-8b-f16-model-path=/data/extra/models/llama3.1_8B/llama8b_f16.irpa \ + # --llama3-8b-tokenizer-path=/data/extra/models/llama3.1_8B/tokenizer_config.json diff --git a/sharktank/conftest.py b/sharktank/conftest.py index b7415b427..2076c39eb 100644 --- a/sharktank/conftest.py +++ b/sharktank/conftest.py @@ -77,23 +77,13 @@ def pytest_addoption(parser): "--llama3-8b-tokenizer-path", type=Path, action="store", - default="/data/llama-3.1/8b/tokenizer_config.json", help="Llama3.1 8b tokenizer path, defaults to 30F CI system path", ) - parser.addoption( - "--llama3-8b-json-path", - type=Path, - action="store", - default="/data/extra/models/llama3.1_8B/llama8b_test.json", - help="Llama3.1 8b fp8 parameters json path", - ) - parser.addoption( "--llama3-8b-f16-model-path", type=Path, action="store", - default="/data/llama-3.1/8b/llama8b_f16.irpa", help="Llama3.1 8b model path, defaults to 30F CI system path", ) @@ -105,51 +95,17 @@ def pytest_addoption(parser): help="Llama3.1 8b fp8 model path", ) - parser.addoption( - "--llama3-8b-f16-mlir-path", - type=Path, - action="store", - default="/data/extra/models/llama3.1_8B/llama8b_f16_test.mlir", - help="Llama3.1 8b mlir path, defaults to 30F CI system path", - ) - - parser.addoption( - "--llama3-8b-fp8-mlir-path", - type=Path, - action="store", - default=None, - help="Llama3.1 8b fp8 mlir path", - ) - - parser.addoption( - "--llama3-8b-f16-vmfb-path", - type=Path, - action="store", - default="/data/extra/models/llama3.1_8B/llama8b_f16.vmfb", - help="Llama3.1 8b fp16 vmfb path, defaults to 30F CI system path", - ) - parser.addoption( "--llama3-405b-tokenizer-path", type=Path, action="store", - default="/data/llama-3.1/405b/tokenizer_config.json", help="Llama3.1 405b tokenizer path, defaults to 30F CI system path", ) - parser.addoption( - "--llama3-405b-json-path", - type=Path, - action="store", - default="/data/extra/models/llama3.1_405B/llama405b_test.json", - help="Llama3.1 405b fp8 parameters json path", - ) - parser.addoption( 
"--llama3-405b-f16-model-path", type=Path, action="store", - default="/data/llama-3.1/405b/llama405b_fp16.irpa", help="Llama3.1 405b model path, defaults to 30F CI system path", ) @@ -161,30 +117,6 @@ def pytest_addoption(parser): help="Llama3.1 405b fp8 model path", ) - parser.addoption( - "--llama3-405b-f16-mlir-path", - type=Path, - action="store", - default="/data/extra/models/llama3.1_405B/llama405b_fp16_test.mlir", - help="Llama3.1 405b mlir path, defaults to 30F CI system path", - ) - - parser.addoption( - "--llama3-405b-fp8-mlir-path", - type=Path, - action="store", - default=None, - help="Llama3.1 405b fp8 mlir path", - ) - - parser.addoption( - "--llama3-405b-f16-vmfb-path", - type=Path, - action="store", - default="/data/extra/models/llama3.1_405B/llama405b_fp16.vmfb", - help="Llama3.1 405b fp16 vmfb path, defaults to 30F CI system path", - ) - parser.addoption( "--baseline-perplexity-scores", type=Path, @@ -203,7 +135,6 @@ def pytest_addoption(parser): parser.addoption( "--iree-hip-target", action="store", - default="gfx942", help="Specify the iree-hip target version (e.g., gfx942)", ) From 8069f24cbd39f86c45462449143949b269ce05c1 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 14:47:43 -0500 Subject: [PATCH 40/51] Xfail 405b as sharding vmfb is unsupported --- sharktank/tests/evaluate/perplexity_vmfb_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_vmfb_test.py index 6ad187d21..3a6950ad6 100644 --- a/sharktank/tests/evaluate/perplexity_vmfb_test.py +++ b/sharktank/tests/evaluate/perplexity_vmfb_test.py @@ -164,6 +164,9 @@ def test_llama3_8B_fp8(self): msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) + @pytest.mark.xfail( + reason="Sharding is not supported yet", + ) @longrun def test_llama3_405B_f16_decomposed(self): @@ -267,7 +270,7 @@ def test_llama3_405B_fp8_decomposed(self): ) @pytest.mark.xfail( - reason="Non-decomposed attention is not supported yet", + reason="FP8 model is unsupported", ) @longrun def test_llama3_405B_fp8(self): From fb78644401fa64a9a98740acc8b2eb00228013c9 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 14:53:35 -0500 Subject: [PATCH 41/51] Update mi-300x-3 path --- .github/workflows/ci_eval.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index db056b6cd..080641909 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -71,8 +71,8 @@ jobs: --longrun \ --iree-device='hip://7' \ --iree-hip-target='gfx942' \ - --llama3-8b-f16-model-path=/data/extra/models/llama3.1_8B/llama8b_f16.irpa \ - --llama3-8b-tokenizer-path=/data/extra/models/llama3.1_8B/tokenizer_config.json + --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa \ + --llama3-8b-tokenizer-path=/data/llama-3.1/8b/tokenizer_config.json # test_perplexity_torch: # timeout-minutes: 1000 @@ -120,5 +120,5 @@ jobs: # - name: Run perplexity test in eager mode # run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py \ # --longrun \ - # --llama3-8b-f16-model-path=/data/extra/models/llama3.1_8B/llama8b_f16.irpa \ - # --llama3-8b-tokenizer-path=/data/extra/models/llama3.1_8B/tokenizer_config.json + # --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa \ + # --llama3-8b-tokenizer-path=/data/llama-3.1/8b/tokenizer_config.json From 
c3aa964d31550e9f5aeb4482dc94a3cfeb6204d1 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 15:58:43 -0500 Subject: [PATCH 42/51] Test pytest command --- .github/workflows/ci_eval.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 080641909..0c10fe2a5 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -67,8 +67,7 @@ jobs: iree-runtime \ "numpy<2.0" - name: Run perplexity test with vmfb - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py \ - --longrun \ + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun \ --iree-device='hip://7' \ --iree-hip-target='gfx942' \ --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa \ From 7d277d37b99eba0dedc8c85f89070ebdcdcd75c9 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 16:09:43 -0500 Subject: [PATCH 43/51] Test pytest command --- .github/workflows/ci_eval.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 0c10fe2a5..273bfe3ef 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -67,11 +67,7 @@ jobs: iree-runtime \ "numpy<2.0" - name: Run perplexity test with vmfb - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun \ - --iree-device='hip://7' \ - --iree-hip-target='gfx942' \ - --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa \ - --llama3-8b-tokenizer-path=/data/llama-3.1/8b/tokenizer_config.json + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --iree-device='hip://7' --iree-hip-target='gfx942' --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama-3.1/8b/tokenizer_config.json # test_perplexity_torch: # timeout-minutes: 1000 From 5f5408495737bf0e9fff687f4feb9a4d047593ec Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 17:52:38 -0500 Subject: [PATCH 44/51] Revert benchmarking test changes --- sharktank/tests/models/llama/benchmark_amdgpu_tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_tests.py b/sharktank/tests/models/llama/benchmark_amdgpu_tests.py index 0d234466e..174fcbe87 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_tests.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_tests.py @@ -137,9 +137,9 @@ def get_export_cmd( raise NotImplementedError( "attention_kernel torch_sdpa not yet plumbed through" ) - # if tensor_parallelism_size: - # export_args.append("--tensor-parallelism-size") - # export_args.append(str(tensor_parallelism_size)) + if tensor_parallelism_size: + export_args.append("--tensor-parallelism-size") + export_args.append(str(tensor_parallelism_size)) cmd = subprocess.list2cmdline(export_args) return cmd From 052f24a8728182b84083354e08659d39cc06759a Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 18:15:08 -0500 Subject: [PATCH 45/51] Revert debug changes --- .github/workflows/ci_eval.yaml | 86 +++++++++---------- .../tests/evaluate/perplexity_vmfb_test.py | 2 +- 2 files changed, 42 insertions(+), 46 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 273bfe3ef..55aef5ac5 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -1,7 +1,6 @@ name: 
Evaluation Tests on: - pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. @@ -69,51 +68,48 @@ jobs: - name: Run perplexity test with vmfb run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --iree-device='hip://7' --iree-hip-target='gfx942' --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama-3.1/8b/tokenizer_config.json - # test_perplexity_torch: - # timeout-minutes: 1000 - # name: "Evaluation Tests - perplexity_torch" - # strategy: - # matrix: - # version: [3.11] - # runs-on: [llama-mi300] - # fail-fast: false - # runs-on: ${{matrix.runs-on}} - # defaults: - # run: - # shell: bash - # env: - # PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache" - # SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }} - # steps: - # - name: "Setting up Python" - # id: setup_python - # uses: actions/setup-python@v3 - # with: - # python-version: ${{matrix.version}} + test_perplexity_torch: + timeout-minutes: 1000 + name: "Evaluation Tests - perplexity_torch" + strategy: + matrix: + version: [3.11] + runs-on: [llama-mi300x-3] + fail-fast: false + runs-on: ${{matrix.runs-on}} + defaults: + run: + shell: bash + env: + PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache" + SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }} + steps: + - name: "Setting up Python" + id: setup_python + uses: actions/setup-python@v3 + with: + python-version: ${{matrix.version}} - # - name: "Checkout Code" - # uses: actions/checkout@v3 + - name: "Checkout Code" + uses: actions/checkout@v3 - # - name: Cache Pip Packages - # uses: actions/cache@v4 - # id: cache-pip - # with: - # path: ${{ env.PIP_CACHE_DIR }} - # key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }} + - name: Cache Pip Packages + uses: actions/cache@v4 + id: cache-pip + with: + path: ${{ env.PIP_CACHE_DIR }} + key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }} - # - name: Install sharktank deps - # run: | - # python -m pip install --no-compile --upgrade pip - # # Note: We install in three steps in order to satisfy requirements - # # from non default locations first. Installing the PyTorch CPU - # # wheels saves multiple minutes and a lot of bandwidth on runner setup. - # pip install --no-compile -r pytorch-cpu-requirements.txt - # pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ - # -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - # pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ + - name: Install sharktank deps + run: | + python -m pip install --no-compile --upgrade pip + # Note: We install in three steps in order to satisfy requirements + # from non default locations first. Installing the PyTorch CPU + # wheels saves multiple minutes and a lot of bandwidth on runner setup. 
+ pip install --no-compile -r pytorch-cpu-requirements.txt + pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ + -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" + pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - # - name: Run perplexity test in eager mode - # run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py \ - # --longrun \ - # --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa \ - # --llama3-8b-tokenizer-path=/data/llama-3.1/8b/tokenizer_config.json + - name: Run perplexity test in eager mode + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama-3.1/8b/tokenizer_config.json diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_vmfb_test.py index 3a6950ad6..8b3fc80dc 100644 --- a/sharktank/tests/evaluate/perplexity_vmfb_test.py +++ b/sharktank/tests/evaluate/perplexity_vmfb_test.py @@ -130,7 +130,7 @@ def test_llama3_8B_fp8_decomposed(self): ) @pytest.mark.xfail( - reason="Non-decomposed attention is not supported yet", + reason="FP8 model is unsupported", ) @longrun def test_llama3_8B_fp8(self): From a9227c7b88858d0a109f82eef44b18ac7b2f92dc Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 18:29:12 -0500 Subject: [PATCH 46/51] Xfail 405b eager mode perplexity till sharding is fixed --- sharktank/tests/evaluate/perplexity_vmfb_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_vmfb_test.py index 8b3fc80dc..0003c2afd 100644 --- a/sharktank/tests/evaluate/perplexity_vmfb_test.py +++ b/sharktank/tests/evaluate/perplexity_vmfb_test.py @@ -165,7 +165,7 @@ def test_llama3_8B_fp8(self): ) @pytest.mark.xfail( - reason="Sharding is not supported yet", + reason="Sharding needs to be fixed", ) @longrun def test_llama3_405B_f16_decomposed(self): From 31aebbd7279c4e632e08e8a50f143220651938ef Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 18:42:21 -0500 Subject: [PATCH 47/51] Add xfail to 405b as sharding needs to be fixed --- sharktank/tests/evaluate/perplexity_torch_test.py | 3 +++ sharktank/tests/evaluate/perplexity_vmfb_test.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sharktank/tests/evaluate/perplexity_torch_test.py b/sharktank/tests/evaluate/perplexity_torch_test.py index b2b098ae7..042132f20 100644 --- a/sharktank/tests/evaluate/perplexity_torch_test.py +++ b/sharktank/tests/evaluate/perplexity_torch_test.py @@ -143,6 +143,9 @@ def test_llama3_8B_fp8(self): msg=f"Current perplexity deviates baseline by {perplexity_difference}", ) + @pytest.mark.xfail( + reason="Sharding needs to be fixed", + ) @longrun def test_llama3_405B_f16_decomposed(self): diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_vmfb_test.py index 0003c2afd..93ffbe61c 100644 --- a/sharktank/tests/evaluate/perplexity_vmfb_test.py +++ b/sharktank/tests/evaluate/perplexity_vmfb_test.py @@ -165,7 +165,7 @@ def test_llama3_8B_fp8(self): ) @pytest.mark.xfail( - reason="Sharding needs to be fixed", + reason="Sharding is unsupported", ) @longrun def test_llama3_405B_f16_decomposed(self): From 461034bca7efba84bc59c976ba0d1dd3d295a5eb Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 
Oct 2024 18:55:16 -0500 Subject: [PATCH 48/51] Final testing --- .github/workflows/ci_eval.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 55aef5ac5..3936a356b 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -1,6 +1,7 @@ name: Evaluation Tests on: + pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. From 22da6e76cdc333558fc029cc940b9f197b2d2303 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 21:26:33 -0500 Subject: [PATCH 49/51] Fix CI test script --- .github/workflows/ci_eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 3936a356b..4eebdaf06 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -113,4 +113,4 @@ jobs: pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - name: Run perplexity test in eager mode - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama-3.1/8b/tokenizer_config.json + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama-3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama-3.1/8b/tokenizer_config.json From fe4988a46b51d2936e544a7c24ea6a7c4116a98b Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 23:36:24 -0500 Subject: [PATCH 50/51] Remove CI debugging --- .github/workflows/ci_eval.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 4eebdaf06..a528cfa13 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -1,7 +1,6 @@ name: Evaluation Tests on: - pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. 
From e2c6c17a4c6e3336238b893f3e454a7c99030a96 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 28 Oct 2024 23:50:26 -0500 Subject: [PATCH 51/51] Remove dummy 405b vmfb baseline numbers --- .../evaluate/baseline_perplexity_scores.json | 106 ------------------ 1 file changed, 106 deletions(-) diff --git a/sharktank/tests/evaluate/baseline_perplexity_scores.json b/sharktank/tests/evaluate/baseline_perplexity_scores.json index e824be52c..d9d0d454b 100644 --- a/sharktank/tests/evaluate/baseline_perplexity_scores.json +++ b/sharktank/tests/evaluate/baseline_perplexity_scores.json @@ -314,111 +314,5 @@ 12495.99707 ], "mean_perplexity": 12543.547432 - }, - - "llama3_405B_f16_decomposed_vmfb": { - "perplexities": [ - 2.170036, - 8.014498, - 3.743922, - 10.629776, - 8.965701, - 2.884743, - 2.886767, - 3.853816, - 2.73785, - 15.235562, - 2.65135, - 1.970936, - 5.08259, - 2.507602, - 7.571635, - 3.005182, - 1.904492, - 3.182651, - 6.249443, - 4.661795, - 12.68933, - 35.432453, - 5.50336, - 60.950359, - 18.433432, - 5.001391, - 4.814827, - 2.99482, - 2.697508, - 2.617349, - 2.359061, - 16.697233, - 2.145065, - 2.1207, - 2.496015, - 1.822896, - 4.671626, - 2.389186, - 2.701802, - 1.921128, - 2.236057, - 4.741998, - 4.946936, - 2.758695, - 2.446043, - 2.146302, - 8.72202, - 4.180647, - 11.449497, - 13.429152, - 3.72468, - 2.407385, - 3.592854, - 5.412414, - 3.189998, - 4.186216, - 1.642744, - 2.279058, - 1.855652, - 3.453852, - 1.436223, - 1.516955, - 1.716439, - 4.715765, - 21.48657, - 2.208737, - 6.420449, - 2.001433, - 2.400955, - 3.543744, - 3.054271, - 7.904545, - 1.950376, - 3.983746, - 6.28265, - 2.64157, - 5.473378, - 3.444444, - 1.926046, - 3.092915, - 3.996159, - 3.125222, - 1.718025, - 3.856093, - 3.041075, - 11.798485, - 14.881112, - 5.631516, - 4.407883, - 4.840533, - 21.351448, - 2.065821, - 6.658993, - 28.123312, - 1.673253, - 3.729975, - 5.336116, - 8.579758, - 2.979404, - 1.915619 - ], - "mean_perplexity": 6.060831 } }