From b9f59a7a1e4e03d56b3a3c86a6220633f6a16959 Mon Sep 17 00:00:00 2001
From: Nikola Vukobrat <124874832+nvukobratTT@users.noreply.github.com>
Date: Tue, 5 Nov 2024 17:11:04 +0100
Subject: [PATCH 01/18] Store node inputs during TVM codegen op parsing (#593)

- In order to generate unique op tests, we need knowledge about the input
  shapes of each op during codegen.
- Previously, we only needed details about generated op names, inputs,
  params, etc. However, those are not enough to support the logic for
  splitting a model across different op tests.

Fix #592
---
 forge/forge/tvm_to_python.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/forge/forge/tvm_to_python.py b/forge/forge/tvm_to_python.py
index 23e76c74a..11e7ebf4e 100644
--- a/forge/forge/tvm_to_python.py
+++ b/forge/forge/tvm_to_python.py
@@ -1830,11 +1830,30 @@ def forward(self, *acts):
 
 
 class Operation:
-    def __init__(self, function_name, output_name, node_name="", input_names=[], args=[], src_layer=None):
+    """
+    A class to store relevant code generation details about a specific operation.
+
+    Attributes:
+        function_name (str): The name of the function associated with the operation.
+        node_name (str): The name of the node in the computation graph.
+        output_name (str): The name of the output variable.
+        input_names (list): A list of input variable names.
+        input_shapes (list): A list of shapes corresponding to the input variables.
+        args (list): A list of arguments for the operation.
+        is_submodule_call (bool): A flag indicating if the operation is a submodule call (related to Torch 2.0).
+        inputs_to_delete (list): A list of inputs to delete.
+        loop_with (list): A list of loop variables.
+        src_layer (optional): The source layer associated with the operation.
+    """
+
+    def __init__(
+        self, function_name, output_name, node_name="", input_names=[], args=[], src_layer=None, input_shapes=[]
+    ):
         self.function_name = function_name
         self.node_name = node_name
         self.output_name = output_name
         self.input_names = input_names
+        self.input_shapes = input_shapes
         self.args = args
         self.is_submodule_call = False
         self.inputs_to_delete = []
@@ -2373,6 +2392,10 @@ def make_parser_friendly_name(node, node_type):
             input_names=input_names,
             args=args,
             src_layer=span_to_src_layer(node),
+            input_shapes=[
+                graph["nodes"][node["inputs"][input_port][0]]["forge_shape"]
+                for input_port in range(int(node["attrs"]["num_inputs"]))
+            ],
         )
 
         if any([input is None for input in forge_inputs]):

From 8c6614b5f96f96d3b10ffa08b785a2bf9aa6357a Mon Sep 17 00:00:00 2001
From: Nikola Vukobrat <124874832+nvukobratTT@users.noreply.github.com>
Date: Tue, 5 Nov 2024 17:11:29 +0100
Subject: [PATCH 02/18] Expand TVM Codegen functionality to enable generation
 of pytest functions within the module (#595)

- Detailed steps are documented in the function docstring.
- The purpose of this function is to generate a pytest within the Forge
  Module, which can further be used to invoke the Forge Module directly
  from the file itself.
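For illustration, the test emitted by this change looks roughly as follows — a sketch assuming a hypothetical generated class `ForgeAdd` with a single (1, 32, 32) input; real class names, module names, and shapes depend on the parsed model. The imports match the ones added to `write_header` in the diff below, and the body mirrors line for line what `write_pytest_function` emits:

```python
import torch
from forge import Tensor, compile
from forge.op.eval.common import compare_with_golden_pcc


def test_module():
    # Random inputs created from the input shapes recorded during codegen
    inputs = [
        Tensor.create_from_torch(torch.rand((1, 32, 32))),
    ]

    # Instantiate the generated ForgeModule and load its parameters
    framework_model = ForgeAdd("forge_add")
    framework_model.process_framework_parameters()
    fw_out = framework_model(*inputs)

    # Compile and run the same inputs through the compiled model
    compiled_model = compile(framework_model, sample_inputs=inputs)
    co_out = compiled_model(*inputs)

    assert all([compare_with_golden_pcc(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)])
```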
Fix #594
---
 forge/forge/python_codegen.py | 42 +++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/forge/forge/python_codegen.py b/forge/forge/python_codegen.py
index 8d93d74cb..acf3d038c 100644
--- a/forge/forge/python_codegen.py
+++ b/forge/forge/python_codegen.py
@@ -132,6 +132,8 @@ def write_header(self):
         self.wl("from loguru import logger")
 
         self.wl("import torch")
+        self.wl("from forge import Tensor, compile")
+        self.wl("from forge.op.eval.common import compare_with_golden_pcc, compare_with_golden")
         if self.framework == "tensorflow":
             self.wl("import tensorflow as tf")
             self.wl("from forge.tvm_utils import map_tf_dtype_to_pt")
@@ -949,6 +951,46 @@ def write_param_parser(self, param_names, param_file_name):
         else:
             assert False, "TODO: Add other framework param parsers"
 
+    def write_pytest_function(self, module_name, input_shapes):
+        """
+        Generates a pytest function to test a module with given input shapes.
+
+        This function writes a pytest function that:
+        1. Creates input tensors based on the provided shapes.
+        2. Initializes the framework model with the specified module name.
+        3. Processes the framework parameters.
+        4. Runs the framework model with the created inputs.
+        5. Compiles the framework model.
+        6. Runs the compiled model with the same inputs.
+        7. Asserts that the outputs of the framework model and the compiled model are similar within a specified tolerance.
+
+        Args:
+            module_name (str): The name of the module to be tested.
+            input_shapes (list): A list of shapes for the input tensors.
+        """
+        self.wl("")
+        self.wl("")
+        self.wl("def test_module():")
+        self.indent += 1
+        self.wl("inputs = [")
+        self.indent += 1
+        for shape in input_shapes:
+            self.wl(f"Tensor.create_from_torch(torch.rand({shape})),")
+        self.indent -= 1
+        self.wl("]")
+        self.wl("")
+        self.wl(f"framework_model = {self.class_name}('{module_name}')")
+        self.wl("framework_model.process_framework_parameters()")
+        self.wl("fw_out = framework_model(*inputs)")
+        self.wl("")
+        self.wl("compiled_model = compile(framework_model, sample_inputs=inputs)")
+        self.wl("co_out = compiled_model(*inputs)")
+        self.wl("")
+        self.wl(
+            "assert all([compare_with_golden_pcc(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)])"
+        )
+        self.indent -= 1
+
 
 class PyTorchWriter(PythonWriter):
     incompatible_np_float_types = [

From f5b51b6cc0a50f5351d350e80e153dd9e6244ea6 Mon Sep 17 00:00:00 2001
From: Nikola Vukobrat <124874832+nvukobratTT@users.noreply.github.com>
Date: Tue, 5 Nov 2024 17:11:53 +0100
Subject: [PATCH 03/18] Script to run generated standalone Forge tests +
 report summary (#603)

- The purpose of this script is to run standalone Forge op tests and
  generate a summary with passing/failing states.
- During script execution, we're outputting logs with the time required
  for each test run.
- Keep in mind that each pytest is run as a separate subprocess. Why?
  Because MLIR doesn't have good exception handling, and often fails by
  crashing the pytest process. Therefore, we're using the main process as
  a monitor to record those failures as well.
- Each test log is stored in a separate file. The final report is
  generated based on those logs.
- The main summary file contains information about key errors and is
  configurable to produce cleaner or more verbose error notes.
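The crash-isolation idea described above, reduced to a minimal sketch (hypothetical `run_isolated` helper; the 30-second timeout matches the default used in the script below):

```python
import subprocess


def run_isolated(test_path: str) -> bool:
    """Run one pytest file in its own process, so a hard crash in MLIR
    cannot take down the monitoring process."""
    try:
        # check=True raises CalledProcessError on any non-zero exit code
        # (test failure or crashed pytest process); timeout guards hangs.
        subprocess.run(["pytest", test_path], check=True, capture_output=True, text=True, timeout=30)
        return True
    except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
        return False
```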
Fix #602
---
 run_tests.py | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 run_tests.py

diff --git a/run_tests.py b/run_tests.py
new file mode 100644
index 000000000..c203a3b1a
--- /dev/null
+++ b/run_tests.py
@@ -0,0 +1,156 @@
+# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+import subprocess
+import os
+import time
+from datetime import datetime
+
+
+def extract_error_context(log_file, keyword="error", num_lines_before=0, num_lines_after=0, max_errors=None):
+    """
+    Extracts lines around the keyword from the log file.
+    """
+    context_lines = []
+    error_count = 0
+    with open(log_file, "r") as f:
+        lines = f.readlines()
+        for i, line in enumerate(lines):
+            if keyword in line.lower():
+                if max_errors is not None and error_count >= max_errors:
+                    break
+
+                start = max(i - num_lines_before, 0)
+                end = min(i + num_lines_after + 1, len(lines))
+
+                context_lines.append(f"\nError context around line {i + 1}:\n")
+                context_lines.append("---\n")  # Divider between error contexts
+                context_lines.append("".join(lines[start:end]))
+                context_lines.append("\n---\n")  # Divider between error contexts
+                error_count += 1
+
+    return context_lines
+
+
+def run_tests(
+    test_directory,
+    log_directory="test_logs",
+    num_lines_before=1,
+    num_lines_after=5,
+    max_errors=None,
+    max_tests_to_run=-1,
+):
+    """
+    Runs all pytest files in the given directory, logging each test's output separately.
+    Creates a summary with pass/fail counts and specific error messages for failures.
+    """
+    # Ensure the log directory exists
+    os.makedirs(log_directory, exist_ok=True)
+
+    test_files = [f for f in os.listdir(test_directory) if f.startswith("test_") or f.endswith("_test.py")]
+    test_files = sorted(test_files)
+    summary = {"passed": 0, "failed": 0, "failures": {}}
+
+    for test_id, test_file in enumerate(test_files):
+        if test_id > max_tests_to_run and max_tests_to_run > 0:
+            break
+
+        test_path = os.path.join(test_directory, test_file)
+        log_file = os.path.join(log_directory, f"{test_file}_log.txt")
+
+        print(f"Running test: {test_file}")
+
+        start_time = time.time()
+
+        try:
+            # Run each test file as a separate subprocess with a timeout of 30 seconds
+            result = subprocess.run(["pytest", test_path], check=True, capture_output=True, text=True, timeout=30)
+
+            # Log output to a file
+            with open(log_file, "w") as f:
+                if result.stderr:
+                    f.write("=== STDERR ===\n")
+                    f.write(result.stderr)
+                if result.stdout:
+                    f.write("=== STDOUT ===\n")
+                    f.write(result.stdout)
+
+            elapsed_time = time.time() - start_time
+            # Print pass message with clear formatting
+            print(f"\tPassed ({elapsed_time:.2f} seconds)")
+            summary["passed"] += 1
+
+        except subprocess.TimeoutExpired as e:
+            elapsed_time = time.time() - start_time
+            error_message = "Test timed out after 30 seconds"
+
+            # Do WH warm reset (potentially hang occurred)
+            print("\tWarm reset...")
+            os.system("/home/software/syseng/wh/tt-smi -lr all")
+
+            # Log timeout error to a file
+            with open(log_file, "w") as f:
+                f.write("=== TIMEOUT ===\n")
+                f.write(error_message)
+
+            # Print timeout message with clear formatting
+            print(f"\tFailed ({elapsed_time:.2f} seconds) - {error_message}")
+            summary["failed"] += 1
+            summary["failures"][test_file] = error_message
+
+        except subprocess.CalledProcessError as e:
+            # Log output to a file
+            with open(log_file, "w") as f:
+                if e.stderr:
+                    f.write("=== STDERR ===\n")
+                    f.write(e.stderr)
+                if e.stdout:
+                    f.write("=== STDOUT ===\n")
+                    f.write(e.stdout)
+
+            elapsed_time = time.time() - start_time
+            error_message = e.stderr
+
+            # Print fail message with clear formatting
+            print(f"\tFailed ({elapsed_time:.2f} seconds)")
+            summary["failed"] += 1
+            summary["failures"][test_file] = error_message
+
+        except Exception as ex:
+            elapsed_time = time.time() - start_time
+            print(f"An unexpected error occurred while running {test_file}: {ex} ({elapsed_time:.2f} seconds)")
+
+    # Print and log summary
+    print("\n=== Test Summary ===")
+    print(f"Total tests run: {len(test_files)}")
+    print(f"Tests passed: {summary['passed']}")
+    print(f"Tests failed: {summary['failed']}")
+
+    # Write summary to a file with a timestamp
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    summary_file = os.path.join(log_directory, f"summary_{timestamp}.txt")
+
+    with open(summary_file, "w") as f:
+        f.write(f"Total tests run: {len(test_files)}\n")
+        f.write(f"Tests passed: {summary['passed']}\n")
+        f.write(f"Tests failed: {summary['failed']}\n")
+
+        if summary["failed"] > 0:
+            f.write("\nFailed Tests:\n")
+            for test, message in summary["failures"].items():
+                f.write(f"\n{'#' * 9}\n")
+                f.write(f"\nTest name: {test}\n")
+                f.write(f"\n{'#' * 9}\n\n")
+                error_context = extract_error_context(
+                    os.path.join(log_directory, f"{test}_log.txt"),
+                    num_lines_before=num_lines_before,
+                    num_lines_after=num_lines_after,
+                    max_errors=max_errors,
+                )
+                f.writelines(error_context)
+
+
+if __name__ == "__main__":
+    # Set your test directory here
+    test_directory = "./generated_modules"  # Adjust this path to your test directory
+    run_tests(test_directory, max_errors=1, max_tests_to_run=-1)  # Adjust max_errors as needed

From 8e76b59c849772d04a7e97055398f7b3a3ff81d7 Mon Sep 17 00:00:00 2001
From: Nikola Vukobrat <124874832+nvukobratTT@users.noreply.github.com>
Date: Tue, 5 Nov 2024 17:27:29 +0100
Subject: [PATCH 04/18] Remove the requirement to push the whole Framework
 Module as input to the ForgeModule during compile time (#591)

Previously required for parameter initialization.

- This change is required to remove the constraints added during op test
  generation.
- This change cleans up the generated module a bit and serializes all
  required parameters, which are then simply loaded within the model at
  compile time.
- This change represents one of the steps towards generating single op
  tests with focus on a specific model.
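As an illustration, for a hypothetical generated module `MyModule` (file paths shortened here), the emitted parameter parser now starts roughly like this, per the writer changes below; the per-parameter assignment code that follows it is unchanged:

```python
def process_framework_parameters(self):
    # Parameters and buffers are loaded from serialized .pt files,
    # so no framework model instance needs to be passed in anymore.
    named_parameters = torch.load("MyModule_names_params.pt")
    serialized_params = torch.load("MyModule_params.pt")
    named_parameters.update(serialized_params)
    named_buffers = torch.load("MyModule_named_buffers.pt")
    named_parameters.update(named_buffers)
```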
Fix #590
---
 forge/forge/python_codegen.py | 12 +++++++-----
 forge/forge/tvm_to_python.py  | 27 ++++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/forge/forge/python_codegen.py b/forge/forge/python_codegen.py
index acf3d038c..e12c9504a 100644
--- a/forge/forge/python_codegen.py
+++ b/forge/forge/python_codegen.py
@@ -259,17 +259,19 @@ def write_forward(self, ops, inputs, outputs):
         self.indent = 0
         self.wl("")
 
-    def write_param_parser(self, param_names, param_file_name):
+    def write_param_parser(
+        self, param_names, param_file_name, names_params_file_name=None, named_buffers_file_name=None
+    ):
         self.indent = 1
 
         if self.framework == "pytorch":
-            self.wl(f"def process_framework_parameters(self, model):")
+            self.wl(f"def process_framework_parameters(self):")
             self.indent += 1
-            self.wl(f"named_parameters = dict(model.state_dict().items())")
+            self.wl(f"named_parameters = torch.load('{names_params_file_name}')")
             if param_file_name is not None:
                 self.wl(f'serialized_params = torch.load("{param_file_name}")')
                 self.wl(f"named_parameters.update(serialized_params)")
-            self.wl("named_buffers = dict(model.named_buffers())")
+            self.wl(f"named_buffers = torch.load('{named_buffers_file_name}')")
             self.wl("named_parameters.update(named_buffers)")
 
             if len(param_names):
@@ -1249,7 +1251,7 @@ def write_param_parser(self, param_names, param_file_name):
         self.indent = 1
 
         if self.framework == "pytorch":
-            self.wl(f"def process_framework_parameters(self, model):")
+            self.wl(f"def process_framework_parameters(self):")
             self.indent += 1
             self.wl("named_parameters = dict(model.named_parameters())")

diff --git a/forge/forge/tvm_to_python.py b/forge/forge/tvm_to_python.py
index 11e7ebf4e..671cc7031 100644
--- a/forge/forge/tvm_to_python.py
+++ b/forge/forge/tvm_to_python.py
@@ -2039,7 +2039,12 @@ def generate_forge_module(
             forge_mod.module.process_framework_parameters(framework_mod.module)
         else:
             forge_mod = TestClass(writer.module_name)
-            forge_mod.process_framework_parameters(framework_mod.module)
+
+            if isinstance(framework_mod, forge.module.PyTorchModule):
+                forge_mod.process_framework_parameters()
+            else:
+                forge_mod.process_framework_parameters(framework_mod.module)
+
             assert not any(
                 [param.value() is None for param in forge_mod.get_parameters()]
             ), f"Could not retrieve parameters from framework and tvm"
@@ -2655,8 +2660,24 @@ def delete_unneeded_outputs(ops, returns):
         param_file_name = os.path.join(writer.module_directory, writer.module_name + "_params.pt")
         torch.save(params_from_tvm, param_file_name)
 
-    param_names.update(const_names)
-    writer.write_param_parser(param_names, param_file_name)
+    if framework == "pytorch":
+        # Store named parameters
+        names_params_file_name = os.path.join(writer.module_directory, writer.module_name + "_names_params.pt")
+        named_parameters = dict(framework_mod.module.state_dict().items())
+        torch.save(named_parameters, names_params_file_name)
+
+        # Store named buffers
+        named_buffers_file_name = os.path.join(writer.module_directory, writer.module_name + "_named_buffers.pt")
+        named_buffers = dict(framework_mod.module.named_buffers())
+        torch.save(named_buffers, named_buffers_file_name)
+
+        # Generate Forge module 
parameter parser + param_names.update(const_names) + writer.write_param_parser(param_names, param_file_name, names_params_file_name, named_buffers_file_name) + else: + param_names.update(const_names) + writer.write_param_parser(param_names, param_file_name) + writer.close_file() modules.append(writer) From 6171aed7dd7b2125fc0b72c56e3d3773a888a525 Mon Sep 17 00:00:00 2001 From: Nikola Vukobrat <124874832+nvukobratTT@users.noreply.github.com> Date: Tue, 5 Nov 2024 17:27:48 +0100 Subject: [PATCH 05/18] Option to setup compiler configurations using pytest arguments (#601) Fix #600 --- forge/test/conftest.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/forge/test/conftest.py b/forge/test/conftest.py index 722254c6d..90431b713 100644 --- a/forge/test/conftest.py +++ b/forge/test/conftest.py @@ -29,6 +29,8 @@ from forge.verify.config import TestKind from forge.torch_compile import reset_state +from forge.config import _get_global_compiler_config + collect_ignore = ["legacy_tests"] @@ -98,6 +100,9 @@ def clear_forge(): def pytest_addoption(parser): + parser.addoption( + "--generate-op-tests", action="store_true", default=False, help="Generate op tests for the given model" + ) parser.addoption( "--silicon-only", action="store_true", default=False, help="run silicon tests only, skip golden/model" ) @@ -166,6 +171,25 @@ def runslow(request): """ +@pytest.fixture(autouse=True, scope="session") +def initialize_global_compiler_configuration_based_on_pytest_args(pytestconfig): + """ + Set the global compiler config options for the test session + which will generate op tests for the given model. + """ + compiler_cfg = _get_global_compiler_config() + + compiler_cfg.tvm_generate_op_tests = pytestconfig.getoption("--generate-op-tests") + + if compiler_cfg.tvm_generate_op_tests: + # For running standalone tests, we need to retain the generated python files + # together with stored model parameters + compiler_cfg.retain_tvm_python_files = True + + # Required to prevent early tensor deallocation + compiler_cfg.enable_op_level_comparision = True + + @pytest.hookimpl(tryfirst=True) def pytest_cmdline_preparse(config, args): From 859c0e0120b3cd9fc8daea8ac74fe5c0acf88f19 Mon Sep 17 00:00:00 2001 From: Predrag Ilkic <148892209+pilkicTT@users.noreply.github.com> Date: Tue, 5 Nov 2024 17:44:55 +0100 Subject: [PATCH 06/18] [tt-mlir] uplift to b75d44d9 (#625) There were 25 test cases failing; all failures contained in `test_conv2d` and `test_maximum` tests. To unblock uplift and close the gap to `tt-mlir`, I've marked these test cases as xfail, until the fixes from `tt-metal` come. Thanks to @mtopalovicTT, for inliner related changes. --- forge/csrc/passes/mlir_compiler.cpp | 3 +++ forge/test/mlir/test_ops.py | 1 + forge/test/mlir/test_ops_tf.py | 13 ++----------- third_party/tt-mlir | 2 +- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/forge/csrc/passes/mlir_compiler.cpp b/forge/csrc/passes/mlir_compiler.cpp index 4eba2052a..1495a0626 100644 --- a/forge/csrc/passes/mlir_compiler.cpp +++ b/forge/csrc/passes/mlir_compiler.cpp @@ -17,6 +17,7 @@ #pragma clang diagnostic pop // MLIR headers +#include "mlir/Dialect/Func/Extensions/InlinerExtension.h" #include "mlir/IR/BuiltinOps.h" #include "utils/logger.hpp" @@ -50,6 +51,8 @@ runtime::Binary run_mlir_compiler(tt::ForgeGraphModule& module) mlir::ml_program::MLProgramDialect, mlir::tensor::TensorDialect>(); + mlir::func::registerInlinerExtension(registry); + // Create a context with all registered dialects. 
mlir::MLIRContext context(registry); diff --git a/forge/test/mlir/test_ops.py b/forge/test/mlir/test_ops.py index 985252442..db84c8b56 100644 --- a/forge/test/mlir/test_ops.py +++ b/forge/test/mlir/test_ops.py @@ -371,6 +371,7 @@ def forward(self, x): ((1, 32, 32, 32), (1,)), ], ) +@pytest.mark.xfail(reason="TTNN maximum op: unsupported broadcast") def test_maximum(shape_x, shape_y): class maximum(nn.Module): def __init__(self): diff --git a/forge/test/mlir/test_ops_tf.py b/forge/test/mlir/test_ops_tf.py index 6537fed26..85f5969be 100644 --- a/forge/test/mlir/test_ops_tf.py +++ b/forge/test/mlir/test_ops_tf.py @@ -24,17 +24,7 @@ (1, 256, 256, 28, 28, 3, 3, 2, 2), (1, 256, 256, 14, 14, 3, 3, 1, 1), (1, 64, 64, 8, 8, 3, 3, 1, 1), - ( - 1, - 64, - 64, - 16, - 16, - 3, - 3, - 1, - 1, - ), + (1, 64, 64, 16, 16, 3, 3, 1, 1), (1, 256, 256, 7, 7, 3, 3, 1, 1), (1, 256, 64, 56, 56, 1, 1, 2, 2), ), @@ -58,6 +48,7 @@ ], ) @pytest.mark.parametrize("has_bias", [False, True], ids=["no_bias", "with_bias"]) +@pytest.mark.xfail(reason="TTNN fails to tilize during reshape after conv") def test_conv2d( batch_size, output_channels, diff --git a/third_party/tt-mlir b/third_party/tt-mlir index 50f0f035f..b75d44d98 160000 --- a/third_party/tt-mlir +++ b/third_party/tt-mlir @@ -1 +1 @@ -Subproject commit 50f0f035f53cd3c755be8c3650c55cf3d4e3170b +Subproject commit b75d44d98fdba061b1c1da396cde9a42c07911a9 From 3533eb8af9d28d1634a9299b9a4ea9a3e46419e0 Mon Sep 17 00:00:00 2001 From: Nikola Vukobrat <124874832+nvukobratTT@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:44:19 +0100 Subject: [PATCH 07/18] Log memory usage during each pytest run (#633) Fix #632 --- conftest.py | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/conftest.py b/conftest.py index 3582930dc..4cd7f58da 100644 --- a/conftest.py +++ b/conftest.py @@ -2,7 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 +import time import pytest +import psutil +import threading +from loguru import logger from datetime import datetime @@ -13,3 +17,71 @@ def record_test_timestamp(record_property): yield end_timestamp = datetime.strftime(datetime.now(), "%Y-%m-%dT%H:%M:%S%z") record_property("end_timestamp", end_timestamp) + + +@pytest.fixture(autouse=True) +def memory_usage_tracker(): + """ + A pytest fixture that tracks memory usage during the execution of a test. + + This fixture automatically tracks the memory usage of the process running the tests. + It starts tracking before the test runs, continues tracking in a background thread during the test, + and stops tracking after the test completes. It logs the memory usage statistics including the + minimum, maximum, average, and total memory usage by the test. + + The memory usage is measured in megabytes (MB). + + Note: + - This fixture is automatically used for all tests due to the `autouse=True` parameter. + - The interval for memory readings can be adjusted by changing the sleep duration in the `track_memory` function. + - Min, max, and avg memory usage are calculated based on the recorded memory readings from system memory. 
+ """ + process = psutil.Process() + + # Initialize memory tracking variables + start_mem = process.memory_info().rss / (1024 * 1024) # MB + min_mem = start_mem + max_mem = start_mem + total_mem = start_mem + count = 1 + + # Start a background thread or loop to collect memory usage over time + tracking = True + + def track_memory(): + nonlocal min_mem, max_mem, total_mem, count + while tracking: + current_mem = process.memory_info().rss / (1024 * 1024) + min_mem = min(min_mem, current_mem) + max_mem = max(max_mem, current_mem) + total_mem += current_mem + count += 1 + time.sleep(0.1) # Adjust the interval as needed + + # Start tracking in a background thread + import threading + + tracker_thread = threading.Thread(target=track_memory) + tracker_thread.start() + + # Run the test + yield + + # Stop tracking and wait for the thread to finish + tracking = False + tracker_thread.join() + + # Calculate end memory and memory usage stats + end_mem = process.memory_info().rss / (1024 * 1024) # MB + min_mem = min(min_mem, end_mem) + max_mem = max(max_mem, end_mem) + total_mem += end_mem + count += 1 + avg_mem = total_mem / count + + # Log memory usage statistics + logger.info(f"Test memory usage:") + logger.info(f" By test: {end_mem - start_mem:.2f} MB") + logger.info(f" Minimum: {min_mem:.2f} MB") + logger.info(f" Maximum: {max_mem:.2f} MB") + logger.info(f" Average: {avg_mem:.2f} MB") From 70dcd1420d95f4de01d4cc1e0cd04df108da240c Mon Sep 17 00:00:00 2001 From: Mateja Stojkovic Date: Thu, 7 Nov 2024 06:33:04 +0100 Subject: [PATCH 08/18] Support for cosine and sine (#635) * added support for cosine and sine * comment for lexicographical order * llama 3b now compiles --- forge/csrc/passes/lower_to_mlir.cpp | 28 ++++++++++--------- forge/forge/op/eltwise_unary.py | 12 -------- forge/test/mlir/llama/test_llama_inference.py | 1 - forge/test/mlir/test_ops.py | 2 -- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/forge/csrc/passes/lower_to_mlir.cpp b/forge/csrc/passes/lower_to_mlir.cpp index 8194697f9..311388b00 100644 --- a/forge/csrc/passes/lower_to_mlir.cpp +++ b/forge/csrc/passes/lower_to_mlir.cpp @@ -543,37 +543,39 @@ class MLIRGenerator return string_value; } - /// Initialize lowering handler map + /// Initialize lowering handler map, keep in lexicographical order void init_lowering_handler_map() { + lowering_handler_map["abs"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["add"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["cast"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["concatenate"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["conv2d"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["cosine"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["embedding"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["exp"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["greater_equal"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["greater"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["less"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["matmul"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["max_pool2d"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["maximum"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["multiply"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["not_equal"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["reciprocal"] 
= &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["reduce_avg"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["reduce_max"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["reduce_sum"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["relu"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["reshape"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["sigmoid"] = &MLIRGenerator::emit_mlir_ttforge_op; + lowering_handler_map["sine"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["softmax"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["sqrt"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["squeeze"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["subtract"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["transpose"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["greater_equal"] = &MLIRGenerator::emit_mlir_ttforge_op; lowering_handler_map["unsqueeze"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["conv2d"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["concatenate"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["sigmoid"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["max_pool2d"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["abs"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["exp"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["maximum"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["less"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["greater"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["not_equal"] = &MLIRGenerator::emit_mlir_ttforge_op; - lowering_handler_map["cast"] = &MLIRGenerator::emit_mlir_ttforge_op; } }; } // namespace diff --git a/forge/forge/op/eltwise_unary.py b/forge/forge/op/eltwise_unary.py index a2fc38fa2..365cb882e 100644 --- a/forge/forge/op/eltwise_unary.py +++ b/forge/forge/op/eltwise_unary.py @@ -380,12 +380,6 @@ def Sine(name: str, operandA: Tensor) -> Tensor: operandA: Tensor First operand - min: float - Minimum value - - max: float - Maximum value - Returns ------- Tensor @@ -407,12 +401,6 @@ def Cosine(name: str, operandA: Tensor) -> Tensor: operandA: Tensor First operand - min: float - Minimum value - - max: float - Maximum value - Returns ------- Tensor diff --git a/forge/test/mlir/llama/test_llama_inference.py b/forge/test/mlir/llama/test_llama_inference.py index a5b918cd1..e6f55e4ed 100644 --- a/forge/test/mlir/llama/test_llama_inference.py +++ b/forge/test/mlir/llama/test_llama_inference.py @@ -11,7 +11,6 @@ @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) -@pytest.mark.xfail() def test_llama_inference(model_path): if model_path == "meta-llama/Llama-3.2-1B": pytest.skip("Skipping test for Llama-3.2-1B model, waiting for new transformers version.") diff --git a/forge/test/mlir/test_ops.py b/forge/test/mlir/test_ops.py index db84c8b56..482a0491d 100644 --- a/forge/test/mlir/test_ops.py +++ b/forge/test/mlir/test_ops.py @@ -61,7 +61,6 @@ def get_input_tensor(dtype): (1, 7, 256), ], ) -@pytest.mark.xfail(reason="Found Unsupported operations while lowering from TTForge to TTIR in forward graph") def test_sin(shape): class sin(nn.Module): def __init__(self): @@ -89,7 +88,6 @@ def forward(self, x): (1, 7, 256), ], ) -@pytest.mark.xfail(reason="Found Unsupported operations while 
lowering from TTForge to TTIR in forward graph") def test_cosine(shape): class cosine(nn.Module): def __init__(self): From 535a25fb29219d8db435e42daf52043b013c37e1 Mon Sep 17 00:00:00 2001 From: Kamalraj Kannan <157608228+kamalrajkannan78@users.noreply.github.com> Date: Thu, 7 Nov 2024 12:21:46 +0530 Subject: [PATCH 09/18] update compile depth of models & fix UNet issue (#636) --- .../high_prio/cnn/pytorch/test_blazepose.py | 4 ++-- .../model_demos/high_prio/cnn/pytorch/test_clip.py | 2 +- .../model_demos/high_prio/cnn/pytorch/test_dla.py | 5 ++++- .../high_prio/cnn/pytorch/test_efficientnet.py | 6 +++--- .../high_prio/cnn/pytorch/test_mobilenet_v1.py | 6 +++--- .../high_prio/cnn/pytorch/test_mobilenet_v2.py | 2 +- .../high_prio/cnn/pytorch/test_openpose.py | 2 +- .../model_demos/high_prio/cnn/pytorch/test_rcnn.py | 2 +- .../high_prio/cnn/pytorch/test_resnet.py | 6 ++---- .../high_prio/cnn/pytorch/test_resnext.py | 14 +++++++------- .../model_demos/high_prio/cnn/pytorch/test_unet.py | 8 ++------ .../model_demos/high_prio/cnn/pytorch/test_vgg.py | 9 ++++++--- .../model_demos/high_prio/cnn/pytorch/test_vit.py | 2 -- .../high_prio/cnn/pytorch/test_vovnet.py | 1 - .../high_prio/cnn/pytorch/test_yolox.py | 2 +- .../high_prio/nlp/pytorch/test_llama3.py | 2 +- .../model_demos/high_prio/nlp/pytorch/test_phi3.py | 4 ++-- .../model_demos/high_prio/nlp/pytorch/test_t5.py | 8 +++++++- .../high_prio/nlp/pytorch/test_whisper_0.py | 2 +- forge/test/model_demos/models/wideresnet.py | 4 ++-- forge/test/model_demos/models/xception.py | 2 +- 21 files changed, 48 insertions(+), 45 deletions(-) diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py index 222dc0b88..bcf04d32d 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py @@ -50,7 +50,7 @@ def test_blazepose_regressor_pytorch(test_device): pose_regressor = BlazePoseLandmark() pose_regressor.load_weights("mediapipepytorch/blazepose_landmark.pth") img2 = [torch.rand(1, 3, 256, 256)] - compiled_model = forge.compile(pose_regressor, sample_inputs=[img2], module_name="pt_blazepose_regressor") + compiled_model = forge.compile(pose_regressor, sample_inputs=img2, module_name="pt_blazepose_regressor") @pytest.mark.skip(reason="dependent on CCM repo") @@ -88,4 +88,4 @@ def test_blaze_hand_pytorch(test_device): hand_regressor.load_weights("mediapipepytorch/blazehand_landmark.pth") sample_tensor = [torch.rand(1, 3, 256, 256)] - compiled_model = forge.compile(hand_regressor, sample_inputs=[sample_tensor], module_name="pt_hand_regressor") + compiled_model = forge.compile(hand_regressor, sample_inputs=sample_tensor, module_name="pt_hand_regressor") diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_clip.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_clip.py index 3d3fa2e20..e62aedbc0 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_clip.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_clip.py @@ -103,7 +103,7 @@ def test_clip_pytorch(test_device): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.CONSTEVAL_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH # Load processor and model from HuggingFace model_ckpt = "openai/clip-vit-base-patch32" diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_dla.py 
b/forge/test/model_demos/high_prio/cnn/pytorch/test_dla.py index 728bcb080..a5de8a808 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_dla.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_dla.py @@ -40,7 +40,10 @@ def test_dla_pytorch(variant, test_device): # Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + if variant in ("dla102", "dla102x", "dla102x2", "dla169"): + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE + else: + compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH func = variants_func[variant] # Load data sample diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py index d555fce09..bf7dcc0cd 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py @@ -21,7 +21,7 @@ variants = [ "efficientnet_b0", - # "efficientnet_b4", + "efficientnet_b4", # "hf_hub:timm/efficientnet_b0.ra_in1k", # "hf_hub:timm/efficientnet_b4.ra2_in1k", # "hf_hub:timm/efficientnet_b5.in12k_ft_in1k", @@ -36,7 +36,7 @@ def test_efficientnet_timm(variant, test_device): # Configuration compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # Load model framework_model = download_model(timm.create_model, variant, pretrained=True) @@ -85,7 +85,7 @@ def get_state_dict(self, *args, **kwargs): def test_efficientnet_torchvision(variant, test_device): # Configuration compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # Load model if variant == "efficientnet_b0": diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py index 932373a88..d90074594 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py @@ -137,7 +137,7 @@ def forward(self, input): def generate_model_mobilenetV1_base_custom_pytorch(test_device, variant): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # Create Forge module from PyTorch model model = MobileNetV1(9) @@ -165,7 +165,7 @@ def test_mobilenetv1_basic(test_device): def generate_model_mobilenetv1_imgcls_hf_pytorch(test_device, variant): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.INIT_COMPILE # Create Forge module from PyTorch model preprocessor = download_model(AutoImageProcessor.from_pretrained, variant) @@ -193,7 +193,7 @@ def test_mobilenetv1_192(test_device): def generate_model_mobilenetV1I224_imgcls_hf_pytorch(test_device, variant): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.INIT_COMPILE # Create 
Forge module from PyTorch model preprocessor = download_model(AutoImageProcessor.from_pretrained, variant) diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py index 400789015..798683e37 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py @@ -20,7 +20,7 @@ def generate_model_mobilenetV2_imgcls_torchhub_pytorch(test_device, variant): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.INIT_COMPILE + compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH model = download_model(torch.hub.load, variant, "mobilenet_v2", pretrained=True) diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_openpose.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_openpose.py index 82e82772c..827f07bad 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_openpose.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_openpose.py @@ -326,7 +326,7 @@ def generate_model_openpose_posdet_osmr_pytorch(test_device, variant): # Configurations compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.INIT_COMPILE # Load model framework_model = download_model(ptcv_get_model, variant, pretrained=True) diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py index 622eab9c8..d3ce463a3 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py @@ -57,7 +57,7 @@ def test_rcnn_pytorch(test_device): # Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.GENERATE_INITIAL_GRAPH # Proposals generated by selective search were fed to a model in a loop manner to compute features. 
# [Refer line No.151 in https://github.com/object-detection-algorithm/R-CNN/blob/master/py/car_detector.py] diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_resnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_resnet.py index 71710cb8e..82995033e 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_resnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_resnet.py @@ -22,7 +22,7 @@ def generate_model_resnet_imgcls_hf_pytorch(variant): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # Load data sample try: @@ -47,8 +47,6 @@ def test_resnet(test_device): "microsoft/resnet-50", ) - compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH compiled_model = forge.compile(model, sample_inputs=[inputs[0]], module_name="pt_resnet50") @@ -60,7 +58,7 @@ def generate_model_resnet_imgcls_timm_pytorch(variant): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # Load data sample try: diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_resnext.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_resnext.py index 065dd6b1f..d0ee7f4a3 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_resnext.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_resnext.py @@ -38,7 +38,7 @@ def get_image_tensor(): def test_resnext_50_torchhub_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from PyTorch model model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "resnext50_32x4d", pretrained=True) @@ -55,7 +55,7 @@ def test_resnext_50_torchhub_pytorch(test_device): def test_resnext_101_torchhub_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from PyTorch model model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "resnext101_32x8d", pretrained=True) @@ -73,7 +73,7 @@ def test_resnext_101_32x8d_fb_wsl_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from PyTorch model # 4 variants @@ -91,7 +91,7 @@ def test_resnext_101_32x8d_fb_wsl_pytorch(test_device): def test_resnext_14_osmr_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from 
PyTorch model model = download_model(ptcv_get_model, "resnext14_32x4d", pretrained=True) @@ -109,7 +109,7 @@ def test_resnext_14_osmr_pytorch(test_device): def test_resnext_26_osmr_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from PyTorch model model = download_model(ptcv_get_model, "resnext26_32x4d", pretrained=True) @@ -126,7 +126,7 @@ def test_resnext_26_osmr_pytorch(test_device): def test_resnext_50_osmr_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from PyTorch model model = download_model(ptcv_get_model, "resnext50_32x4d", pretrained=True) @@ -143,7 +143,7 @@ def test_resnext_50_osmr_pytorch(test_device): def test_resnext_101_osmr_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from PyTorch model model = download_model(ptcv_get_model, "resnext101_64x4d", pretrained=True) diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_unet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_unet.py index 64b39d258..6855e1403 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_unet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_unet.py @@ -14,7 +14,6 @@ import pytest from pytorchcv.model_provider import get_model as ptcv_get_model import segmentation_models_pytorch as smp -from segmentation_models_pytorch.encoders import get_preprocessing_fn def generate_model_unet_imgseg_osmr_pytorch(variant): @@ -124,7 +123,7 @@ def generate_model_unet_imgseg_torchhub_pytorch(variant): model = download_model( torch.hub.load, - "mateuszforge/brain-segmentation-pytorch", + "mateuszbuda/brain-segmentation-pytorch", variant, in_channels=3, out_channels=1, @@ -135,7 +134,7 @@ def generate_model_unet_imgseg_torchhub_pytorch(variant): # Download an example input image url, filename = ( - "https://github.com/mateuszforge/brain-segmentation-pytorch/raw/master/assets/TCGA_CS_4944.png", + "https://github.com/mateuszbuda/brain-segmentation-pytorch/raw/master/assets/TCGA_CS_4944.png", "TCGA_CS_4944.png", ) try: @@ -156,9 +155,6 @@ def generate_model_unet_imgseg_torchhub_pytorch(variant): return model, [img_batch], {} -@pytest.mark.skip( - reason="Failed to download the model after multiple retries." 
-) # https://github.com/tenstorrent/tt-forge-fe/issues/515 def test_unet_torchhub_pytorch(test_device): model, inputs, _ = generate_model_unet_imgseg_torchhub_pytorch( "unet", diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_vgg.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_vgg.py index bb80712fc..e543f4848 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_vgg.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_vgg.py @@ -27,7 +27,10 @@ def test_vgg_osmr_pytorch(variant, test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + if variant == "bn_vgg19": + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE + else: + compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH model = download_model(ptcv_get_model, variant, pretrained=True) model.eval() @@ -59,7 +62,7 @@ def test_vgg_19_hf_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.GENERATE_INITIAL_GRAPH """ # https://pypi.org/project/vgg-pytorch/ @@ -129,7 +132,7 @@ def test_vgg_bn19_torchhub_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.GENERATE_INITIAL_GRAPH model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "vgg19_bn", pretrained=True) model.eval() diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_vit.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_vit.py index f73817637..4507a31c9 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_vit.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_vit.py @@ -36,8 +36,6 @@ def generate_model_vit_imgcls_hf_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants, ids=variants) def test_vit_classify_224_hf_pytorch(variant, test_device): - compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH model, inputs, _ = generate_model_vit_imgcls_hf_pytorch( test_device, variant, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py index c9a4b7ec4..7a89ee968 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py @@ -107,7 +107,6 @@ def test_vovnet_v1_39_stigma_pytorch(test_device, enable_default_dram_parameters None, ) - compiler_cfg = forge.config._get_global_compiler_config() compiled_model = forge.compile(model, sample_inputs=[inputs[0]], module_name=f"pt_vovnet_39_stigma") diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolox.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolox.py index 21b3322e5..bfa809dd2 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolox.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolox.py @@ -57,7 +57,7 @@ def test_yolox_pytorch(variant, test_device): # Set PyBuda configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth 
= forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.INIT_COMPILE # prepare model weight_name = f"{variant}.pth" diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_llama3.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_llama3.py index 061e7a0a5..ff6f5914f 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_llama3.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_llama3.py @@ -113,7 +113,7 @@ def _update_causal_mask( def test_llama3_causal_lm(variant, test_device): # Configurations compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.INIT_COMPILE # Load model (with tokenizer) tokenizer = download_model(AutoTokenizer.from_pretrained, variant) diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_phi3.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_phi3.py index c27696217..ff30527d1 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_phi3.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_phi3.py @@ -63,7 +63,7 @@ def test_phi3_token_classification(variant, test_device): # Configurations compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # Phi3Config from pretrained variant, disable return_dict and caching. config = Phi3Config.from_pretrained(variant) @@ -96,7 +96,7 @@ def test_phi3_sequence_classification(variant, test_device): # Configurations compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # Phi3Config from pretrained variant, disable return_dict and caching. 
config = Phi3Config.from_pretrained(variant) diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_t5.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_t5.py index 2e5a977a4..fbf843264 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_t5.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_t5.py @@ -82,7 +82,13 @@ def forward(self, decoder_input_ids, encoder_outputs): def test_t5_generation(variant, test_device): compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = CompileDepth.POST_AUTOGRAD_PASS + + if variant in ["t5-small", "t5-base", "t5-large"]: + compiler_cfg.compile_depth = CompileDepth.FINISH_COMPILE + elif variant in ["google/flan-t5-small", "google/flan-t5-base"]: + compiler_cfg.compile_depth = CompileDepth.SPLIT_GRAPH + else: + compiler_cfg.compile_depth = CompileDepth.INIT_COMPILE # Load tokenizer and model from HuggingFace # Variants: t5-small, t5-base, t5-large diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py index 64642dfd4..f254eca9c 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py @@ -37,7 +37,7 @@ def generate_model_whisper_congen_hf_pytorch(test_device, variant): # Configurations compiler_cfg = _get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.POST_AUTOGRAD_PASS + compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH class Wrapper(torch.nn.Module): def __init__(self, model): diff --git a/forge/test/model_demos/models/wideresnet.py b/forge/test/model_demos/models/wideresnet.py index 712d29319..b812aab1a 100644 --- a/forge/test/model_demos/models/wideresnet.py +++ b/forge/test/model_demos/models/wideresnet.py @@ -17,7 +17,7 @@ def generate_model_wideresnet_imgcls_pytorch(test_device, variant): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from PyTorch model framework_model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", variant, pretrained=True) @@ -45,7 +45,7 @@ def generate_model_wideresnet_imgcls_timm(test_device, variant): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from PyTorch model framework_model = download_model(timm.create_model, variant, pretrained=True) diff --git a/forge/test/model_demos/models/xception.py b/forge/test/model_demos/models/xception.py index e9b9c496f..7a3b1559c 100644 --- a/forge/test/model_demos/models/xception.py +++ b/forge/test/model_demos/models/xception.py @@ -14,7 +14,7 @@ def generate_model_xception_imgcls_timm(test_device, variant): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() - compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH + compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE # STEP 2: Create Forge module from PyTorch model framework_model = download_model(timm.create_model, variant, pretrained=True) From 7972a6a63b9fe9bf7c3640822e2b1d9c8c4c16d1 Mon Sep 17 00:00:00 2001 From: Deepak Sudhakar Date: Thu, 7 Nov 2024 13:19:48 +0530 
Subject: [PATCH 10/18] Add pytest nightly marker for CI (#551) --- forge/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py | 2 ++ forge/test/model_demos/high_prio/cnn/onnx/test_dla.py | 1 + forge/test/model_demos/high_prio/cnn/onnx/test_fpn.py | 1 + forge/test/model_demos/high_prio/cnn/onnx/test_hardnet.py | 1 + .../model_demos/high_prio/cnn/onnx/test_lstm_genom.py | 1 + .../model_demos/high_prio/cnn/onnx/test_lstm_valence.py | 1 + .../model_demos/high_prio/cnn/onnx/test_perceiverio.py | 1 + .../test/model_demos/high_prio/cnn/onnx/test_retinanet.py | 2 ++ .../high_prio/cnn/onnx/test_segformer_imgcls.py | 1 + .../high_prio/cnn/onnx/test_segformer_semseg.py | 1 + forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v3.py | 2 ++ forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py | 3 +++ forge/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py | 1 + .../model_demos/high_prio/cnn/pytorch/test_alexnet.py | 2 ++ .../model_demos/high_prio/cnn/pytorch/test_autoencoder.py | 2 ++ .../model_demos/high_prio/cnn/pytorch/test_blazepose.py | 4 ++++ forge/test/model_demos/high_prio/cnn/pytorch/test_bts.py | 1 + forge/test/model_demos/high_prio/cnn/pytorch/test_clip.py | 1 + .../test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py | 2 ++ forge/test/model_demos/high_prio/cnn/pytorch/test_deit.py | 1 + .../model_demos/high_prio/cnn/pytorch/test_densenet.py | 4 ++++ forge/test/model_demos/high_prio/cnn/pytorch/test_dla.py | 1 + .../high_prio/cnn/pytorch/test_efficientnet.py | 2 ++ .../high_prio/cnn/pytorch/test_efficientnet_lite.py | 5 +++++ .../model_demos/high_prio/cnn/pytorch/test_fchardnet.py | 1 + forge/test/model_demos/high_prio/cnn/pytorch/test_fpn.py | 1 + .../model_demos/high_prio/cnn/pytorch/test_ghostnet.py | 1 + .../model_demos/high_prio/cnn/pytorch/test_googlenet.py | 1 + .../model_demos/high_prio/cnn/pytorch/test_hardnet.py | 1 + .../test/model_demos/high_prio/cnn/pytorch/test_hrnet.py | 2 ++ .../high_prio/cnn/pytorch/test_inception_v4.py | 2 ++ .../model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py | 1 + .../high_prio/cnn/pytorch/test_mobilenet_v1.py | 3 +++ .../high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py | 1 + .../high_prio/cnn/pytorch/test_mobilenet_v2.py | 6 ++++++ .../high_prio/cnn/pytorch/test_mobilenet_v3.py | 2 ++ .../model_demos/high_prio/cnn/pytorch/test_monodle.py | 1 + .../test/model_demos/high_prio/cnn/pytorch/test_nbeats.py | 3 +++ .../model_demos/high_prio/cnn/pytorch/test_openpose.py | 2 ++ .../model_demos/high_prio/cnn/pytorch/test_perceiverio.py | 1 + .../test/model_demos/high_prio/cnn/pytorch/test_pidnet.py | 1 + forge/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py | 1 + .../test/model_demos/high_prio/cnn/pytorch/test_resnet.py | 2 ++ .../model_demos/high_prio/cnn/pytorch/test_resnext.py | 7 +++++++ .../model_demos/high_prio/cnn/pytorch/test_retinanet.py | 1 + .../high_prio/cnn/pytorch/test_segformer_imgcls.py | 1 + .../high_prio/cnn/pytorch/test_segformer_semseg.py | 1 + .../high_prio/cnn/pytorch/test_ssd300_resnet50.py | 1 + .../high_prio/cnn/pytorch/test_stable_diffusion.py | 1 + forge/test/model_demos/high_prio/cnn/pytorch/test_swin.py | 1 + .../model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py | 1 + forge/test/model_demos/high_prio/cnn/pytorch/test_unet.py | 4 ++++ forge/test/model_demos/high_prio/cnn/pytorch/test_vgg.py | 4 ++++ forge/test/model_demos/high_prio/cnn/pytorch/test_vilt.py | 2 ++ forge/test/model_demos/high_prio/cnn/pytorch/test_vit.py | 1 + .../test/model_demos/high_prio/cnn/pytorch/test_vovnet.py | 4 ++++ 
.../model_demos/high_prio/cnn/pytorch/test_wideresnet.py | 2 ++ .../model_demos/high_prio/cnn/pytorch/test_xception.py | 1 + .../model_demos/high_prio/cnn/pytorch/test_yolo_v3.py | 2 ++ .../model_demos/high_prio/cnn/pytorch/test_yolo_v5.py | 4 ++++ .../model_demos/high_prio/cnn/pytorch/test_yolo_v6.py | 1 + .../test/model_demos/high_prio/cnn/pytorch/test_yolox.py | 1 + .../high_prio/cnn/tflite/test_efficientnet_lite.py | 7 +++++++ .../high_prio/cnn/tflite/test_hand_landmarker.py | 2 ++ .../high_prio/cnn/tflite/test_mobilenet_ssd.py | 1 + .../high_prio/cnn/tflite/test_pose_landmark.py | 4 ++++ .../test/model_demos/high_prio/nlp/pytorch/test_albert.py | 2 ++ forge/test/model_demos/high_prio/nlp/pytorch/test_bart.py | 1 + forge/test/model_demos/high_prio/nlp/pytorch/test_bert.py | 4 ++++ .../model_demos/high_prio/nlp/pytorch/test_codegen.py | 1 + .../model_demos/high_prio/nlp/pytorch/test_distilbert.py | 4 ++++ forge/test/model_demos/high_prio/nlp/pytorch/test_dpr.py | 3 +++ .../test/model_demos/high_prio/nlp/pytorch/test_falcon.py | 1 + .../model_demos/high_prio/nlp/pytorch/test_fuyu_8b.py | 2 ++ .../model_demos/high_prio/nlp/pytorch/test_gemma_2b.py | 8 ++++++++ forge/test/model_demos/high_prio/nlp/pytorch/test_gpt2.py | 2 ++ .../test/model_demos/high_prio/nlp/pytorch/test_gptneo.py | 2 ++ .../test/model_demos/high_prio/nlp/pytorch/test_llama3.py | 2 ++ .../model_demos/high_prio/nlp/pytorch/test_mistral.py | 4 ++++ forge/test/model_demos/high_prio/nlp/pytorch/test_opt.py | 3 +++ forge/test/model_demos/high_prio/nlp/pytorch/test_phi2.py | 3 +++ forge/test/model_demos/high_prio/nlp/pytorch/test_phi3.py | 3 +++ forge/test/model_demos/high_prio/nlp/pytorch/test_qwen.py | 2 ++ .../model_demos/high_prio/nlp/pytorch/test_roberta.py | 2 ++ .../model_demos/high_prio/nlp/pytorch/test_squeezebert.py | 1 + forge/test/model_demos/high_prio/nlp/pytorch/test_t5.py | 6 ++++++ .../model_demos/high_prio/nlp/pytorch/test_whisper_0.py | 3 +++ .../model_demos/high_prio/nlp/pytorch/test_whisper_1.py | 3 +++ forge/test/model_demos/high_prio/nlp/pytorch/test_xglm.py | 1 + 89 files changed, 193 insertions(+) diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py b/forge/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py index 3354ff962..6b4bf3b49 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py @@ -16,6 +16,7 @@ @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_ddrnet(variant, test_device): # STEP 1: Set Forge configuration parameters @@ -76,6 +77,7 @@ def test_ddrnet(variant, test_device): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_ddrnet_semantic_segmentation_onnx(variant, test_device): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_dla.py b/forge/test/model_demos/high_prio/cnn/onnx/test_dla.py index 77227bb59..8bc8550b3 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_dla.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_dla.py @@ -29,6 +29,7 @@ @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_dla_onnx(test_device, variant): compiler_cfg = forge.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_fpn.py b/forge/test/model_demos/high_prio/cnn/onnx/test_fpn.py index 98cbac273..aaac9de1d 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_fpn.py +++ 
b/forge/test/model_demos/high_prio/cnn/onnx/test_fpn.py @@ -9,6 +9,7 @@ from forge import VerifyConfig +@pytest.mark.nightly def test_fpn_onnx(test_device, test_kind): compiler_cfg = forge.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_hardnet.py b/forge/test/model_demos/high_prio/cnn/onnx/test_hardnet.py index 619ef31a2..49f70a0a9 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_hardnet.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_hardnet.py @@ -16,6 +16,7 @@ @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_hardnet_onnx(variant, test_device): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_lstm_genom.py b/forge/test/model_demos/high_prio/cnn/onnx/test_lstm_genom.py index c5221648f..fdf77b029 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_lstm_genom.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_lstm_genom.py @@ -14,6 +14,7 @@ from test.utils import download_model +@pytest.mark.nightly def test_lstm_genom_onnx(test_device): load_path = "third_party/confidential_customer_models/model_2/onnx/lstm_genom/lstm-genom-model.onnx" model = onnx.load(load_path) diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_lstm_valence.py b/forge/test/model_demos/high_prio/cnn/onnx/test_lstm_valence.py index 0b8d495ad..07c8c4813 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_lstm_valence.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_lstm_valence.py @@ -13,6 +13,7 @@ from forge.verify.config import TestKind +@pytest.mark.nightly def test_lstm_valence_onnx(test_device): # Load model checkpoint from HuggingFace load_path = "third_party/confidential_customer_models/model_2/onnx/lstm_valence/lstm-valence-model.onnx" diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py b/forge/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py index 2c83b8b29..eeb3d8e09 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py @@ -33,6 +33,7 @@ def get_sample_data(model_name): "deepmind/vision-perceiver-fourier", ], ) +@pytest.mark.nightly def test_perceiver_for_image_classification_onnx(test_device, model_name): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_retinanet.py b/forge/test/model_demos/high_prio/cnn/onnx/test_retinanet.py index 074f4b492..22d6c51d5 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_retinanet.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_retinanet.py @@ -49,6 +49,7 @@ def img_preprocess(scal_val=1): ######### +@pytest.mark.nightly def test_retinanet_r101_640x480_onnx(test_device): os.environ["FORGE_DECOMPOSE_SIGMOID"] = "1" os.environ["FORGE_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" @@ -120,6 +121,7 @@ def img_preprocessing(): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_retinanet_onnx(variant, test_device): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py b/forge/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py index 5c69f7231..7f9b9e112 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py @@ -34,6 +34,7 @@ def get_sample_data(model_name): @pytest.mark.parametrize("variant", 
variants_img_classification) +@pytest.mark.nightly def test_segformer_image_classification_onnx(test_device, variant): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_segformer_semseg.py b/forge/test/model_demos/high_prio/cnn/onnx/test_segformer_semseg.py index 55e3e4514..6c0b83d7c 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_segformer_semseg.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_segformer_semseg.py @@ -33,6 +33,7 @@ def get_sample_data(model_name): @pytest.mark.parametrize("variant", variants_semseg) +@pytest.mark.nightly def test_segformer_semantic_segmentation_onnx(test_device, variant): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v3.py b/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v3.py index a198ed0c8..14ff21409 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v3.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v3.py @@ -46,6 +46,7 @@ def preprocess(img): @pytest.mark.skip(reason="While loop in model, not supported yet") +@pytest.mark.nightly def test_yolov3_tiny_onnx(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object @@ -78,6 +79,7 @@ def test_yolov3_tiny_onnx(test_device): @pytest.mark.skip(reason="While loop in model, not supported yet") +@pytest.mark.nightly def test_yolov3_onnx(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py b/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py index 8f486a147..ca930d147 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py @@ -58,6 +58,7 @@ def data_preprocessing(ims: Image.Image, size: tuple) -> tuple: @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_yolo_v5_320x320_onnx(test_device, variant): # forge configuration parameters @@ -103,6 +104,7 @@ def test_yolo_v5_320x320_onnx(test_device, variant): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_yolo_v5_480x480_onnx(test_device, variant): # forge configuration parameters @@ -183,6 +185,7 @@ def test_yolo_v5_480x480_onnx(test_device, variant): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_yolo_v5_640x640_onnx(test_device, variant): # forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py b/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py index 28f30dd86..b58307a8a 100644 --- a/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py +++ b/forge/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py @@ -38,6 +38,7 @@ def preprocess(img, input_size, swap=(2, 0, 1)): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_yolox_onnx(variant, test_device): # forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_alexnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_alexnet.py index ddbebcbaf..74262a761 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_alexnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_alexnet.py @@ -13,6 +13,7 @@ import os +@pytest.mark.nightly def test_alexnet_torchhub(test_device): # Configurations compiler_cfg = 
forge.config._get_global_compiler_config() @@ -45,6 +46,7 @@ def test_alexnet_torchhub(test_device): compiled_model = forge.compile(framework_model, sample_inputs=inputs, module_name="pt_alexnet_torchhub") +@pytest.mark.nightly def test_alexnet_osmr(test_device): # Configurations compiler_cfg = forge.config._get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_autoencoder.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_autoencoder.py index cbc92461d..771471958 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_autoencoder.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_autoencoder.py @@ -87,6 +87,7 @@ def forward(self, x): return act +@pytest.mark.nightly def test_conv_ae_pytorch(test_device): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() @@ -113,6 +114,7 @@ def test_conv_ae_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=[sample_tensor], module_name="pt_conv_ae") +@pytest.mark.nightly def test_linear_ae_pytorch(test_device): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py index bcf04d32d..68a4e1c09 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py @@ -20,6 +20,7 @@ @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_blazepose_detector_pytorch(test_device): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() @@ -41,6 +42,7 @@ def test_blazepose_detector_pytorch(test_device): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_blazepose_regressor_pytorch(test_device): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() @@ -54,6 +56,7 @@ def test_blazepose_regressor_pytorch(test_device): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_blaze_palm_pytorch(test_device): # Set Forge configuration parameters @@ -77,6 +80,7 @@ def test_blaze_palm_pytorch(test_device): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_blaze_hand_pytorch(test_device): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_bts.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_bts.py index ce103aaae..7dbaa4f20 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_bts.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_bts.py @@ -22,6 +22,7 @@ @pytest.mark.skip(reason="dependent on CCM repo") @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_bts_pytorch(test_device, variant): # Set PyBuda configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_clip.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_clip.py index e62aedbc0..9416ad5c9 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_clip.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_clip.py @@ -99,6 +99,7 @@ def forward(self, input_ids, vision_outputs, last_hidden_state, *encoder_outputs return output +@pytest.mark.nightly def test_clip_pytorch(test_device): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py 
b/forge/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py index 15cb14f60..99e105542 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py @@ -20,6 +20,7 @@ @pytest.mark.skip(reason="dependent on CCM repo") @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_ddrnet_pytorch(variant, test_device): # STEP 1: Set Forge configuration parameters @@ -69,6 +70,7 @@ def test_ddrnet_pytorch(variant, test_device): @pytest.mark.skip(reason="dependent on CCM repo") @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_ddrnet_semantic_segmentation_pytorch(variant, test_device): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_deit.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_deit.py index 2c12ce199..3eccb6a85 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_deit.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_deit.py @@ -15,6 +15,7 @@ @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_vit_base_classify_224_hf_pytorch(variant, test_device): model, inputs, _ = generate_model_deit_imgcls_hf_pytorch( variant, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_densenet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_densenet.py index 08239562d..ebc01710f 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_densenet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_densenet.py @@ -82,6 +82,7 @@ def get_input_img_hf_xray(): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_densenet_121_pytorch(variant, test_device): # STEP 1: Set Forge configuration parameters @@ -104,6 +105,7 @@ def test_densenet_121_pytorch(variant, test_device): compiled_model = forge.compile(model, sample_inputs=inputs, module_name=f"pt_{variant_name}") +@pytest.mark.nightly def test_densenet_161_pytorch(test_device): # STEP 1: Set Forge configuration parameters @@ -120,6 +122,7 @@ def test_densenet_161_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=inputs, module_name="pt_densenet_161") +@pytest.mark.nightly def test_densenet_169_pytorch(test_device): # STEP 1: Set Forge configuration parameters @@ -136,6 +139,7 @@ def test_densenet_169_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=inputs, module_name="pt_densenet_169") +@pytest.mark.nightly def test_densenet_201_pytorch(test_device): # STEP 1: Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_dla.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_dla.py index a5de8a808..e00da85bd 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_dla.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_dla.py @@ -37,6 +37,7 @@ @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_dla_pytorch(variant, test_device): # Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py index bf7dcc0cd..93ba7fcdf 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py @@ -32,6 +32,7 @@ @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def 
test_efficientnet_timm(variant, test_device): # Configuration @@ -81,6 +82,7 @@ def get_state_dict(self, *args, **kwargs): ] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants) def test_efficientnet_torchvision(variant, test_device): # Configuration diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py index adf33b7fb..ad25b2e02 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py @@ -32,6 +32,7 @@ def get_image_tensor(wh): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_efficientnet_lite_0_pytorch(): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() @@ -50,6 +51,7 @@ def test_efficientnet_lite_0_pytorch(): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_efficientnet_lite_1_pytorch(test_device): # STEP 1: Set Forge configuration parameters @@ -70,6 +72,7 @@ def test_efficientnet_lite_1_pytorch(test_device): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_efficientnet_lite_2_pytorch(test_device): # STEP 1: Set Forge configuration parameters @@ -89,6 +92,7 @@ def test_efficientnet_lite_2_pytorch(test_device): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_efficientnet_lite_3_pytorch(test_device): # STEP 1: Set Forge configuration parameters @@ -108,6 +112,7 @@ def test_efficientnet_lite_3_pytorch(test_device): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_efficientnet_lite_4_pytorch(test_device): # STEP 1: Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_fchardnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_fchardnet.py index 1c9bc7cc7..1c88359f6 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_fchardnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_fchardnet.py @@ -15,6 +15,7 @@ @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_fchardnet(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = forge.config._get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_fpn.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_fpn.py index 334cca227..887c51c47 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_fpn.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_fpn.py @@ -25,6 +25,7 @@ def forward(self, feat0, feat1, feat2): return self.fpn(x) +@pytest.mark.nightly def test_fpn_pytorch(test_device): compiler_cfg = forge.config._get_global_compiler_config() compiler_cfg.compile_depth = forge.CompileDepth.SPLIT_GRAPH diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_ghostnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_ghostnet.py index 9b721afb0..221458731 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_ghostnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_ghostnet.py @@ -9,6 +9,7 @@ @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_ghostnet_timm(variant, test_device): model, inputs = generate_model_ghostnet_imgcls_timm(variant) compiled_model = forge.compile(model, sample_inputs=inputs, module_name=f"pt_{variant}") diff --git 
a/forge/test/model_demos/high_prio/cnn/pytorch/test_googlenet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_googlenet.py index d49380c0e..76d9e2323 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_googlenet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_googlenet.py @@ -10,6 +10,7 @@ import os +@pytest.mark.nightly def test_googlenet_pytorch(test_device): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py index 6d0abf0a8..32309656d 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py @@ -14,6 +14,7 @@ @pytest.mark.skip(reason="dependent on CCM repo") @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_hardnet_pytorch(test_device, variant): # STEP 1: Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_hrnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_hrnet.py index 8f7e0b1d7..0a8268037 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_hrnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_hrnet.py @@ -83,6 +83,7 @@ def generate_model_hrnet_imgcls_osmr_pytorch(variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_hrnet_osmr_pytorch(test_device, variant): model, inputs, _ = generate_model_hrnet_imgcls_osmr_pytorch( variant, @@ -147,6 +148,7 @@ def generate_model_hrnet_imgcls_timm_pytorch(variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_hrnet_timm_pytorch(test_device, variant): model, inputs, _ = generate_model_hrnet_imgcls_timm_pytorch( variant, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py index 797e1d0a3..3a6679cef 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py @@ -76,6 +76,7 @@ def get_image(): return img_tensor +@pytest.mark.nightly def test_inception_v4_osmr_pytorch(test_device): model, inputs = generate_model_inceptionV4_imgcls_osmr_pytorch("inceptionv4") compiled_model = forge.compile(model, sample_inputs=inputs, module_name="pt_osmr_inception_v4") @@ -91,6 +92,7 @@ def generate_model_inceptionV4_imgcls_timm_pytorch(variant): return framework_model, [img_tensor] +@pytest.mark.nightly def test_inception_v4_timm_pytorch(test_device): model, inputs = generate_model_inceptionV4_imgcls_timm_pytorch("inception_v4") diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py index 8e2eb182e..fdd47b6f3 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py @@ -37,6 +37,7 @@ @pytest.mark.parametrize("variant", varaints, ids=varaints) +@pytest.mark.nightly def test_mlp_mixer_timm_pytorch(variant, test_device): model = download_model(timm.create_model, variant, pretrained=True) diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py index d90074594..804a8c744 100644 --- 
a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py @@ -148,6 +148,7 @@ def generate_model_mobilenetV1_base_custom_pytorch(test_device, variant): return model, [image_tensor], {} +@pytest.mark.nightly def test_mobilenetv1_basic(test_device): model, inputs, _ = generate_model_mobilenetV1_base_custom_pytorch( test_device, @@ -182,6 +183,7 @@ def generate_model_mobilenetv1_imgcls_hf_pytorch(test_device, variant): return model, [image_tensor], {} +@pytest.mark.nightly def test_mobilenetv1_192(test_device): model, inputs, _ = generate_model_mobilenetv1_imgcls_hf_pytorch( test_device, @@ -209,6 +211,7 @@ def generate_model_mobilenetV1I224_imgcls_hf_pytorch(test_device, variant): return model, [image_tensor], {} +@pytest.mark.nightly def test_mobilenetv1_224(test_device): model, inputs, _ = generate_model_mobilenetV1I224_imgcls_hf_pytorch( test_device, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py index 838a543a7..06873634c 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py @@ -13,6 +13,7 @@ @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_mobilenet_v1_ssd_pytorch_1x1(test_device): # STEP 1: Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py index 798683e37..1769cf95b 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py @@ -35,6 +35,7 @@ def generate_model_mobilenetV2_imgcls_torchhub_pytorch(test_device, variant): return model, [image_tensor], {} +@pytest.mark.nightly def test_mobilenetv2_basic(test_device): model, inputs, _ = generate_model_mobilenetV2_imgcls_torchhub_pytorch( test_device, @@ -60,6 +61,7 @@ def generate_model_mobilenetV2I96_imgcls_hf_pytorch(test_device, variant): return model, [image_tensor], {} +@pytest.mark.nightly def test_mobilenetv2_96(test_device): model, inputs, _ = generate_model_mobilenetV2I96_imgcls_hf_pytorch( test_device, @@ -85,6 +87,7 @@ def generate_model_mobilenetV2I160_imgcls_hf_pytorch(test_device, variant): return model, [image_tensor], {} +@pytest.mark.nightly def test_mobilenetv2_160(test_device): model, inputs, _ = generate_model_mobilenetV2I160_imgcls_hf_pytorch( test_device, @@ -112,6 +115,7 @@ def generate_model_mobilenetV2I244_imgcls_hf_pytorch(test_device, variant): return model, [image_tensor], {} +@pytest.mark.nightly def test_mobilenetv2_224(test_device): model, inputs, _ = generate_model_mobilenetV2I244_imgcls_hf_pytorch( test_device, @@ -148,6 +152,7 @@ def generate_model_mobilenetV2_imgcls_timm_pytorch(test_device, variant): return model, [image_tensor], {} +@pytest.mark.nightly def test_mobilenetv2_timm(test_device): model, inputs, _ = generate_model_mobilenetV2_imgcls_timm_pytorch( test_device, @@ -192,6 +197,7 @@ def generate_model_mobilenetV2_semseg_hf_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_mobilenetv2_deeplabv3(variant, test_device): model, inputs, _ = generate_model_mobilenetV2_semseg_hf_pytorch( test_device, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v3.py 
b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v3.py index 1426252fc..b3db18b2f 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v3.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v3.py @@ -37,6 +37,7 @@ def generate_model_mobilenetV3_imgcls_torchhub_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_mobilenetv3_basic(variant, test_device): model, inputs, _ = generate_model_mobilenetV3_imgcls_torchhub_pytorch( test_device, @@ -81,6 +82,7 @@ def generate_model_mobilenetV3_imgcls_timm_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_mobilenetv3_timm(variant, test_device): model, inputs, _ = generate_model_mobilenetV3_imgcls_timm_pytorch( test_device, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_monodle.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_monodle.py index 5d8a8dd06..77e2da30b 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_monodle.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_monodle.py @@ -9,6 +9,7 @@ import os +@pytest.mark.nightly def test_monodle_pytorch(test_device): # PyBuda configuration parameters compiler_cfg = forge.config._get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_nbeats.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_nbeats.py index 008938196..b59c6c12c 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_nbeats.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_nbeats.py @@ -15,6 +15,7 @@ ) +@pytest.mark.nightly def test_nbeats_with_seasonality_basis(test_device): compiler_cfg = forge.config._get_global_compiler_config() compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE @@ -33,6 +34,7 @@ def test_nbeats_with_seasonality_basis(test_device): compiled_model = forge.compile(pytorch_model, sample_inputs=[x, x_mask], module_name="nbeats_seasonality") +@pytest.mark.nightly def test_nbeats_with_generic_basis(test_device): compiler_cfg = forge.config._get_global_compiler_config() compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE @@ -45,6 +47,7 @@ def test_nbeats_with_generic_basis(test_device): compiled_model = forge.compile(pytorch_model, sample_inputs=[x, x_mask], module_name="nbeats_generic") +@pytest.mark.nightly def test_nbeats_with_trend_basis(test_device): compiler_cfg = forge.config._get_global_compiler_config() compiler_cfg.compile_depth = forge.CompileDepth.FINISH_COMPILE diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_openpose.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_openpose.py index 827f07bad..3c539ea6f 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_openpose.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_openpose.py @@ -314,6 +314,7 @@ def generate_model_openpose_posdet_custom_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants) @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_openpose_basic(variant, test_device): model, inputs, _ = generate_model_openpose_posdet_custom_pytorch( test_device, @@ -349,6 +350,7 @@ def generate_model_openpose_posdet_osmr_pytorch(test_device, variant): @pytest.mark.skip(reason="dependent on CCM repo") @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_openpose_osmr(variant, test_device): model, inputs, _ = 
generate_model_openpose_posdet_osmr_pytorch( test_device, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py index f844ee4ab..f3ac619d9 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py @@ -40,6 +40,7 @@ def get_sample_data(model_name): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_perceiverio_for_image_classification_pytorch(test_device, variant): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py index 8b9b87963..bc7344b06 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_pidnet.py @@ -17,6 +17,7 @@ @pytest.mark.skip(reason="dependent on CCM repo") @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_pidnet_pytorch(variant, test_device): # STEP 1: Set PyBuda configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py index d3ce463a3..b7eec5476 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py @@ -10,6 +10,7 @@ import os +@pytest.mark.nightly def test_rcnn_pytorch(test_device): # Load Alexnet Model diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_resnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_resnet.py index 82995033e..3435b3030 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_resnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_resnet.py @@ -41,6 +41,7 @@ def generate_model_resnet_imgcls_hf_pytorch(variant): return model, [pixel_values], {} +@pytest.mark.nightly def test_resnet(test_device): model, inputs, _ = generate_model_resnet_imgcls_hf_pytorch( @@ -76,6 +77,7 @@ def generate_model_resnet_imgcls_timm_pytorch(variant): return model, [pixel_values], {} +@pytest.mark.nightly def test_resnet_timm(test_device): model, inputs, _ = generate_model_resnet_imgcls_timm_pytorch( "resnet50", diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_resnext.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_resnext.py index d0ee7f4a3..2925573df 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_resnext.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_resnext.py @@ -35,6 +35,7 @@ def get_image_tensor(): return input_batch +@pytest.mark.nightly def test_resnext_50_torchhub_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object @@ -52,6 +53,7 @@ def test_resnext_50_torchhub_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=[input_batch], module_name="pt_resnext50_torchhub") +@pytest.mark.nightly def test_resnext_101_torchhub_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object @@ -69,6 +71,7 @@ def test_resnext_101_torchhub_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=[input_batch], module_name="pt_resnext101_torchhub") +@pytest.mark.nightly def test_resnext_101_32x8d_fb_wsl_pytorch(test_device): # STEP 1: Set Forge configuration 
parameters @@ -88,6 +91,7 @@ def test_resnext_101_32x8d_fb_wsl_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=[input_batch], module_name="pt_resnext101_fb_wsl") +@pytest.mark.nightly def test_resnext_14_osmr_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object @@ -106,6 +110,7 @@ def test_resnext_14_osmr_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=[input_batch], module_name="pt_resnext14_osmr") +@pytest.mark.nightly def test_resnext_26_osmr_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object @@ -123,6 +128,7 @@ def test_resnext_26_osmr_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=[input_batch], module_name="pt_resnext26_osmr") +@pytest.mark.nightly def test_resnext_50_osmr_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object @@ -140,6 +146,7 @@ def test_resnext_50_osmr_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=[input_batch], module_name="pt_resnext50_osmr") +@pytest.mark.nightly def test_resnext_101_osmr_pytorch(test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() # load global compiler config object diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_retinanet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_retinanet.py index d288591d9..1388a0dc2 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_retinanet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_retinanet.py @@ -40,6 +40,7 @@ def img_preprocess(): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_retinanet(variant, test_device): # Set PyBuda configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py index 6caaf5ab6..c8e2f8d87 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py @@ -35,6 +35,7 @@ def get_sample_data(model_name): @pytest.mark.parametrize("variant", variants_img_classification) +@pytest.mark.nightly def test_segformer_image_classification_pytorch(test_device, variant): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py index 71fd61bf8..0017fb1d1 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py @@ -32,6 +32,7 @@ def get_sample_data(model_name): @pytest.mark.parametrize("variant", variants_semseg) +@pytest.mark.nightly def test_segformer_semantic_segmentation_pytorch(test_device, variant): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py index 19a191a49..c7fc3853a 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py @@ -55,6 +55,7 @@ def 
prepare_input(img_uri): return img +@pytest.mark.nightly def test_pytorch_ssd300_resnet50(test_device): # STEP 1 : Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_stable_diffusion.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_stable_diffusion.py index 915db395d..aa72a7b50 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_stable_diffusion.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_stable_diffusion.py @@ -217,6 +217,7 @@ def stable_diffusion_postprocessing( @pytest.mark.skip(reason="unsupported for now") +@pytest.mark.nightly def test_stable_diffusion_pytorch(variant="CompVis/stable-diffusion-v1-4", batch_size=1): # Set inference steps diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_swin.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_swin.py index 356d42758..dd8ba772e 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_swin.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_swin.py @@ -14,6 +14,7 @@ image = Image.open(requests.get(url, stream=True).raw) +@pytest.mark.nightly def test_swin_v1_tiny_4_224_hf_pytorch(test_device): # pytest.skip() # Working on it # STEP 1: Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py index 001367480..6ddaa94e8 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py @@ -15,6 +15,7 @@ @pytest.mark.skip(reason="dependent on CCM repo and Hang observed at post_initial_graph_pass") +@pytest.mark.nightly def test_tri_basic_2_sematic_segmentation_pytorch(test_device): # Set PyBuda configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_unet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_unet.py index 6855e1403..5d72cceb6 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_unet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_unet.py @@ -30,6 +30,7 @@ def generate_model_unet_imgseg_osmr_pytorch(variant): return model, [img_tensor], {} +@pytest.mark.nightly def test_unet_osmr_cityscape_pytorch(test_device): model, inputs, _ = generate_model_unet_imgseg_osmr_pytorch( "unet_cityscapes", @@ -65,6 +66,7 @@ def get_imagenet_sample(): @pytest.mark.skip(reason="Model script not found") +@pytest.mark.nightly def test_unet_holocron_pytorch(test_device): from holocron.models.segmentation.unet import unet_tvvgg11 @@ -109,6 +111,7 @@ def generate_model_unet_imgseg_smp_pytorch(variant): return model, [img_tensor], {} +@pytest.mark.nightly def test_unet_qubvel_pytorch(test_device): model, inputs, _ = generate_model_unet_imgseg_smp_pytorch( None, @@ -155,6 +158,7 @@ def generate_model_unet_imgseg_torchhub_pytorch(variant): return model, [img_batch], {} +@pytest.mark.nightly def test_unet_torchhub_pytorch(test_device): model, inputs, _ = generate_model_unet_imgseg_torchhub_pytorch( "unet", diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_vgg.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_vgg.py index e543f4848..b3db7c046 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_vgg.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_vgg.py @@ -24,6 +24,7 @@ @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_vgg_osmr_pytorch(variant, test_device): # STEP 1: Set Forge configuration parameters compiler_cfg = 
forge.config._get_global_compiler_config() # load global compiler config object @@ -58,6 +59,7 @@ def test_vgg_osmr_pytorch(variant, test_device): compiled_model = forge.compile(model, sample_inputs=[input_batch], module_name=f"pt_{variant}_osmr") +@pytest.mark.nightly def test_vgg_19_hf_pytorch(test_device): # STEP 1: Set Forge configuration parameters @@ -116,6 +118,7 @@ def preprocess_timm_model(model_name): return model, img_tensor +@pytest.mark.nightly def test_vgg_bn19_timm_pytorch(test_device): torch.multiprocessing.set_sharing_strategy("file_system") model_name = "vgg19_bn" @@ -128,6 +131,7 @@ def test_vgg_bn19_timm_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=[image_tensor], module_name=f"pt_{model_name}_timm") +@pytest.mark.nightly def test_vgg_bn19_torchhub_pytorch(test_device): # STEP 1: Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_vilt.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_vilt.py index dac26567e..5a499e4c3 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_vilt.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_vilt.py @@ -131,6 +131,7 @@ def generate_model_vilt_question_answering_hf_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_vilt_question_answering_hf_pytorch(variant, test_device): model, inputs, _ = generate_model_vilt_question_answering_hf_pytorch( test_device, @@ -174,6 +175,7 @@ def generate_model_vilt_maskedlm_hf_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_vilt_maskedlm_hf_pytorch(variant, test_device): model, inputs, _ = generate_model_vilt_maskedlm_hf_pytorch( test_device, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_vit.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_vit.py index 4507a31c9..f7c63ab52 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_vit.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_vit.py @@ -35,6 +35,7 @@ def generate_model_vit_imgcls_hf_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_vit_classify_224_hf_pytorch(variant, test_device): model, inputs, _ = generate_model_vit_imgcls_hf_pytorch( test_device, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py index 7a89ee968..b4c2be213 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py @@ -58,6 +58,7 @@ def generate_model_vovnet_imgcls_osmr_pytorch(test_device, variant): @pytest.mark.parametrize("variant", varaints, ids=varaints) +@pytest.mark.nightly def test_vovnet_osmr_pytorch(variant, test_device): model, inputs, _ = generate_model_vovnet_imgcls_osmr_pytorch( test_device, @@ -101,6 +102,7 @@ def generate_model_vovnet39_imgcls_stigma_pytorch(test_device, variant): @pytest.mark.parametrize("enable_default_dram_parameters", [True, False]) +@pytest.mark.nightly def test_vovnet_v1_39_stigma_pytorch(test_device, enable_default_dram_parameters): model, inputs, _ = generate_model_vovnet39_imgcls_stigma_pytorch( test_device, @@ -124,6 +126,7 @@ def generate_model_vovnet57_imgcls_stigma_pytorch(test_device, variant): return model, [image_tensor], {} +@pytest.mark.nightly def test_vovnet_v1_57_stigma_pytorch(test_device): model, inputs, _ = 
generate_model_vovnet57_imgcls_stigma_pytorch( test_device, @@ -165,6 +168,7 @@ def generate_model_vovnet_imgcls_timm_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_vovnet_timm_pytorch(variant, test_device): model, inputs, _ = generate_model_vovnet_imgcls_timm_pytorch( test_device, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_wideresnet.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_wideresnet.py index 837351d2a..b2117b584 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_wideresnet.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_wideresnet.py @@ -12,6 +12,7 @@ @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_wideresnet_pytorch(variant, test_device): (model, inputs,) = generate_model_wideresnet_imgcls_pytorch( test_device, @@ -25,6 +26,7 @@ def test_wideresnet_pytorch(variant, test_device): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_wideresnet_timm(variant, test_device): (model, inputs,) = generate_model_wideresnet_imgcls_timm( test_device, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_xception.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_xception.py index e3fe778ac..ac115a1d4 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_xception.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_xception.py @@ -9,6 +9,7 @@ @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_xception_timm(variant, test_device): (model, inputs,) = generate_model_xception_imgcls_timm( diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py index 2544b3ef7..eef6ef18b 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py @@ -36,6 +36,7 @@ def generate_model_yolotinyV3_imgcls_holli_pytorch(test_device, variant): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_yolov3_tiny_holli_pytorch(test_device): model, inputs, _ = generate_model_yolotinyV3_imgcls_holli_pytorch( test_device, @@ -67,6 +68,7 @@ def generate_model_yoloV3_imgcls_holli_pytorch(test_device, variant): @pytest.mark.skip(reason="dependent on CCM repo") +@pytest.mark.nightly def test_yolov3_holli_pytorch(test_device): model, inputs, other = generate_model_yoloV3_imgcls_holli_pytorch( test_device, diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py index e1d94ec87..123f68dce 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py @@ -25,6 +25,7 @@ def generate_model_yoloV5I320_imgcls_torchhub_pytorch(test_device, variant, size @pytest.mark.parametrize("size", size, ids=["yolov5" + s for s in size]) +@pytest.mark.nightly def test_yolov5_320x320(test_device, size): model, inputs, _ = generate_model_yoloV5I320_imgcls_torchhub_pytorch( test_device, @@ -53,6 +54,7 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size @pytest.mark.parametrize("size", size, ids=["yolov5" + s for s in size]) +@pytest.mark.nightly def test_yolov5_640x640(test_device, size): model, inputs, _ = generate_model_yoloV5I640_imgcls_torchhub_pytorch( @@ -77,6 +79,7 @@ def 
generate_model_yoloV5I480_imgcls_torchhub_pytorch(test_device, variant, size @pytest.mark.parametrize("size", size, ids=["yolov5" + s for s in size]) +@pytest.mark.nightly def test_yolov5_480x480(test_device, size): model, inputs, _ = generate_model_yoloV5I480_imgcls_torchhub_pytorch( @@ -89,6 +92,7 @@ def test_yolov5_480x480(test_device, size): compiled_model = forge.compile(model, sample_inputs=inputs, module_name="pt_" + name + "_480x480") +@pytest.mark.nightly def test_yolov5_1280x1280(test_device): compiler_cfg = forge.config._get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py index 5edbdd5a0..34bf63dd4 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py @@ -83,6 +83,7 @@ def process_image(path, img_size, stride, half): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_yolo_v6_pytorch(variant, test_device): # STEP 1 : Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolox.py b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolox.py index bfa809dd2..d9f595099 100644 --- a/forge/test/model_demos/high_prio/cnn/pytorch/test_yolox.py +++ b/forge/test/model_demos/high_prio/cnn/pytorch/test_yolox.py @@ -53,6 +53,7 @@ def preprocess(img, input_size, swap=(2, 0, 1)): @pytest.mark.parametrize("variant", variants) +@pytest.mark.nightly def test_yolox_pytorch(variant, test_device): # Set PyBuda configuration parameters diff --git a/forge/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py b/forge/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py index 9c5c40fd2..b3f8d1f7a 100644 --- a/forge/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py +++ b/forge/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py @@ -7,6 +7,7 @@ import forge +@pytest.mark.nightly @pytest.mark.skip(reason="dependent on CCM repo") def test_efficientnet_lite0_1x1(test_device): compiler_cfg = _get_global_compiler_config() @@ -15,6 +16,7 @@ def test_efficientnet_lite0_1x1(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="dependent on CCM repo") def test_efficientnet_lite4_1x1(test_device): compiler_cfg = _get_global_compiler_config() @@ -23,6 +25,7 @@ def test_efficientnet_lite4_1x1(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="dependent on CCM repo") def test_efficientnet_lite0(test_device): compiler_cfg = _get_global_compiler_config() @@ -31,6 +34,7 @@ def test_efficientnet_lite0(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_efficientnet_lite1(test_device): compiler_cfg = _get_global_compiler_config() @@ -39,6 +43,7 @@ def test_efficientnet_lite1(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_efficientnet_lite2(test_device): compiler_cfg = _get_global_compiler_config() @@ -47,6 +52,7 @@ def test_efficientnet_lite2(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def 
test_efficientnet_lite3(test_device): compiler_cfg = _get_global_compiler_config() @@ -55,6 +61,7 @@ def test_efficientnet_lite3(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_efficientnet_lite4(test_device): compiler_cfg = _get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py b/forge/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py index 1fffd7449..2c9d0b1a4 100644 --- a/forge/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py +++ b/forge/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py @@ -7,6 +7,7 @@ import forge +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_hand_landmark_lite_1x1(test_device): tflite_path = "third_party/confidential_customer_models/model_2/tflite/hand_landmark_lite.tflite" @@ -14,6 +15,7 @@ def test_hand_landmark_lite_1x1(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_palm_detection_lite_1x1(test_device): compiler_cfg = _get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py b/forge/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py index 654d0df58..a5f96357e 100644 --- a/forge/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py +++ b/forge/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py @@ -7,6 +7,7 @@ import forge +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_mobilenet_ssd_1x1(test_device): compiler_cfg = _get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py b/forge/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py index df23be627..3fb9acdbf 100644 --- a/forge/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py +++ b/forge/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py @@ -7,6 +7,7 @@ import forge +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_pose_landmark_lite_1x1(test_device): compiler_cfg = _get_global_compiler_config() @@ -16,6 +17,7 @@ def test_pose_landmark_lite_1x1(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_pose_landmark_heavy_1x1(test_device): compiler_cfg = _get_global_compiler_config() @@ -25,6 +27,7 @@ def test_pose_landmark_heavy_1x1(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_pose_landmark_lite(test_device): compiler_cfg = _get_global_compiler_config() @@ -33,6 +36,7 @@ def test_pose_landmark_lite(test_device): compiled_model = forge.compile(tflite_path, sample_inputs=sample_tensor) +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") def test_pose_landmark_heavy(test_device): compiler_cfg = _get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_albert.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_albert.py index fae4507c3..9e5165e0d 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_albert.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_albert.py @@ -13,6 +13,7 @@ @pytest.mark.parametrize("variant", variants, ids=variants) @pytest.mark.parametrize("size", 
sizes, ids=sizes) +@pytest.mark.nightly def test_albert_masked_lm_pytorch(size, variant, test_device): model_ckpt = f"albert-{size}-{variant}" @@ -47,6 +48,7 @@ def test_albert_masked_lm_pytorch(size, variant, test_device): @pytest.mark.parametrize("variant", variants, ids=variants) @pytest.mark.parametrize("size", sizes, ids=sizes) +@pytest.mark.nightly def test_albert_token_classification_pytorch(size, variant, test_device): compiler_cfg = forge.config._get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_bart.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_bart.py index a7064c2ff..23f8dbfa5 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_bart.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_bart.py @@ -22,6 +22,7 @@ def forward(self, input_ids, attention_mask, decoder_input_ids): return out +@pytest.mark.nightly def test_pt_bart_classifier(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.compile_depth = CompileDepth.SPLIT_GRAPH diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_bert.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_bert.py index 4d9f5c920..a5a2f774c 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_bert.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_bert.py @@ -37,6 +37,7 @@ def generate_model_bert_maskedlm_hf_pytorch(variant): return model, [input_tokens["input_ids"]], {} +@pytest.mark.nightly def test_bert_masked_lm_pytorch(test_device): model, inputs, _ = generate_model_bert_maskedlm_hf_pytorch("bert-base-uncased") @@ -77,6 +78,7 @@ def generate_model_bert_qa_hf_pytorch(variant): return model, [input_tokens["input_ids"]], {} +@pytest.mark.nightly def test_bert_question_answering_pytorch(test_device): model, inputs, _ = generate_model_bert_qa_hf_pytorch("bert-large-cased-whole-word-masking-finetuned-squad") @@ -107,6 +109,7 @@ def generate_model_bert_seqcls_hf_pytorch(variant): return model, [input_tokens["input_ids"]], {} +@pytest.mark.nightly def test_bert_sequence_classification_pytorch(test_device): model, inputs, _ = generate_model_bert_seqcls_hf_pytorch( "textattack/bert-base-uncased-SST-2", @@ -139,6 +142,7 @@ def generate_model_bert_tkcls_hf_pytorch(variant): return model, [input_tokens["input_ids"]], {} +@pytest.mark.nightly def test_bert_token_classification_pytorch(test_device): model, inputs, _ = generate_model_bert_tkcls_hf_pytorch("dbmdz/bert-large-cased-finetuned-conll03-english") diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_codegen.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_codegen.py index 519d118ea..19f5d135f 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_codegen.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_codegen.py @@ -17,6 +17,7 @@ ] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_codegen(test_device, variant): # Configurations diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py index 240f1e1db..80362774d 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py @@ -15,6 +15,7 @@ variants = ["distilbert-base-uncased", "distilbert-base-cased", "distilbert-base-multilingual-cased"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_distilbert_masked_lm_pytorch(variant, test_device): # Load 
DistilBert tokenizer and model from HuggingFace @@ -44,6 +45,7 @@ def test_distilbert_masked_lm_pytorch(variant, test_device): compiled_model = forge.compile(model, sample_inputs=inputs, module_name="pt_distilbert_masked_lm") +@pytest.mark.nightly def test_distilbert_question_answering_pytorch(test_device): # Load Bert tokenizer and model from HuggingFace model_ckpt = "distilbert-base-cased-distilled-squad" @@ -79,6 +81,7 @@ def test_distilbert_question_answering_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=inputs, module_name="pt_distilbert_question_answering") +@pytest.mark.nightly def test_distilbert_sequence_classification_pytorch(test_device): # Load DistilBert tokenizer and model from HuggingFace @@ -105,6 +108,7 @@ def test_distilbert_sequence_classification_pytorch(test_device): compiled_model = forge.compile(model, sample_inputs=inputs, module_name="pt_distilbert_sequence_classification") +@pytest.mark.nightly def test_distilbert_token_classification_pytorch(test_device): # Load DistilBERT tokenizer and model from HuggingFace model_ckpt = "Davlan/distilbert-base-multilingual-cased-ner-hrl" diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_dpr.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_dpr.py index 9d4b35fed..1aa04e82c 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_dpr.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_dpr.py @@ -16,6 +16,7 @@ variants = ["facebook/dpr-ctx_encoder-single-nq-base", "facebook/dpr-ctx_encoder-multiset-base"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_dpr_context_encoder_pytorch(variant, test_device): @@ -49,6 +50,7 @@ def test_dpr_context_encoder_pytorch(variant, test_device): variants = ["facebook/dpr-question_encoder-single-nq-base", "facebook/dpr-question_encoder-multiset-base"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_dpr_question_encoder_pytorch(variant, test_device): # Load Bert tokenizer and model from HuggingFace @@ -81,6 +83,7 @@ def test_dpr_question_encoder_pytorch(variant, test_device): variants = ["facebook/dpr-reader-single-nq-base", "facebook/dpr-reader-multiset-base"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_dpr_reader_pytorch(variant, test_device): # Load Bert tokenizer and model from HuggingFace diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_falcon.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_falcon.py index 4dd18ac91..ff98a5d83 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_falcon.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_falcon.py @@ -6,6 +6,7 @@ import forge +@pytest.mark.nightly def test_falcon(test_device): compiler_cfg = forge.config._get_global_compiler_config() diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_fuyu_8b.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_fuyu_8b.py index dee05dd08..e4375cdbc 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_fuyu_8b.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_fuyu_8b.py @@ -140,6 +140,7 @@ def forward(self, inputs_embeds, attention_mask, position_ids, *past_key_values) return hidden_states, *presents +@pytest.mark.nightly def test_fuyu8b(test_device): # Set Forge configuration parameters compiler_cfg = forge.config._get_global_compiler_config() @@ -185,6 +186,7 @@ def test_fuyu8b(test_device): os.remove("bus.png") +@pytest.mark.nightly 
@pytest.mark.skip(reason="not supported yet") def test_fuyu8b_past_cache(test_device): if test_device.arch == BackendDevice.Grayskull: diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py index c9f39e1a2..58dd095bd 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py @@ -49,6 +49,7 @@ def cpu_sanity_run_1(): ] +@pytest.mark.nightly @pytest.mark.skip(reason="Tested as part of full model test run") @pytest.mark.parametrize("variant", variants, ids=variants) def test_gemma_2b_rotary_embedding(test_device, variant): @@ -89,6 +90,7 @@ def forward(self, x, pos_ids): compiled_model = forge.compile(pytorch_model, sample_inputs=inputs, module_name="pt_gemma_2b_rotary_embedding") +@pytest.mark.nightly @pytest.mark.skip(reason="Tested as part of full model test run") @pytest.mark.parametrize("variant", variants, ids=variants) def test_gemma_2b_rms_norm(test_device, variant): @@ -127,6 +129,7 @@ def forward(self, x): compiled_model = forge.compile(pytorch_model, sample_inputs=inputs, module_name="pt_gemma_2b_rms_norm") +@pytest.mark.nightly @pytest.mark.skip(reason="Tested as part of full model test run") @pytest.mark.parametrize("variant", variants, ids=variants) def test_gemma_2b_attention(test_device, variant): @@ -168,6 +171,7 @@ def forward(self, hidden_states, attn_mask, pos_ids): compiled_model = forge.compile(pytorch_model, sample_inputs=inputs, module_name="pt_gemma_2b_attention") +@pytest.mark.nightly @pytest.mark.skip(reason="Tested as part of full model test run") @pytest.mark.parametrize("variant", variants, ids=variants) def test_gemma_2b_mlp(test_device, variant): @@ -207,6 +211,7 @@ def forward(self, hidden_states): compiled_model = forge.compile(pytorch_model, sample_inputs=inputs, module_name="pt_gemma_2b_mlp") +@pytest.mark.nightly @pytest.mark.skip(reason="Tested as part of full model test run") @pytest.mark.parametrize("variant", variants, ids=variants) def test_gemma_2b_single_decoder(test_device, variant): @@ -248,6 +253,7 @@ def forward(self, hidden_states, attn_mask, pos_ids): compiled_model = forge.compile(pytorch_model, sample_inputs=inputs, module_name="pt_gemma_2b_single_decoder") +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_gemma_2b(test_device, variant): # Random see for reproducibility @@ -286,6 +292,7 @@ def test_gemma_2b(test_device, variant): compiled_model = forge.compile(pytorch_model, sample_inputs=inputs, module_name="pt_gemma_2b") +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") @pytest.mark.parametrize("variant", variants, ids=variants) def test_gemma_2b_gen(test_device, variant): @@ -367,6 +374,7 @@ def test_gemma_2b_gen(test_device, variant): print(f"{tt_ans}") +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported yet") @pytest.mark.parametrize("variant", variants, ids=variants) def test_gemma_2b_1x1_gen(test_device, variant): diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_gpt2.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_gpt2.py index 999f9973e..c1234290f 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_gpt2.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_gpt2.py @@ -9,6 +9,7 @@ from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config +@pytest.mark.nightly def test_gpt2_text_gen(test_device): # Load tokenizer and model from HuggingFace config = 
GPT2Config.from_pretrained("gpt2") @@ -52,6 +53,7 @@ def forward(self, input_ids, attention_mask, *kv): return self.model(input_ids, past_key_values, attention_mask) +@pytest.mark.nightly @pytest.mark.skip(reason="not supported yet") def test_gpt2_past_cache(test_device): os.environ["GOLDEN_WORMHOLE_B0"] = "1" diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py index 7ef561674..e47fbb2f1 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py @@ -22,6 +22,7 @@ ] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_gptneo_causal_lm(variant, test_device): # Set random seed for repeatability @@ -74,6 +75,7 @@ def forward(self, input_ids, attention_mask): ] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_gptneo_sequence_classification(variant, test_device): # Load tokenizer and model from HuggingFace diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_llama3.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_llama3.py index ff6f5914f..4ab29bdd7 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_llama3.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_llama3.py @@ -110,6 +110,7 @@ def _update_causal_mask( @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_llama3_causal_lm(variant, test_device): # Configurations compiler_cfg = forge.config._get_global_compiler_config() @@ -151,6 +152,7 @@ def test_llama3_causal_lm(variant, test_device): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_llama3_sequence_classification(variant, test_device): # Configurations diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_mistral.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_mistral.py index 59bccfc00..426e7cac6 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_mistral.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_mistral.py @@ -16,6 +16,7 @@ @pytest.mark.skip(reason="Tested as part of full model test run") @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.nightly def test_mistral_decoder_layer(variant, test_device): model = AutoModelForCausalLM.from_pretrained(variant, device_map="auto") @@ -37,6 +38,7 @@ def test_mistral_decoder_layer(variant, test_device): variants = ["mistralai/Mistral-7B-v0.1"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_mistral(variant, test_device): @@ -67,6 +69,7 @@ def test_mistral(variant, test_device): variants = ["mistralai/Mistral-7B-v0.1"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) @pytest.mark.skip(reason="This test currently serves the same purpose as test_mistral") def test_mistral_decode(variant, test_device): @@ -132,6 +135,7 @@ def test_mistral_decode(variant, test_device): variants = ["mistralai/Mistral-7B-v0.1"] +@pytest.mark.nightly @pytest.mark.skip(reason="under development") @pytest.mark.parametrize("variant", variants, ids=variants) def test_mistral_kv_cache(variant, test_device): diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_opt.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_opt.py index ecf14d0b5..a0bcbbad4 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_opt.py +++ 
b/forge/test/model_demos/high_prio/nlp/pytorch/test_opt.py @@ -9,6 +9,7 @@ variants = ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_opt_causal_lm(variant, test_device): # Load tokenizer and model from HuggingFace @@ -44,6 +45,7 @@ def test_opt_causal_lm(variant, test_device): ) +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_opt_qa(variant, test_device): # Load tokenizer and model from HuggingFace @@ -78,6 +80,7 @@ def test_opt_qa(variant, test_device): ) +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_opt_sequence_classification(variant, test_device): # Set Forge configuration parameters diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_phi2.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_phi2.py index 2b72c0669..bac92a5db 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_phi2.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_phi2.py @@ -15,6 +15,7 @@ variants = ["microsoft/phi-2", "microsoft/phi-2-pytdml"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_phi2_clm(variant, test_device): @@ -56,6 +57,7 @@ def test_phi2_clm(variant, test_device): ) +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants) def test_phi2_token_classification(variant, test_device): @@ -88,6 +90,7 @@ def test_phi2_token_classification(variant, test_device): ) +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants) def test_phi2_sequence_classification(variant, test_device): diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_phi3.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_phi3.py index ff30527d1..893acc2f2 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_phi3.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_phi3.py @@ -15,6 +15,7 @@ variants = ["microsoft/phi-3-mini-4k-instruct"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants) def test_phi3_causal_lm(variant, test_device): @@ -58,6 +59,7 @@ def test_phi3_causal_lm(variant, test_device): ) +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants) def test_phi3_token_classification(variant, test_device): @@ -91,6 +93,7 @@ def test_phi3_token_classification(variant, test_device): ) +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants) def test_phi3_sequence_classification(variant, test_device): diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_qwen.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_qwen.py index b1893a9f1..e6281665e 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_qwen.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_qwen.py @@ -8,6 +8,7 @@ import re +@pytest.mark.nightly def test_qwen1_5_causal_lm(test_device): # Set PyBuda configurations @@ -56,6 +57,7 @@ def parse_chat_completion(text: str): return messages +@pytest.mark.nightly def test_qwen1_5_chat(test_device): # Set PyBuda configurations diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_roberta.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_roberta.py index 909ae3970..0978f34b1 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_roberta.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_roberta.py @@ -7,6 +7,7 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModelForSequenceClassification 
+@pytest.mark.nightly def test_roberta_masked_lm(test_device): # Load Albert tokenizer and model from HuggingFace tokenizer = download_model(AutoTokenizer.from_pretrained, "xlm-roberta-base") @@ -31,6 +32,7 @@ def test_roberta_masked_lm(test_device): compiled_model = forge.compile(model, sample_inputs=inputs, module_name="pt_roberta_masked_lm") +@pytest.mark.nightly def test_roberta_sentiment_pytorch(test_device): # Load Bart tokenizer and model from HuggingFace tokenizer = download_model(AutoTokenizer.from_pretrained, "cardiffnlp/twitter-roberta-base-sentiment") diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_squeezebert.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_squeezebert.py index 466770bd1..9a4388a32 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_squeezebert.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_squeezebert.py @@ -6,6 +6,7 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer +@pytest.mark.nightly def test_squeezebert_sequence_classification_pytorch(test_device): # Load Bart tokenizer and model from HuggingFace tokenizer = download_model(AutoTokenizer.from_pretrained, "squeezebert/squeezebert-mnli") diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_t5.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_t5.py index fbf843264..ba838469c 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_t5.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_t5.py @@ -12,6 +12,7 @@ from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config +@pytest.mark.nightly @pytest.mark.skip(reason="Not supported") def test_t5_loop_tiny_tile(test_device): import os @@ -78,6 +79,7 @@ def forward(self, decoder_input_ids, encoder_outputs): variants = ["t5-small", "t5-base", "t5-large", "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_t5_generation(variant, test_device): @@ -168,6 +170,7 @@ def forward( variants = ["t5-small", "t5-base", "t5-large", "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large"] +@pytest.mark.nightly @pytest.mark.skip(reason="not supported yet") @pytest.mark.parametrize("variant", variants, ids=variants) def test_t5_past_cache_enc_dec(variant, test_device): @@ -352,6 +355,7 @@ def test_t5_past_cache_enc_dec(variant, test_device): variants = ["t5-small", "t5-base", "t5-large", "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) @pytest.mark.skip(reason="Redundant") def test_t5_past_cache_forge_pipeline(variant, test_device): @@ -686,6 +690,7 @@ def wrap_generate(inputs): variants = ["t5-small", "t5-base", "t5-large", "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) @pytest.mark.skip(reason="Redundant") def test_t5_forge_pipeline(variant, test_device): @@ -734,6 +739,7 @@ def test_t5_forge_pipeline(variant, test_device): print(answer) +@pytest.mark.nightly @pytest.mark.skip(reason="Redundant") def test_t5_small_tiny_tile(test_device): if test_device.arch == BackendDevice.Grayskull: diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py index f254eca9c..a948d13e5 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py +++ 
b/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py @@ -98,6 +98,7 @@ def forward(self, decoder_input_ids, encoder_hidden_states): return framework_model, [decoder_input_ids, encoder_outputs] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_whisper(test_device, variant): @@ -111,6 +112,7 @@ def test_whisper(test_device, variant): ) +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) @pytest.mark.skip(reason="Redundant") def test_whisper_pipeline(test_device, variant): @@ -171,6 +173,7 @@ def test_whisper_pipeline(test_device, variant): assert cpu_out["text"] == tt_out["text"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) @pytest.mark.skip(reason="Not supported") def test_whisper_encoder(test_device, variant): diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py index c5c4b4238..75be9378f 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py @@ -36,6 +36,7 @@ ] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) @pytest.mark.skip(reason="Redundant") def test_whisper_dec_past_cache(test_device, variant): @@ -68,6 +69,7 @@ def test_whisper_dec_past_cache(test_device, variant): break +@pytest.mark.nightly @pytest.mark.skip(reason="not supported yet") @pytest.mark.parametrize("variant", variants, ids=variants) def test_whisper_enc_dec(test_device, variant): @@ -339,6 +341,7 @@ def test_whisper_enc_dec(test_device, variant): print(f"generated tokens: {tokenizer.decode(generated_tokens)}") +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) @pytest.mark.skip(reason="Redundant") def test_whisper_enc_dec_pipeline(test_device, variant): diff --git a/forge/test/model_demos/high_prio/nlp/pytorch/test_xglm.py b/forge/test/model_demos/high_prio/nlp/pytorch/test_xglm.py index b9c0b138e..823029496 100644 --- a/forge/test/model_demos/high_prio/nlp/pytorch/test_xglm.py +++ b/forge/test/model_demos/high_prio/nlp/pytorch/test_xglm.py @@ -10,6 +10,7 @@ variants = ["facebook/xglm-564M", "facebook/xglm-1.7B"] +@pytest.mark.nightly @pytest.mark.parametrize("variant", variants, ids=variants) def test_xglm_causal_lm(variant, test_device): # Set Forge configuration parameters From 2337eae8c992a379b3658199164905dcd8303fa0 Mon Sep 17 00:00:00 2001 From: Deepak Sudhakar Date: Thu, 7 Nov 2024 14:01:04 +0530 Subject: [PATCH 11/18] Add pytest marker for Push CI (#550) --- .../benchmark/models/mnist_linear.py | 1 + forge/test/mlir/llama/test_llama_inference.py | 4 ++ .../mlir/llama/tests/test_llama_embedding.py | 1 + .../mlir/llama/tests/test_llama_lm_head.py | 1 + forge/test/mlir/llama/tests/test_llama_mlp.py | 1 + .../mlir/llama/tests/test_llama_prefil.py | 1 + .../mlir/llama/tests/test_llama_rms_norm.py | 1 + .../mlir/llama/tests/test_llama_rotary_emb.py | 1 + .../mlir/llama/tests/test_llama_self_attn.py | 1 + forge/test/mlir/mnist/test_inference.py | 2 + .../test/mlir/mnist/training/test_training.py | 3 ++ .../test/mlir/resnet/test_resnet_inference.py | 1 + forge/test/mlir/test_features.py | 5 +++ forge/test/mlir/test_ops.py | 44 +++++++++++++++++++ forge/test/mlir/test_ops_tf.py | 3 ++ forge/test/mlir/test_training.py | 3 +- pytest.ini | 5 +++ 17 files changed, 77 insertions(+), 1 deletion(-) diff --git a/forge/test/benchmark/benchmark/models/mnist_linear.py 
b/forge/test/benchmark/benchmark/models/mnist_linear.py index 2e16c9683..f17bd1a7a 100644 --- a/forge/test/benchmark/benchmark/models/mnist_linear.py +++ b/forge/test/benchmark/benchmark/models/mnist_linear.py @@ -73,6 +73,7 @@ def forward(self, x): @pytest.mark.parametrize("hidden_size", HIDDEN_SIZE, ids=[f"hidden_size={item}" for item in HIDDEN_SIZE]) @pytest.mark.parametrize("input_size", INPUT_SIZE, ids=[f"input_size={item}" for item in INPUT_SIZE]) @pytest.mark.parametrize("batch_size", BATCH_SIZE, ids=[f"batch_size={item}" for item in BATCH_SIZE]) +@pytest.mark.push def test_mnist_linear( training, batch_size, diff --git a/forge/test/mlir/llama/test_llama_inference.py b/forge/test/mlir/llama/test_llama_inference.py index e6f55e4ed..38ad8a9b7 100644 --- a/forge/test/mlir/llama/test_llama_inference.py +++ b/forge/test/mlir/llama/test_llama_inference.py @@ -11,6 +11,8 @@ @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) +@pytest.mark.xfail() +@pytest.mark.push def test_llama_inference(model_path): if model_path == "meta-llama/Llama-3.2-1B": pytest.skip("Skipping test for Llama-3.2-1B model, waiting for new transformers version.") @@ -31,6 +33,7 @@ def test_llama_inference(model_path): @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) @pytest.mark.skip(reason="No need to run in CI, this is PoC that should be mapped to work on device.") +@pytest.mark.push def test_llama_inference_no_cache_cpu(model_path): """ This function tests the inference of the Llama 3B model without using a past-cache (KV cache). @@ -68,6 +71,7 @@ def test_llama_inference_no_cache_cpu(model_path): @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) @pytest.mark.skip(reason="No need to run in CI, this is PoC that should be mapped to work on device.") +@pytest.mark.push def test_llama_inference_cache_cpu(model_path): """ This function tests the inference of the Llama 3B model using a past-cache (KV cache). 
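Note: the push marker added in this patch and the nightly marker added earlier in the series are plain pytest markers (both get registered in pytest.ini later in this patch), so CI lanes can select them with standard marker expressions. A minimal sketch of that selection, assuming hypothetical lane entry points and test paths (the CI wiring itself is not part of these patches):

    import pytest

    # Push CI lane: fast subset, i.e. everything tagged @pytest.mark.push
    pytest.main(["-m", "push", "forge/test/mlir"])

    # Nightly CI lane: model-demo sweep, i.e. everything tagged @pytest.mark.nightly
    pytest.main(["-m", "nightly", "forge/test/model_demos"])
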
diff --git a/forge/test/mlir/llama/tests/test_llama_embedding.py b/forge/test/mlir/llama/tests/test_llama_embedding.py index 491151f89..f96bc9b15 100644 --- a/forge/test/mlir/llama/tests/test_llama_embedding.py +++ b/forge/test/mlir/llama/tests/test_llama_embedding.py @@ -11,6 +11,7 @@ @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) @pytest.mark.xfail() +@pytest.mark.push def test_llama_embedding(model_path): if model_path == "meta-llama/Llama-3.2-1B": pytest.skip("Skipping test for Llama-3.2-1B model, waiting for new transformers version.") diff --git a/forge/test/mlir/llama/tests/test_llama_lm_head.py b/forge/test/mlir/llama/tests/test_llama_lm_head.py index 44f3c6dc6..17adf6b55 100644 --- a/forge/test/mlir/llama/tests/test_llama_lm_head.py +++ b/forge/test/mlir/llama/tests/test_llama_lm_head.py @@ -11,6 +11,7 @@ @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) @pytest.mark.xfail() +@pytest.mark.push def test_llama_lm_head(model_path): if model_path == "meta-llama/Llama-3.2-1B": pytest.skip("Skipping test for Llama-3.2-1B model, waiting for new transformers version.") diff --git a/forge/test/mlir/llama/tests/test_llama_mlp.py b/forge/test/mlir/llama/tests/test_llama_mlp.py index 353e2a828..6a338da1f 100644 --- a/forge/test/mlir/llama/tests/test_llama_mlp.py +++ b/forge/test/mlir/llama/tests/test_llama_mlp.py @@ -11,6 +11,7 @@ @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) @pytest.mark.xfail() +@pytest.mark.push def test_llama_mlp(model_path): if model_path == "meta-llama/Llama-3.2-1B": pytest.skip("Skipping test for Llama-3.2-1B model, waiting for new transformers version.") diff --git a/forge/test/mlir/llama/tests/test_llama_prefil.py b/forge/test/mlir/llama/tests/test_llama_prefil.py index 54f141442..f6e2c1278 100644 --- a/forge/test/mlir/llama/tests/test_llama_prefil.py +++ b/forge/test/mlir/llama/tests/test_llama_prefil.py @@ -50,6 +50,7 @@ def decode_on_cpu(model, tokenizer, input_ids, hidden_states, max_new_tokens): @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) @pytest.mark.xfail() +@pytest.mark.push def test_llama_prefil_on_device_decode_on_cpu(model_path): """ This function tests the inference of the Llama models split into two parts: diff --git a/forge/test/mlir/llama/tests/test_llama_rms_norm.py b/forge/test/mlir/llama/tests/test_llama_rms_norm.py index 5f36f527f..08a75b38d 100644 --- a/forge/test/mlir/llama/tests/test_llama_rms_norm.py +++ b/forge/test/mlir/llama/tests/test_llama_rms_norm.py @@ -10,6 +10,7 @@ @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) +@pytest.mark.push def test_llama_lm_head(model_path): if model_path == "meta-llama/Llama-3.2-1B": pytest.skip("Skipping test for Llama-3.2-1B model, waiting for new transformers version.") diff --git a/forge/test/mlir/llama/tests/test_llama_rotary_emb.py b/forge/test/mlir/llama/tests/test_llama_rotary_emb.py index 6456fdf96..1c5ab111f 100644 --- a/forge/test/mlir/llama/tests/test_llama_rotary_emb.py +++ b/forge/test/mlir/llama/tests/test_llama_rotary_emb.py @@ -12,6 +12,7 @@ @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) @pytest.mark.xfail() +@pytest.mark.push def test_llama_rotary_emb(model_path): class Llama_Rotary_Embedding(torch.nn.Module): def __init__(self, model): diff --git 
a/forge/test/mlir/llama/tests/test_llama_self_attn.py b/forge/test/mlir/llama/tests/test_llama_self_attn.py index 5459e8261..b75630134 100644 --- a/forge/test/mlir/llama/tests/test_llama_self_attn.py +++ b/forge/test/mlir/llama/tests/test_llama_self_attn.py @@ -11,6 +11,7 @@ @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) @pytest.mark.xfail() +@pytest.mark.push def test_llama_self_attn(model_path): if model_path == "meta-llama/Llama-3.2-1B": pytest.skip("Skipping test for Llama-3.2-1B model, waiting for new transformers version.") diff --git a/forge/test/mlir/mnist/test_inference.py b/forge/test/mlir/mnist/test_inference.py index 0a6af1b99..6540c1f38 100644 --- a/forge/test/mlir/mnist/test_inference.py +++ b/forge/test/mlir/mnist/test_inference.py @@ -5,9 +5,11 @@ import torch from .utils import * import forge +import pytest from forge.op.eval.common import compare_with_golden_pcc +@pytest.mark.push def test_mnist_inference(): inputs = [torch.rand(1, 784)] diff --git a/forge/test/mlir/mnist/training/test_training.py b/forge/test/mlir/mnist/training/test_training.py index f55537e12..f666d3e05 100644 --- a/forge/test/mlir/mnist/training/test_training.py +++ b/forge/test/mlir/mnist/training/test_training.py @@ -15,6 +15,7 @@ from loguru import logger +@pytest.mark.push def test_mnist_training(): torch.manual_seed(0) @@ -94,6 +95,7 @@ def test_mnist_training(): @pytest.mark.parametrize("freeze_layer", [None, 0, 2, 4]) +@pytest.mark.push def test_forge_vs_torch_gradients(freeze_layer): logger.disable("") torch.manual_seed(0) @@ -154,6 +156,7 @@ def test_forge_vs_torch_gradients(freeze_layer): # That sets relu threshold to bfloat16 tensor. # And in file forge/forge/compile.py::compile_main forced bfloat 16 should be added compiler_cfg.default_df_override = DataFormat.Float16_b @pytest.mark.skip(reason="Need to be tested with bfloat16 and takes around 10 minutes to run") +@pytest.mark.push def test_forge_vs_torch(): torch.manual_seed(0) diff --git a/forge/test/mlir/resnet/test_resnet_inference.py b/forge/test/mlir/resnet/test_resnet_inference.py index 3ad6890b5..b66b2a3e0 100644 --- a/forge/test/mlir/resnet/test_resnet_inference.py +++ b/forge/test/mlir/resnet/test_resnet_inference.py @@ -8,6 +8,7 @@ import forge +@pytest.mark.push def test_resnet_inference(): # Compiler configurations compiler_cfg = forge.config._get_global_compiler_config() diff --git a/forge/test/mlir/test_features.py b/forge/test/mlir/test_features.py index 6321073a7..73c921b07 100644 --- a/forge/test/mlir/test_features.py +++ b/forge/test/mlir/test_features.py @@ -13,6 +13,7 @@ from forge.op.eval.common import compare_with_golden_pcc, compare_with_golden +@pytest.mark.push def test_multiple_inputs(): class MultipleInputs(nn.Module): def __init__(self): @@ -39,6 +40,7 @@ def forward(self, a, b, c): ((1, 1, 32, 64), (1, 1, 64, 128), (1, 1, 128, 32)), ], ) +@pytest.mark.push def test_input_order(a_shape, b_shape, c_shape): class InputOrderWithConstants(nn.Module): def __init__(self): @@ -68,6 +70,7 @@ def forward(self, a, b, c): @pytest.mark.parametrize("batch_size", [1, 4, 16, 32, 64]) @pytest.mark.parametrize("linear_features", [(784, 10)]) +@pytest.mark.push def test_matmul_bias(batch_size, linear_features): input_features, output_dim = linear_features @@ -95,6 +98,7 @@ def forward(self, a): @pytest.mark.parametrize("batch_size", [1, 2, 16, 64, 512]) @pytest.mark.parametrize("in_features", [784]) @pytest.mark.parametrize("out_features", [10]) +@pytest.mark.push def 
test_batch_size_inference(batch_size, in_features, out_features): class SimpleModel(nn.Module): def __init__(self): @@ -120,6 +124,7 @@ def forward(self, x): @pytest.mark.parametrize("batch_size", [1, 2, 16, 64, 512]) @pytest.mark.parametrize("in_features", [784]) @pytest.mark.parametrize("out_features", [10]) +@pytest.mark.push def test_batch_size_training(batch_size, in_features, out_features): class SimpleModel(nn.Module): def __init__(self): diff --git a/forge/test/mlir/test_ops.py b/forge/test/mlir/test_ops.py index 482a0491d..376e42209 100644 --- a/forge/test/mlir/test_ops.py +++ b/forge/test/mlir/test_ops.py @@ -15,6 +15,7 @@ @pytest.mark.parametrize("operand_and_cast_dtype", [(torch.float32, torch.int32), (torch.int32, torch.float32)]) +@pytest.mark.push def test_cast(operand_and_cast_dtype): operand_dtype = operand_and_cast_dtype[0] @@ -61,6 +62,8 @@ def get_input_tensor(dtype): (1, 7, 256), ], ) +@pytest.mark.xfail(reason="Found Unsupported operations while lowering from TTForge to TTIR in forward graph") +@pytest.mark.push def test_sin(shape): class sin(nn.Module): def __init__(self): @@ -88,6 +91,8 @@ def forward(self, x): (1, 7, 256), ], ) +@pytest.mark.xfail(reason="Found Unsupported operations while lowering from TTForge to TTIR in forward graph") +@pytest.mark.push def test_cosine(shape): class cosine(nn.Module): def __init__(self): @@ -116,6 +121,7 @@ def forward(self, x): ], ) @pytest.mark.xfail(reason="Found Unsupported operations while lowering from TTForge to TTIR in forward graph") +@pytest.mark.push def test_tanh(shape): class tanh(nn.Module): def __init__(self): @@ -144,6 +150,7 @@ def forward(self, x): ], ) @pytest.mark.xfail(reason="Found Unsupported operations while lowering from TTForge to TTIR in forward graph") +@pytest.mark.push def test_leakyrelu(shape): inputs = [torch.rand(shape)] @@ -166,6 +173,7 @@ def test_leakyrelu(shape): ], ) @pytest.mark.xfail(reason="shape mismatch: expected [1], got []") +@pytest.mark.push def test_layernorm(batch_size, num_channels, height, width): # framework_model = nn.LayerNorm((num_channels, height, width)) # Support only normalization over last one dimension @@ -190,6 +198,7 @@ def test_layernorm(batch_size, num_channels, height, width): ], ) @pytest.mark.xfail(reason="Found Unsupported operations while lowering from TTForge to TTIR in forward graph") +@pytest.mark.push def test_gelu(shape): inputs = [torch.rand(shape)] @@ -212,6 +221,7 @@ def test_gelu(shape): ], ) @pytest.mark.xfail(reason="Found Unsupported operations while lowering from TTForge to TTIR in forward graph") +@pytest.mark.push def test_clip(shape, min_val, max_val): class Clip(nn.Module): def __init__(self, min_val, max_val): @@ -242,6 +252,7 @@ def forward(self, x): ], ) @pytest.mark.xfail(reason="Found Unsupported operations while lowering from TTForge to TTIR in forward graph") +@pytest.mark.push def test_cumsum(shape, dim): class cumsum(nn.Module): def __init__(self, dim): @@ -275,6 +286,7 @@ def forward(self, x): ], ) @pytest.mark.xfail(reason="Unsupported data format during lowering from TTForge to TTIR: Bfp2_b") +@pytest.mark.push def test_where(condition, input, other): class Where(nn.Module): def __init__(self): @@ -304,6 +316,7 @@ def forward(self, condition, input1, input2): @pytest.mark.parametrize( "shape", [(1, 1, 256, 256), (1, 1, 1, 128), (1, 1, 1, 384), (1, 1, 32, 32), (1, 1, 6, 6), (1, 1, 29, 29)] ) +@pytest.mark.push def test_abs(shape): class abs(nn.Module): def __init__(self): @@ -337,6 +350,7 @@ def forward(self, x): (1, 512, 7, 7), ], 
) +@pytest.mark.push def test_exp(shape): class exp(nn.Module): def __init__(self): @@ -370,6 +384,7 @@ def forward(self, x): ], ) @pytest.mark.xfail(reason="TTNN maximum op: unsupported broadcast") +@pytest.mark.push def test_maximum(shape_x, shape_y): class maximum(nn.Module): def __init__(self): @@ -406,6 +421,7 @@ def forward(self, x, y): ((1, 32, 32, 32), (1, 32, 32, 32)), ], ) +@pytest.mark.push def test_less(shape_x, shape_y): class less(nn.Module): def __init__(self): @@ -442,6 +458,7 @@ def forward(self, x, y): ((1, 32, 32, 32), (1, 32, 32, 32)), ], ) +@pytest.mark.push def test_greater(shape_x, shape_y): class greater(nn.Module): def __init__(self): @@ -478,6 +495,7 @@ def forward(self, x, y): ((1, 32, 32, 32), (1, 32, 32, 32)), ], ) +@pytest.mark.push def test_not_equal(shape_x, shape_y): class not_equal(nn.Module): def __init__(self): @@ -516,6 +534,7 @@ def forward(self, x, y): (32, 256, 28, 28), # pcc = 0.39200606381500713 ], ) +@pytest.mark.push def test_batchnorm2d(batch_size, num_channels, height, width): framework_model = nn.BatchNorm2d(num_features=num_channels) @@ -533,6 +552,7 @@ def test_batchnorm2d(batch_size, num_channels, height, width): assert all([compare_with_golden_pcc(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)]) +@pytest.mark.push def test_add(): class Add(nn.Module): def __init__(self): @@ -568,6 +588,7 @@ def forward(self, a, b): ((32, 128, 24), (1, -3)), ], ) +@pytest.mark.push def test_transpose(params): class Transpose(nn.Module): def __init__(self, dims): @@ -600,6 +621,7 @@ def forward(self, a): [((8, 32, 256), (2, 4, 32, 256)), ((8, 32, 32), (1, 2, 4, 32, 32)), ((8192, 128), (1, 256, 32, 128))], ids=["1", "2", "3"], ) +@pytest.mark.push def test_reshape(source_and_target_shape): source_shape, target_shape = source_and_target_shape @@ -638,6 +660,7 @@ def forward(self, a): ([1, 1, 1, 64], [-4, -3]), ], ) +@pytest.mark.push def test_squeeze(input_shape_and_dim): input_shape, dim = input_shape_and_dim @@ -675,6 +698,7 @@ def forward(self, a): ([12, 8640], 0), ], ) +@pytest.mark.push def test_unsqueeze(input_shape_and_dim): input_shape, dim = input_shape_and_dim @@ -715,6 +739,7 @@ def forward(self, a): ], ids=["0", "1", "2", "3", "-1", "-2", "-3", "-4"], ) +@pytest.mark.push def test_concat(inputs_and_dim): in_shape1, in_shape2, dim = inputs_and_dim @@ -738,6 +763,7 @@ def forward(self, a, b): @pytest.mark.parametrize("dims", [(1, 32, 64), (6, 33), (4, 16, 17)]) +@pytest.mark.push def test_greater_equal(dims): class GreaterEqual(nn.Module): def __init__(self): @@ -759,6 +785,7 @@ def forward(self, a, b): assert compare_with_golden(golden=fw_out, calculated=output) +@pytest.mark.push def test_subtract(): class Subtract(nn.Module): def __init__(self): @@ -787,6 +814,7 @@ def forward(self, a, b): (12, 8640), ], ) +@pytest.mark.push def test_multiply(shape): class Multiply(nn.Module): def __init__(self): @@ -808,6 +836,7 @@ def forward(self, a, b): assert all([compare_with_golden_pcc(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)]) +@pytest.mark.push def test_relu(): class ReLU(nn.Module): def __init__(self): @@ -831,6 +860,7 @@ def forward(self, a): @pytest.mark.skip(reason="This is not ready yet") +@pytest.mark.push def test_linear(): class Linear(nn.Module): def __init__(self): @@ -853,6 +883,7 @@ def forward(self, a): assert all([compare_with_golden_pcc(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)]) +@pytest.mark.push def test_softmax(): class Softmax(nn.Module): def 
__init__(self): @@ -877,6 +908,7 @@ def forward(self, a): @pytest.mark.parametrize("input_shape", [(1, 32, 32), (1, 64, 64), (1, 128, 128, 128)], ids=["32", "64", "128"]) @pytest.mark.parametrize("dim", [-1, -2], ids=["-1", "-2"]) +@pytest.mark.push def test_reduce_sum(input_shape, dim): class ReduceSum(nn.Module): def __init__(self): @@ -916,6 +948,7 @@ def forward(self, a): -2, ], ) +@pytest.mark.push def test_reduce_mean(input_shape, dim): if input_shape == (1, 12, 3200) and dim == -1: @@ -946,6 +979,7 @@ def forward(self, a): @pytest.mark.parametrize("outer_dim_x", [7, 32, 41, 64]) @pytest.mark.parametrize("outer_dim_y", [7, 32, 41, 64]) @pytest.mark.parametrize("inner_dim", [1, 7, 32, 41, 64]) +@pytest.mark.push def test_matmul(batch_size, outer_dim_x, outer_dim_y, inner_dim): class Matmul(nn.Module): def __init__(self): @@ -975,6 +1009,7 @@ def forward(self, x, y): @pytest.mark.parametrize("x_shape", [7, 32, 41]) @pytest.mark.parametrize("y_shape", [7, 32, 41]) @pytest.mark.parametrize("dim", [1, 2]) +@pytest.mark.push def test_mean(x_shape, y_shape, dim): class Mean(nn.Module): def __init__(self): @@ -999,6 +1034,7 @@ def forward(self, x): @pytest.mark.parametrize("x_shape", [7, 32, 41]) @pytest.mark.parametrize("y_shape", [7, 32, 41]) +@pytest.mark.push def test_sqrt(x_shape, y_shape): class Sqrt(nn.Module): def __init__(self): @@ -1028,6 +1064,7 @@ def forward(self, x): @pytest.mark.parametrize("vocab_size", [32000]) @pytest.mark.parametrize("token_num", [12]) @pytest.mark.parametrize("embedding_dim", [3200]) +@pytest.mark.push def test_embedding(vocab_size, token_num, embedding_dim): compiler_cfg = forge.config._get_global_compiler_config() compiler_cfg.enable_tvm_cpu_fallback = False @@ -1067,6 +1104,7 @@ def forward(self, x): (2, 7, 32, 41), # 4D tensor ], ) +@pytest.mark.push def test_reciprocal(shape): class Reciprocal(nn.Module): def __init__(self): @@ -1102,6 +1140,7 @@ def forward(self, x): (2, 7, 32, 41), # 4D tensor ], ) +@pytest.mark.push def test_sigmoid(shape): class Sigmoid(nn.Module): def __init__(self): @@ -1128,6 +1167,7 @@ def forward(self, x): @pytest.mark.parametrize("stop", [2, 32, 64], ids=["2", "32", "64"]) @pytest.mark.parametrize("stride", [1, 2, 4, 8], ids=["1", "2", "4", "8"]) @pytest.mark.parametrize("shape", [(1, 32, 64, 64), (32, 64, 64), (64, 64)]) +@pytest.mark.push def test_indexing(dim, start, stop, stride, shape): if len(shape) == 2 and dim == -3: pytest.skip("Skipping since indexing on dim=-3, 2D tensor doesn't make sense") @@ -1173,6 +1213,7 @@ def forward(self, x): (4127, 256), ], ) +@pytest.mark.push def test_adv_index_embedding_decompostion(indices_shape, input_tensor_shape): class ForgeAdvIndex(forge.ForgeModule): def __init__(self, name): @@ -1225,6 +1266,7 @@ def forward(self, input_tensor, indices): -4, ], ) +@pytest.mark.push def test_reduce_max(input_shape, dim): reduce_max_dim = dim @@ -1272,6 +1314,7 @@ def forward(self, a): (16, 33, (3, 5), 2, 0, 1, True, 1, "zeros"), ], ) +@pytest.mark.push def test_convtranspose2d(in_channels, out_channels, kernel_size, stride, padding, groups, bias, dilation, padding_mode): inputs = [torch.randn(20, 16, 50, 100)] @@ -1299,6 +1342,7 @@ def test_convtranspose2d(in_channels, out_channels, kernel_size, stride, padding @pytest.mark.xfail( reason="Unable to reshape a tensor in TILE_LAYOUT to non-tile height and width! 
Please convert the tensor to ROW_MAJOR_LAYOUT first" ) +@pytest.mark.push def test_avg_pool2d(): class AvgPool2d(nn.Module): def __init__(self): diff --git a/forge/test/mlir/test_ops_tf.py b/forge/test/mlir/test_ops_tf.py index 85f5969be..8a76b057f 100644 --- a/forge/test/mlir/test_ops_tf.py +++ b/forge/test/mlir/test_ops_tf.py @@ -49,6 +49,7 @@ ) @pytest.mark.parametrize("has_bias", [False, True], ids=["no_bias", "with_bias"]) @pytest.mark.xfail(reason="TTNN fails to tilize during reshape after conv") +@pytest.mark.push def test_conv2d( batch_size, output_channels, @@ -102,6 +103,7 @@ def call(self, x): assert compare_tensor_to_golden("conv2d", fw_out[0], co_out[0].reshape(fw_out[0].shape)) +@pytest.mark.push def test_dual_conv2d(): tf.random.set_seed(0) @@ -162,6 +164,7 @@ def call(self, x): (1, 128, 128, 128), ], ) +@pytest.mark.push def test_maxpool2d( act_shape, ): diff --git a/forge/test/mlir/test_training.py b/forge/test/mlir/test_training.py index 7c8522582..b568b9dee 100644 --- a/forge/test/mlir/test_training.py +++ b/forge/test/mlir/test_training.py @@ -4,12 +4,13 @@ import torch import torch.nn as nn - +import pytest import forge import forge.config from forge.op.eval.common import compare_with_golden +@pytest.mark.push def test_torch_training(): class MatmulParam(nn.Module): def __init__(self): diff --git a/pytest.ini b/pytest.ini index f0cbd699b..d91c55a39 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,6 +4,11 @@ # Ignore specific tests addopts = -svv +# Add pytest markers +markers = + push: marks tests as push + nightly: marks tests as nightly + # Where pytest should look for tests testpaths = # Ops From 0f1d8ee0265b6e29ccda004728ac7767a4075583 Mon Sep 17 00:00:00 2001 From: Vladimir Milosevic <157983820+vmilosevic@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:18:05 +0100 Subject: [PATCH 12/18] Uplift third_party/tt-mlir to origin/main 2024-11-07 (#418) --- third_party/tt-mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/tt-mlir b/third_party/tt-mlir index b75d44d98..ae9352400 160000 --- a/third_party/tt-mlir +++ b/third_party/tt-mlir @@ -1 +1 @@ -Subproject commit b75d44d98fdba061b1c1da396cde9a42c07911a9 +Subproject commit ae93524006b67741fa9116d184b05b2cc8584cc1 From fcbee4685d2a60fbe6ccc1a0d8b274e22110f57d Mon Sep 17 00:00:00 2001 From: Nikola Vukobrat <124874832+nvukobratTT@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:20:22 +0100 Subject: [PATCH 13/18] Skip failing model until issue is resolved (#638) Depending on which machine the Llama inference test lands on, it'll either pass or fail due to system DRAM limitations. In short, during compile time Llama inference requires around 32GB of system memory, which is at the limit for most machines. Therefore, until this issue is resolved, we're skipping this test to unblock our CI. --- forge/test/mlir/llama/test_llama_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forge/test/mlir/llama/test_llama_inference.py b/forge/test/mlir/llama/test_llama_inference.py index 38ad8a9b7..af439c2f6 100644 --- a/forge/test/mlir/llama/test_llama_inference.py +++ b/forge/test/mlir/llama/test_llama_inference.py @@ -11,8 +11,8 @@ @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"]) -@pytest.mark.xfail() @pytest.mark.push +@pytest.mark.skip(reason="Out of system memory during compile time. 
Skipping until resolved") def test_llama_inference(model_path): if model_path == "meta-llama/Llama-3.2-1B": pytest.skip("Skipping test for Llama-3.2-1B model, waiting for new transformers version.") From 895733905cca6209cac00b3454731fc6595dd612 Mon Sep 17 00:00:00 2001 From: Nikola Vukobrat <124874832+nvukobratTT@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:10:00 +0100 Subject: [PATCH 14/18] Generate op tests based on run model (#599) - Introduce new compiler configuration: tvm_generate_op_tests (true/false) - Introduce logic that will generate op tests for a model we run _Note: This change is based on few dependant issues that are cherry-picked. Feel free to review only last commit_ Fix #589 --- forge/forge/config.py | 3 ++ forge/forge/tvm_to_python.py | 86 ++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/forge/forge/config.py b/forge/forge/config.py index 9b403b460..3b292f569 100644 --- a/forge/forge/config.py +++ b/forge/forge/config.py @@ -184,6 +184,9 @@ class CompilerConfig: # Number of patterns to match for each module tvm_module_to_num_patterns: Dict[str, int] = field(default_factory=lambda: dict()) + # If enabled, for given test, it generates Forge Modules in form of PyTest for each op that exists in given module + tvm_generate_op_tests: bool = False + # Enables a transform for conv that directly reads input, such that it goes from stride > 1 to stride = 1 # This usually translates to lower DRAM BW and less math as the input better populates tiles enable_conv_prestride: bool = True diff --git a/forge/forge/tvm_to_python.py b/forge/forge/tvm_to_python.py index 671cc7031..e5c284344 100644 --- a/forge/forge/tvm_to_python.py +++ b/forge/forge/tvm_to_python.py @@ -9,6 +9,7 @@ import torch import numpy as np +import pytest # import forge._C.pattern_matcher as pypattern_matcher from forge.module import OnnxModule, ForgeModule, TFLiteModule @@ -2682,7 +2683,92 @@ def delete_unneeded_outputs(ops, returns): modules.append(writer) + # Generate op tests based on requested model. Currently only supported + # for PyTorch framework. + if compiler_cfg.tvm_generate_op_tests: + generate_op_tests( + ops, + current_module_name, + framework, + contains_incompatible_np_floats, + delete_inputs, + params, + constants, + param_names, + param_file_name, + names_params_file_name, + named_buffers_file_name, + ) + + # Exit python progrems without error + # - Two different exit methods depending on whether compile is run using + # pytest, or as a standalone python script + if "pytest" in sys.modules: + pytest.exit("Exiting test without error", returncode=0) + else: + sys.exit(0) + if compiler_cfg.retain_tvm_python_files: save_writers_metadata(modules, flattened_pytorch_inputs, forge_inputs, graph_name) return modules, forge_inputs + + +def generate_op_tests( + ops, + current_module_name, + framework, + contains_incompatible_np_floats, + delete_inputs, + params, + constants, + param_names, + param_file_name, + names_params_file_name, + named_buffers_file_name, +): + """ + Generates test modules for a list of operations. + + This function creates unique test modules for each operation in the provided list. + It initializes a ForgeWriter to generate the necessary code for testing each operation, + including headers, class definitions, forward functions, parameter parsers, and pytest functions. + The generated tests are designed to run the operations as standalone tests. 
+ """ + for op_idx, key in enumerate(sorted(ops)): + # Create unique module name + module_name = "test_" + current_module_name.lower() + str(op_idx) + + # Initialize Forge writer and generate header and class definition + writer = ForgeWriter( + module_name, + framework, + contains_incompatible_np_floats=contains_incompatible_np_floats, + delete_inputs=delete_inputs, + ) + writer.write_header() + writer.write_class_definition(params, constants) + + # Focus on generating test for a single op + single_op = {key: ops[key]} + + # Create new inputs for the single op + new_inputs = {} + for i, input_name in enumerate(single_op[key].input_names): + # Detected parameter as input, insert dummy input + # TODO: Need to handle this case better. Probably just ignoring + # model parameters, and using new generated inputs. + if "." in input_name: + input_name = "dummy_input_" + str(i) + new_inputs[input_name] = input_name + + # Force output to be same as the op we're running + single_return = {key: single_op[key].output_name} + + # Generate forward function and parameter parser (loading params and constants) + writer.write_forward(single_op, new_inputs, single_return) + writer.write_param_parser(param_names, param_file_name, names_params_file_name, named_buffers_file_name) + + # Generate pytest function that enables runing Forge Module as standalone test + writer.write_pytest_function(module_name, single_op[key].input_shapes) + writer.close_file() From 93f2df6a385bc278b015f563c70e4f5c5311374a Mon Sep 17 00:00:00 2001 From: Vladimir Brkic <161027113+vbrkicTT@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:48:57 +0100 Subject: [PATCH 15/18] Split VerifyUtils (#553) Steps: setup, create, verify Support customization of input tensors --- forge/test/operators/utils/compat.py | 7 ++- forge/test/operators/utils/utils.py | 74 +++++++++++++++++++++++----- 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/forge/test/operators/utils/compat.py b/forge/test/operators/utils/compat.py index 3745b4391..d85e10166 100644 --- a/forge/test/operators/utils/compat.py +++ b/forge/test/operators/utils/compat.py @@ -240,14 +240,17 @@ def verify_module( verify_module_for_inputs(model, inputs, pcc, dev_data_format) +# TODO move to class TestTensorsUtils def create_torch_inputs( input_shapes: List[TensorShape], dev_data_format: forge.DataFormat = None, value_range: Optional[Union[ValueRanges, ValueRange, OperatorParameterTypes.RangeValue]] = None, - random_seed: int = 42, + random_seed: Optional[int] = None, ) -> List[torch.Tensor]: - # TODO configure manual seed + if random_seed is None: + # Set a default seed if not provided + random_seed = 42 generator = torch.Generator().manual_seed(random_seed) dtype = TestTensorsUtils.get_dtype_for_df(dev_data_format) diff --git a/forge/test/operators/utils/utils.py b/forge/test/operators/utils/utils.py index fd215d47b..d53c7d806 100644 --- a/forge/test/operators/utils/utils.py +++ b/forge/test/operators/utils/utils.py @@ -23,7 +23,8 @@ from forge.config import _get_global_compiler_config from forge._C import MathFidelity -from .compat import TestDevice, verify_module +from .compat import TestDevice +from .compat import create_torch_inputs, verify_module_for_inputs from .datatypes import ValueRanges @@ -111,8 +112,9 @@ def warm_reset(): class VerifyUtils: """Utility functions for Forge verification""" - @staticmethod + @classmethod def verify( + cls, model: Module, test_device: TestDevice, input_shapes: List[TensorShape], @@ -122,6 +124,7 @@ def verify( dev_data_format: 
forge.DataFormat = None, math_fidelity: forge.MathFidelity = None, value_range: Optional[ValueRanges] = None, + random_seed: Optional[int] = None, warm_reset: bool = False, ): """Perform Forge verification on the model @@ -136,9 +139,37 @@ def verify( dev_data_format: Data format math_fidelity: Math fidelity value_range: Value range of input tensors + random_seed: Random seed warm_reset: Warm reset the device before verification """ + cls.setup( + input_source_flag=input_source_flag, + math_fidelity=math_fidelity, + warm_reset=warm_reset, + ) + + inputs = cls.create_torch_inputs( + input_shapes=input_shapes, + dev_data_format=dev_data_format, + value_range=value_range, + random_seed=random_seed, + ) + + cls.verify_module_for_inputs( + model=model, + inputs=inputs, + pcc=pcc, + dev_data_format=dev_data_format, + ) + + @classmethod + def setup( + cls, + input_source_flag: InputSourceFlags = None, + math_fidelity: forge.MathFidelity = None, + warm_reset: bool = False, + ): if warm_reset: DeviceUtils.warm_reset() @@ -151,19 +182,38 @@ def verify( # if dev_data_format: # input_params.append({"dev_data_format": dev_data_format}) - verify_module( - model, + @classmethod + def create_torch_inputs( + cls, + input_shapes: List[TensorShape], + dev_data_format: forge.DataFormat = None, + value_range: Optional[ValueRanges] = None, + random_seed: Optional[int] = None, + ) -> List[torch.Tensor]: + + inputs = create_torch_inputs( input_shapes=input_shapes, - # verify_cfg=VerifyConfig( - # test_kind=TestKind.INFERENCE, - # devtype=test_device.devtype, - # arch=test_device.arch, - # pcc=pcc, - # ), - # input_params=[input_params], - pcc=pcc, dev_data_format=dev_data_format, value_range=value_range, + random_seed=random_seed, + ) + + return inputs + + @classmethod + def verify_module_for_inputs( + cls, + model: Module, + inputs: List[torch.Tensor], + pcc: Optional[float] = None, + dev_data_format: forge.DataFormat = None, + ): + + verify_module_for_inputs( + model=model, + inputs=inputs, + pcc=pcc, + dev_data_format=dev_data_format, ) From 3123165512bd41c0d20a607c3bbef8a0a4da52f8 Mon Sep 17 00:00:00 2001 From: Vladica Obojevic <163161189+vobojevicTT@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:24:24 +0100 Subject: [PATCH 16/18] Test PyTorch unary operators (#423) --- .../pytorch/eltwise_unary/__init__.py | 3 + .../pytorch/eltwise_unary/test_unary.py | 244 ++++++++++++++++++ forge/test/operators/utils/failing_reasons.py | 1 + 3 files changed, 248 insertions(+) create mode 100644 forge/test/operators/pytorch/eltwise_unary/__init__.py create mode 100644 forge/test/operators/pytorch/eltwise_unary/test_unary.py diff --git a/forge/test/operators/pytorch/eltwise_unary/__init__.py b/forge/test/operators/pytorch/eltwise_unary/__init__.py new file mode 100644 index 000000000..2332467ef --- /dev/null +++ b/forge/test/operators/pytorch/eltwise_unary/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 diff --git a/forge/test/operators/pytorch/eltwise_unary/test_unary.py b/forge/test/operators/pytorch/eltwise_unary/test_unary.py new file mode 100644 index 000000000..10d946f40 --- /dev/null +++ b/forge/test/operators/pytorch/eltwise_unary/test_unary.py @@ -0,0 +1,244 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + + +# GENERAL OP SUPPORT TEST PLAN: +# 1. Operand type - any supported type (e.g. add, matmul, conv2d, etc.) +# 2. 
Operand source(s): +# (+) 2.1 From another op +# - Operator -> input +# (+) 2.2 From DRAM queue +# - Operator is first node in network +# - Input_queue flag = false +# (+) 2.3 Const Inputs (const eval pass) +# - Operator where all inputs are constants. +# (+) 2.4 From host +# - Input tensor as input of network +# - Operator is first node in network +# - Input_queue flag = true +# 3. Tensor ranks: +# (+) 3.1 Full tensor (i.e. full expected shape) +# - 3-4 by default P1 (high priority) +# - 2, 5, ++ include P2 (lower priority) +# (+) 3.2 Tensor reduce on one or more dims to 1 +# - Vector +# - Only one dim is not equal to 1 +# (-) 3.3 Scalar P2 +# - Create tensor of dimension equal to 0 (tensor from scalar) or just to use scalar as simple value +# 4. Operand / output size of dimensions (few examples of each, 10 values total) +# (+) 4.1 Divisible by 32 +# (+) 4.2 Prime numbers +# (+) 4.3 Very large (thousands, 10s of thousands) +# - 100x100, 100x1000 +# - maybe nightly only +# (+) 4.4 Extreme ratios between height/width +# (/) 4.5 ...probably many more interesting combinations here +# 5. Data format - all supported formats +# (/) 5.1 Output DF +# (/) 5.2 Intermediate DF +# (/) 5.3 Accumulation DF +# (+) 5.4 Operand DFs +# - Fix HiFi4 for math fidelity value +# (+) 6. Math fidelity - LoFi, HiFi2a, Hifi2b, Hifi3, Hifi4 +# - Fix fp16b (default) for data format value +# (/) 7. Special attributes - if applicable.. like approx_mode for Exp, for example +# (/) 8. Special cases - if applicable +# 9. Variable number of operands - if applicable +# (/) Few representative values +# (/) Reuse inputs for selected operators + + +import pytest +import torch +import torch.nn as nn +import forge +from forge.op_repo import TensorShape + +from typing import List, Dict +from loguru import logger +from forge import MathFidelity, DataFormat + +from test.operators.utils import InputSourceFlags, VerifyUtils +from test.operators.utils import InputSource +from test.operators.utils import TestVector +from test.operators.utils import TestPlan +from test.operators.utils import FailingReasons +from test.operators.utils.compat import TestDevice +from test.operators.utils import TestCollection +from test.operators.utils import TestCollectionCommon +from test.operators.utils import ValueRanges + + +class ModelFromAnotherOp(nn.Module): + def __init__(self, operator): + super().__init__() + self.testname = "Element_wise_unary_operators_test_op_src_from_another_op" + self.operator = operator + + def forward(self, x): + xx = torch.add(x, x) + return self.operator(xx) + + +class ModelDirect(nn.Module): + def __init__(self, operator): + super().__init__() + self.testname = "Element_wise_unary_operators_test_op_src_from_host" + self.operator = operator + + def forward(self, x): + return self.operator(x) + + +class ModelConstEvalPass(nn.Module): + def __init__(self, operator, shape: TensorShape): + super().__init__() + self.testname = "Element_wise_unary_operators_test_op_src_const_eval_pass" + self.operator = operator + self.c = (torch.rand(shape, requires_grad=False) - 0.5).detach() + + def forward(self, x): + cc = self.operator(self.c) + xx = self.operator(x) + return torch.add(xx, cc) + + +class TestVerification: + + MODEL_TYPES = { + InputSource.FROM_ANOTHER_OP: ModelFromAnotherOp, + InputSource.FROM_HOST: ModelDirect, + InputSource.FROM_DRAM_QUEUE: ModelDirect, + InputSource.CONST_EVAL_PASS: ModelConstEvalPass, + } + + @classmethod + def verify( + cls, + test_device: TestDevice, + test_vector: TestVector, + input_params: 
List[Dict] = [], + warm_reset: bool = False, + ): + + input_source_flag: InputSourceFlags = None + if test_vector.input_source in (InputSource.FROM_DRAM_QUEUE,): + input_source_flag = InputSourceFlags.FROM_DRAM + + operator = getattr(torch, test_vector.operator) + + model_type = cls.MODEL_TYPES[test_vector.input_source] + pytorch_model = ( + model_type(operator, test_vector.input_shape) + if test_vector.input_source in (InputSource.CONST_EVAL_PASS,) + else model_type(operator) + ) + + input_shapes = tuple([test_vector.input_shape]) + + logger.trace(f"***input_shapes: {input_shapes}") + + VerifyUtils.verify( + model=pytorch_model, + test_device=test_device, + input_shapes=input_shapes, + input_params=input_params, + input_source_flag=input_source_flag, + dev_data_format=test_vector.dev_data_format, + math_fidelity=test_vector.math_fidelity, + pcc=test_vector.pcc, + warm_reset=warm_reset, + value_range=ValueRanges.SMALL, + ) + + +class TestParamsData: + + __test__ = False + + test_plan: TestPlan = None + + +class TestCollectionData: + + __test__ = False + + implemented = TestCollection( + operators=["relu", "sqrt", "reciprocal", "sigmoid"], + ) + + +TestParamsData.test_plan = TestPlan( + verify=lambda test_device, test_vector: TestVerification.verify( + test_device, + test_vector, + ), + collections=[ + # Test operators with all shapes and input sources collection: + TestCollection( + operators=TestCollectionData.implemented.operators, + input_sources=TestCollectionCommon.all.input_sources, + input_shapes=TestCollectionCommon.all.input_shapes, + ), + # Test Data formats collection: + TestCollection( + operators=TestCollectionData.implemented.operators, + input_sources=TestCollectionCommon.single.input_sources, + input_shapes=TestCollectionCommon.single.input_shapes, + dev_data_formats=[ + item + for item in TestCollectionCommon.all.dev_data_formats + if item not in TestCollectionCommon.single.dev_data_formats + ], + math_fidelities=TestCollectionCommon.single.math_fidelities, + ), + # Test Math fidelities collection: + TestCollection( + operators=TestCollectionData.implemented.operators, + input_sources=TestCollectionCommon.single.input_sources, + input_shapes=TestCollectionCommon.single.input_shapes, + dev_data_formats=TestCollectionCommon.single.dev_data_formats, + math_fidelities=TestCollectionCommon.all.math_fidelities, + ), + ], + failing_rules=[ + # Skip 2D shapes as we don't test them: + TestCollection( + criteria=lambda test_vector: len(test_vector.input_shape) in (2,), + skip_reason=FailingReasons.NOT_IMPLEMENTED, + ), + TestCollection( + operators=["reciprocal"], + input_sources=[InputSource.FROM_HOST], + input_shapes=[(1, 2, 3, 4)], + dev_data_formats=[ + DataFormat.Int8, + DataFormat.Int32, + ], + math_fidelities=[MathFidelity.HiFi4], + failing_reason=FailingReasons.DATA_MISMATCH, + ), + TestCollection( + operators=["sigmoid"], + input_sources=[InputSource.FROM_HOST], + input_shapes=[(1, 2, 3, 4)], + dev_data_formats=[ + DataFormat.RawUInt8, + DataFormat.RawUInt16, + DataFormat.RawUInt32, + DataFormat.Int8, + DataFormat.UInt16, + DataFormat.Int32, + ], + math_fidelities=[MathFidelity.HiFi4], + failing_reason=FailingReasons.DATA_MISMATCH, + ), + ], +) + + +def get_test_plans() -> List[TestPlan]: + return [ + TestParamsData.test_plan, + ] diff --git a/forge/test/operators/utils/failing_reasons.py b/forge/test/operators/utils/failing_reasons.py index 024181a67..21e8058de 100644 --- a/forge/test/operators/utils/failing_reasons.py +++ b/forge/test/operators/utils/failing_reasons.py @@ 
-89,6 +89,7 @@ def validate_exception_message( ], FailingReasons.DATA_MISMATCH: [ lambda ex: isinstance(ex, AssertionError) and f"{ex}" == "PCC check failed", + lambda ex: isinstance(ex, AssertionError) and f"{ex}".startswith("Data mismatch"), ], FailingReasons.UNSUPPORTED_SPECIAL_CASE: [ lambda ex: isinstance(ex, AssertionError) and f"{ex}" == "PCC check failed", From 239ee5bd1b158b1818690424530963013f992fc4 Mon Sep 17 00:00:00 2001 From: Ashok Kumar Kannan <160501980+ashokkumarkannan1@users.noreply.github.com> Date: Thu, 7 Nov 2024 21:32:33 +0530 Subject: [PATCH 17/18] Fix: Invalid padding calculation in FuseConvAndPoolPadding PatternCallback (#616) --- forge/test/mlir/test_ops.py | 83 +++++++++++++++++++++++++++++++++++++ third_party/tvm | 2 +- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/forge/test/mlir/test_ops.py b/forge/test/mlir/test_ops.py index 376e42209..dbc9de64b 100644 --- a/forge/test/mlir/test_ops.py +++ b/forge/test/mlir/test_ops.py @@ -1364,3 +1364,86 @@ def forward(self, x): co_out = [co.to("cpu") for co in co_out] fw_out = [fw_out] if isinstance(fw_out, torch.Tensor) else fw_out assert all([compare_with_golden(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)]) + + +@pytest.mark.parametrize("shape", [(1, 3, 32, 32)]) +@pytest.mark.parametrize( + "padding", + [ + pytest.param( + (1, 1, 1, 1), + marks=pytest.mark.xfail(reason="'ttnn.conv2d' op Bias must only have data on the final dimenstion"), + ), + pytest.param( + (1, 1, 2, 2), + marks=pytest.mark.xfail(reason="'ttnn.conv2d' op Bias must only have data on the final dimenstion"), + ), + pytest.param( + (1, 2, 1, 2), + marks=pytest.mark.xfail( + reason="TTNN only supports padding height/width attributes. Thus, padding_top " + "must equal padding_bottom for the op to execute as expected." 
+            ),
+        ),
+    ],
+)
+@pytest.mark.xfail(reason="'ttnn.conv2d' op Bias must only have data on the final dimenstion")
+def test_conv2d_with_padding(shape, padding):
+    class PaddingAndConv2d(nn.Module):
+        def __init__(self, padding):
+            super().__init__()
+            self.padding = padding
+            self.conv = nn.Conv2d(3, 3, kernel_size=3, stride=1, padding=0)
+
+        def forward(self, x):
+            x = nn.functional.pad(x, self.padding, mode="constant", value=0)
+            return self.conv(x)
+
+    framework_model = PaddingAndConv2d(padding=padding)
+
+    inputs = [torch.rand(shape)]
+    fw_out = framework_model(*inputs)
+
+    compiled_model = forge.compile(framework_model, sample_inputs=inputs)
+    co_out = compiled_model(*inputs)
+
+    co_out = [co.to("cpu") for co in co_out]
+    fw_out = [fw_out] if isinstance(fw_out, torch.Tensor) else fw_out
+    assert all([compare_with_golden(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)])
+
+
+@pytest.mark.parametrize("shape", [(1, 3, 32, 32)])
+@pytest.mark.parametrize(
+    "padding",
+    [
+        pytest.param(
+            (1, 1, 1, 1),
+            marks=pytest.mark.xfail(reason="For interleaved-buffers page size should be divisible by buffer size"),
+        ),
+        pytest.param(
+            (1, 1, 2, 2),
+            marks=pytest.mark.xfail(
+                reason="Page size must be divisible by sizeof(uint32_t) because buffers hold uint32_t values"
+            ),
+        ),
+    ],
+)
+def test_maxpool2d_with_padding(shape, padding):
+    class PaddingAndMaxPool2d(nn.Module):
+        def __init__(self, padding):
+            super().__init__()
+            self.padding = padding
+            self.pool = nn.MaxPool2d(3, stride=3, padding=0)
+
+        def forward(self, x):
+            x = nn.functional.pad(x, self.padding, mode="constant", value=0)
+            return self.pool(x)
+
+    framework_model = PaddingAndMaxPool2d(padding=padding)
+    framework_model = framework_model.to(torch.bfloat16)
+
+    inputs = [torch.rand(shape).to(torch.bfloat16)]
+    fw_out = framework_model(*inputs)
+
+    compiled_model = forge.compile(framework_model, sample_inputs=inputs)
+    co_out = compiled_model(*inputs)
+
+    co_out = [co.to("cpu") for co in co_out]
+    fw_out = [fw_out] if isinstance(fw_out, torch.Tensor) else fw_out
+    assert all([compare_with_golden(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)])
diff --git a/third_party/tvm b/third_party/tvm
index a0fdca689..e91c07cfe 160000
--- a/third_party/tvm
+++ b/third_party/tvm
@@ -1 +1 @@
-Subproject commit a0fdca689d19a8e4bbc7feb78489776857a2b605
+Subproject commit e91c07cfe4710c72a9976c791205032aff60d52b

From e089fdc8310e0503711907a00617657e3744bd8f Mon Sep 17 00:00:00 2001
From: Vladimir Brkic <161027113+vbrkicTT@users.noreply.github.com>
Date: Thu, 7 Nov 2024 19:02:29 +0100
Subject: [PATCH 18/18] Test torch binary operators via test plan (#240)

Supported ops: add, sub, mul, div, ge

Select operand source models for Forge:
- Supported:
  - from another op
  - from dram
  - from host
  - const eval pass
- Removed tm edge and prologued constants
- mf and df tests

Inconsistency tests

Issue #239
---
 .../pytorch/eltwise_binary/__init__.py        |   3 +
 .../pytorch/eltwise_binary/failing_rules.py   | 300 ++++++++++
 .../pytorch/eltwise_binary/test_binary.py     | 531 ++++++++++++++++++
 .../operators/pytorch/test_inconsistency.py   |  95 ++++
 forge/test/operators/utils/failing_reasons.py |   3 +-
 forge/test/operators/utils/test_data.py       |  41 ++
 6 files changed, 972 insertions(+), 1 deletion(-)
 create mode 100644 forge/test/operators/pytorch/eltwise_binary/__init__.py
 create mode 100644 forge/test/operators/pytorch/eltwise_binary/failing_rules.py
 create mode 100644 forge/test/operators/pytorch/eltwise_binary/test_binary.py
 create mode 100644 forge/test/operators/pytorch/test_inconsistency.py
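Note on the kwargs exercised by this patch: for torch.add and torch.sub, the alpha kwarg scales the second operand before the add/subtract, and torch.div's rounding_mode kwarg selects how the quotient is rounded. A minimal standalone sketch of those semantics (plain PyTorch, independent of Forge; the tensor values are illustrative only):

import torch

x = torch.tensor([1.0, 2.0])
y = torch.tensor([10.0, 20.0])

# torch.add(x, y, alpha=a) computes x + a * y, so alpha != 1 changes the result
assert torch.equal(torch.add(x, y, alpha=2), x + 2 * y)
# torch.sub(x, y, alpha=a) computes x - a * y
assert torch.equal(torch.sub(x, y, alpha=2), x - 2 * y)

# rounding_mode="trunc" rounds the quotient towards zero, "floor" rounds down
q = torch.tensor([-7.0])
d = torch.tensor([2.0])
assert torch.equal(torch.div(q, d, rounding_mode="trunc"), torch.tensor([-3.0]))
assert torch.equal(torch.div(q, d, rounding_mode="floor"), torch.tensor([-4.0]))

diff --git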
a/forge/test/operators/pytorch/eltwise_binary/__init__.py b/forge/test/operators/pytorch/eltwise_binary/__init__.py new file mode 100644 index 000000000..2332467ef --- /dev/null +++ b/forge/test/operators/pytorch/eltwise_binary/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 diff --git a/forge/test/operators/pytorch/eltwise_binary/failing_rules.py b/forge/test/operators/pytorch/eltwise_binary/failing_rules.py new file mode 100644 index 000000000..acab8617f --- /dev/null +++ b/forge/test/operators/pytorch/eltwise_binary/failing_rules.py @@ -0,0 +1,300 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Failing rules for element-wise binary operators + + +import forge + +from test.operators.utils import InputSource +from test.operators.utils import TestCollection +from test.operators.utils import TestResultFailing +from test.operators.utils import FailingReasons +from test.operators.utils import TestCollectionCommon + + +class FailingRulesData: + + common = [ + # PCC check fails for all input sources and buggy shapes + TestCollection( + input_shapes=[ + (1, 1), + ], + failing_reason=FailingReasons.DATA_MISMATCH, + ), + # PCC check fails for CONST_EVAL_PASS and buggy shapes + TestCollection( + input_sources=[ + InputSource.CONST_EVAL_PASS, + ], + input_shapes=[ + (11, 45, 17), + (10, 1000, 100), + (10, 10000, 1), + (32, 32, 64), + # fail only for const eval pass not for other models + (2, 3, 4), + (11, 1, 23), + (11, 64, 1), + (100, 100, 100), + (64, 160, 96), + (11, 17, 41), + (13, 89, 3), + ], + failing_reason=FailingReasons.DATA_MISMATCH, + ), + # PCC check fails for FROM_HOST and all int dev data formats + TestCollection( + input_sources=[ + InputSource.FROM_HOST, + ], + dev_data_formats=TestCollectionCommon.int.dev_data_formats, + failing_reason=FailingReasons.DATA_MISMATCH, + ), + ] + + add = [ + # PCC check fails for all input sources and buggy shapes + common[0], + # PCC check fails for CONST_EVAL_PASS and buggy shapes + common[1], + # PCC check fails for FROM_HOST and all int dev data formats + common[2], + # PCC check fails for buggy shapes for add + TestCollection( + input_sources=[ + InputSource.FROM_DRAM_QUEUE, + InputSource.CONST_EVAL_PASS, + ], + dev_data_formats=[ + None, + ], + criteria=lambda test_vector: test_vector.kwargs is not None + and "alpha" in test_vector.kwargs + and test_vector.kwargs["alpha"] != 1, + failing_reason=FailingReasons.DATA_MISMATCH, + ), + TestCollection( + input_sources=[ + InputSource.CONST_EVAL_PASS, + ], + dev_data_formats=[ + None, + ], + criteria=lambda test_vector: test_vector.kwargs is not None + and "alpha" in test_vector.kwargs + and len(test_vector.input_shape) == 2 + and test_vector.input_shape != (1, 1) + and test_vector.input_shape[-1] == 1, + failing_reason=None, + ), + TestCollection( + input_sources=[ + InputSource.CONST_EVAL_PASS, + ], + input_shapes=[ + (1, 3), + ], + kwargs=[ + { + "alpha": 0.17234435, + }, + ], + # dev_data_formats=[None,], + failing_reason=None, + ), + TestCollection( + input_sources=[ + InputSource.FROM_HOST, + ], + # dev_data_formats=TestCollectionCommon.float.dev_data_formats, + criteria=lambda test_vector: test_vector.kwargs is not None + and "alpha" in test_vector.kwargs + and test_vector.kwargs["alpha"] != 1, + failing_reason=FailingReasons.DATA_MISMATCH, + ), + TestCollection( + input_sources=[ + InputSource.FROM_ANOTHER_OP, + ], + criteria=lambda test_vector: test_vector.kwargs is 
not None + and "alpha" in test_vector.kwargs + and test_vector.kwargs["alpha"] < 0, + failing_reason=FailingReasons.DATA_MISMATCH, + ), + ] + + sub = [ + # PCC check fails for all input sources and buggy shapes + common[0], + # Exception from DATA_MISMATCH + TestCollection( + input_sources=[ + InputSource.FROM_ANOTHER_OP, + ], + input_shapes=[ + (1, 1), + ], + failing_reason=None, + ), + # PCC check fails for CONST_EVAL_PASS and buggy shapes + common[1], + # PCC check fails for FROM_HOST and all int dev data formats + common[2], + TestCollection( + input_sources=[ + InputSource.FROM_ANOTHER_OP, + ], + input_shapes=[ + (1, 1), + ], + criteria=lambda test_vector: test_vector.kwargs is not None + and "alpha" in test_vector.kwargs + and test_vector.kwargs["alpha"] != 1, + failing_reason=FailingReasons.DATA_MISMATCH, + ), + ] + + mul = [ + # PCC check fails for all input sources and buggy shapes + common[0], + # PCC check fails for CONST_EVAL_PASS and buggy shapes + common[1], + # PCC check fails for FROM_HOST and all int dev data formats + common[2], + # Exception from DATA_MISMATCH + TestCollection( + input_sources=[ + InputSource.CONST_EVAL_PASS, + ], + input_shapes=[ + (2, 3, 4), + ], + failing_reason=None, + ), + ] + + div = [ + # PCC check fails for all input sources and buggy shapes + common[0], + # PCC check fails for CONST_EVAL_PASS and buggy shapes + common[1], + # PCC check fails for FROM_HOST and all int dev data formats + common[2], + # Failing when testing with LARGE + TestCollection( + input_sources=[ + InputSource.FROM_HOST, + ], + # input_shapes=[ + # (1, 2, 3, 4), + # ], + dev_data_formats=TestCollectionCommon.float.dev_data_formats, + kwargs=[ + { + "rounding_mode": "trunc", + }, + ], + failing_reason=FailingReasons.DATA_MISMATCH, + ), + # PCC check fails for buggy shapes for div + TestCollection( + input_sources=[ + InputSource.FROM_HOST, + InputSource.FROM_DRAM_QUEUE, + ], + input_shapes=[ + (1, 4), # TODO remove fixed + (1, 3), # TODO remove fixed + (3, 4), # TODO remove fixed + (1, 3, 4), # TODO remove fixed + # (12, 64, 160, 96), + ], + kwargs=[ + { + "rounding_mode": "trunc", + }, + { + "rounding_mode": "floor", + }, + ], + failing_reason=FailingReasons.DATA_MISMATCH, + ), + # PCC check fails for buggy shapes for div + TestCollection( + input_sources=[ + InputSource.CONST_EVAL_PASS, + ], + input_shapes=[ + (1, 17), # TODO remove fixed + (45, 17), # TODO remove fixed + ], + kwargs=[ + { + "rounding_mode": "trunc", + }, + { + "rounding_mode": "floor", + }, + ], + failing_reason=FailingReasons.DATA_MISMATCH, + ), + # PCC check fails for buggy shapes for div + TestCollection( + input_sources=[ + InputSource.CONST_EVAL_PASS, + ], + input_shapes=[ + (1, 41), + (17, 41), + (1, 2, 3, 4), + (2, 2, 3, 4), + ], + kwargs=[ + { + "rounding_mode": "trunc", + }, + ], + failing_reason=FailingReasons.DATA_MISMATCH, + ), + ] + + ge = [ + # PCC check fails for all input sources and buggy shapes + common[0], + # Exception from DATA_MISMATCH + TestCollection( + input_sources=[ + InputSource.FROM_ANOTHER_OP, + InputSource.FROM_HOST, + InputSource.FROM_DRAM_QUEUE, + ], + input_shapes=[ + (1, 1), + ], + failing_reason=None, + ), + # PCC check fails for CONST_EVAL_PASS and buggy shapes + common[1], + # PCC check fails for FROM_HOST and all int dev data formats + common[2], + # PCC check fails for buggy shapes for ge + TestCollection( + input_sources=[ + InputSource.FROM_HOST, + InputSource.FROM_DRAM_QUEUE, + ], + input_shapes=[ + (1, 1000), + (5, 11, 64, 1), + # fail when dtype=float32 or 
generator
+            # (17, 41),
+            # (89, 3),
+            # (1, 17, 41),
+            # (1, 89, 3),
+        ],
+        failing_reason=FailingReasons.DATA_MISMATCH,
+    ),
+]
diff --git a/forge/test/operators/pytorch/eltwise_binary/test_binary.py b/forge/test/operators/pytorch/eltwise_binary/test_binary.py
new file mode 100644
index 000000000..ff20acc11
--- /dev/null
+++ b/forge/test/operators/pytorch/eltwise_binary/test_binary.py
@@ -0,0 +1,531 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# Tests for element-wise binary operators
+#
+# In these tests we exercise PyTorch binary operators
+
+# GENERAL OP SUPPORT TEST PLAN:
+# 1. Operand type - any supported type
+# 2. Operand source(s):
+# (+) 2.1 From another op
+#   - Operator -> input
+# (+) 2.2 From DRAM queue
+#   - Operator is first node in network
+#   - Input_queue flag = false
+# (+) 2.3 Const Inputs (const eval pass)
+#   - Operator where all inputs are constants.
+# (+) 2.4 From host
+#   - Input tensor as input of network
+#   - Operator is first node in network
+#   - Input_queue flag = true
+# 3. Operand shapes type(s):
+# (+) 3.1 Full tensor (i.e. full expected shape)
+#   - 3-4 by default P1 (high priority)
+#   - 2, 5, ++ include P2 (lower priority)
+# (+) 3.2 Tensor reduce on one or more dims to 1
+#   - Vector
+#   - Only one dim is not equal to 1
+# (/) 3.3 Scalar P2
+#   - Create tensor of dimension equal to 0 (tensor from scalar) or just to use scalar as simple value
+# 4. Operand / output size of dimensions (few examples of each, 10 values total)
+# (+) 4.1 Divisible by 32
+# (+) 4.2 Prime numbers
+# (+) 4.3 Very large (thousands, 10s of thousands)
+#   - 100x100, 100x1000
+#   - maybe nightly only
+# (+) 4.4 Extreme ratios between height/width
+# 4.5 ...probably many more interesting combinations here
+# 5. Data format - all supported formats
+# (/) 5.1 Output DF
+# (/) 5.2 Intermediate DF
+# (/) 5.3 Accumulation DF
+# (+) 5.4 Operand DFs
+#   - Fix HiFi4 for math fidelity value
+# (+) 6. Math fidelity - LoFi, HiFi2a, HiFi2b, HiFi3, HiFi4
+#   - Fix fp16b (default) for data format value
+# (/) 7. Special attributes - if applicable, e.g. approx_mode for Exp
+# (/) 8. Special cases - if applicable
+# 9.
Variable number of operands - if applicable +# (/) Few representative values +# (/) Reuse inputs for selected operators + + +from typing import Callable, List, Tuple, Dict, Union, Optional +from loguru import logger + +import forge +import torch + +from test.operators.utils import ValueRanges +from test.operators.utils import InputSourceFlags, VerifyUtils +from test.operators.utils import ShapeUtils +from test.operators.utils import InputSource +from test.operators.utils import TestVector +from test.operators.utils import TestCollection +from test.operators.utils import TestPlan +from test.operators.utils import TestSuite +from test.operators.utils import TestCollectionCommon +from test.operators.utils import FailingReasons +from test.operators.utils.compat import TestDevice + +from .failing_rules import FailingRulesData + + +class ModelFromAnotherOp(torch.nn.Module): + + model_name = "model_op_src_from_another_op" + + def __init__(self, operator, opname, shape, kwargs): + super(ModelFromAnotherOp, self).__init__() + self.testname = "pytorch_eltwise_binary_" + opname + "_model_from_another_op" + self.operator = operator + self.kwargs = kwargs + + def forward(self, x: torch.Tensor, y: torch.Tensor): + # we use Add and Subtract operators to create two operands which are inputs for the binary operator + xx = torch.add(x, y) + yy = torch.add(x, y) # TODO temporary we use add operator, return to sub later + output = self.operator(xx, yy, **self.kwargs) + return output + + +class ModelDirect(torch.nn.Module): + + model_name = "model_op_src_from_host" + + def __init__(self, operator, opname, shape, kwargs): + super(ModelDirect, self).__init__() + self.testname = "pytorch_eltwise_binary_" + opname + "_model_direct" + self.operator = operator + self.kwargs = kwargs + + def forward(self, x: torch.Tensor, y: torch.Tensor): + output = self.operator(x, y, **self.kwargs) + return output + + +class ModelConstEvalPass(torch.nn.Module): + + model_name = "model_op_src_const_eval_pass" + + def __init__(self, operator, opname, shape, kwargs): + super(ModelConstEvalPass, self).__init__() + self.testname = "pytorch_eltwise_binary_" + opname + "_model_const_eval_pass" + self.operator = operator + self.kwargs = kwargs + + self.constant_shape = ShapeUtils.reduce_microbatch_size(shape) + + self.c1 = torch.rand(*self.constant_shape) - 0.5 + self.c2 = torch.rand(*self.constant_shape) - 0.5 + + def forward(self, x, y): + v1 = self.operator(self.c1, self.c2, **self.kwargs) + # v2 and v3 consume inputs + v2 = torch.add(x, y) + v3 = torch.add(v1, v2) + return v3 + + +class DivVerifyUtils(VerifyUtils): + @classmethod + def create_torch_inputs( + cls, input_shapes, dev_data_format=None, value_range=None, random_seed=None + ) -> List[torch.Tensor]: + inputs = super().create_torch_inputs(input_shapes, dev_data_format, value_range, random_seed) + + # Avoid zero value in the second operand to avoid division by zero + tensor = inputs[1] + tensor = torch.where(tensor == 0, torch.tensor(1), tensor) + inputs[1] = tensor + + return inputs + + +class TestVerification: + + MODEL_TYPES = { + InputSource.FROM_ANOTHER_OP: ModelFromAnotherOp, + InputSource.FROM_HOST: ModelDirect, + InputSource.FROM_DRAM_QUEUE: ModelDirect, + InputSource.CONST_EVAL_PASS: ModelConstEvalPass, + } + + @classmethod + def verify( + cls, + test_device: TestDevice, + test_vector: TestVector, + value_range: Optional[ValueRanges] = None, + VerifyUtils=VerifyUtils, + # number_of_operands: int = 2, + # input_params: List[Dict] = [], + ): + """Common verification 
function for all tests""" + + number_of_operands: int = 2 + input_params: List[Dict] = [] + + warm_reset = False + + input_source_flag: InputSourceFlags = None + if test_vector.input_source in (InputSource.FROM_DRAM_QUEUE,): + input_source_flag = InputSourceFlags.FROM_DRAM + + dev_data_format = test_vector.dev_data_format + # if test_vector.dev_data_format is not None: + # dev_data_format = test_vector.dev_data_format + # else: + # dev_data_format = TestCollectionCommon.single.dev_data_formats[0] + + if dev_data_format in TestCollectionCommon.int.dev_data_formats: + value_range = ValueRanges.LARGE + + if value_range is None: + value_range = ValueRanges.SMALL + + operator = getattr(torch, test_vector.operator) + + kwargs = test_vector.kwargs if test_vector.kwargs else {} + + model_type = cls.MODEL_TYPES[test_vector.input_source] + pytorch_model = model_type( + operator=operator, opname=test_vector.operator, shape=test_vector.input_shape, kwargs=kwargs + ) + # forge_model = forge.PyTorchModule(pytorch_model.model_name, pytorch_model) + + input_shapes = tuple([test_vector.input_shape for _ in range(number_of_operands)]) + logger.trace(f"***input_shapes: {input_shapes}") + + VerifyUtils.verify( + model=pytorch_model, + test_device=test_device, + input_shapes=input_shapes, + input_params=input_params, + input_source_flag=input_source_flag, + dev_data_format=dev_data_format, + math_fidelity=test_vector.math_fidelity, + value_range=value_range, + pcc=test_vector.pcc, + warm_reset=warm_reset, + ) + + +class TestParamsData: + + __test__ = False # Avoid collecting TestParamsData as a pytest test + + no_kwargs = [ + None, + ] + + kwargs_alpha_int = [ + {"alpha": 1}, + {"alpha": -37}, # TODO test this + {"alpha": 37}, + {}, + ] + + kwargs_alpha_float = [ + {"alpha": -37}, # TODO test this + {"alpha": 1}, # TODO test this + {"alpha": 37}, + {"alpha": 0.17234435}, + {"alpha": 589.34546459345}, + # { "alpha": None }, + {}, + ] + + kwargs_rounding_modes = [ + {"rounding_mode": "trunc"}, + {"rounding_mode": "floor"}, + {"rounding_mode": None}, + {}, + ] + + @classmethod + def generate_kwargs_alpha(cls, test_vector: TestVector): + if test_vector.dev_data_format in TestCollectionCommon.int.dev_data_formats: + return cls.kwargs_alpha_int + else: + return cls.kwargs_alpha_float + + +class TestCollectionData: + + __test__ = False # Avoid collecting TestCollectionData as a pytest test + + implemented = TestCollection( + operators=[ + "add", # #00 + "div", # #01 + # "divide", # #02 - Alias for div. + "mul", # #03 + # "multiply", # #04 - Alias for mul. + "sub", # #05 + # "subtract", # #06 - Alias for sub. + # "true_divide", # #07 - Alias for div with rounding_mode=None. + "ge", # #08 + # "greater_equal", # #09 - Alias for ge. + "ne", # #16 E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: not_equal # working with model const + # "greater", # #18 - Alias for gt. E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: greater + "gt", # #19 E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: greater # working with model const + "lt", # #21 E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: less # working with model const + # "less", # #22 - Alias for lt. 
E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: less + "maximum", # #23 E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: maximum # working with model const + "minimum", # #24 E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: minimum # working with model const + # "not_equal", # #25 - Alias for ne. E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: not_equal + ], + ) + + not_implemented = TestCollection( + operators=[ + "atan2", # #00 - NotImplementedError: The following operators are not implemented: ['aten::atan2'] + "arctan2", # #01 - NotImplementedError: The following operators are not implemented: ['aten::atan2'] + "bitwise_and", # #02 - RuntimeError: "bitwise_and_cpu" not implemented for 'Float' + "bitwise_or", # #03 - RuntimeError: "bitwise_or_cpu" not implemented for 'Float' + "bitwise_xor", # #04 - RuntimeError: "bitwise_xor_cpu" not implemented for 'Float' + "bitwise_left_shift", # #05 - RuntimeError: "lshift_cpu" not implemented for 'Float' + "bitwise_right_shift", # #06 - RuntimeError: "rshift_cpu" not implemented for 'Float' + "floor_divide", # #07 - AssertionError: Encountered unsupported op types. Check error logs for more details # working with model const + "fmod", # #08 - AssertionError: Encountered unsupported op types. Check error logs for more details # working with model const + "logaddexp", # #09 - NotImplementedError: The following operators are not implemented: ['aten::logaddexp'] + "logaddexp2", # #10 - NotImplementedError: The following operators are not implemented: ['aten::logaddexp2'] + "nextafter", # #11 - NotImplementedError: The following operators are not implemented: ['aten::nextafter'] + "remainder", # #12 - AssertionError: Encountered unsupported op types. Check error logs for more details # working with model const + "fmax", # #13 - NotImplementedError: The following operators are not implemented: ['aten::fmax'] + "fmin", # #14 - NotImplementedError: The following operators are not implemented: ['aten::fmin'] + "eq", # #15 E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: equal # working with model const + "le", # #17 E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: less_equal # working with model const + # "less_equal", # #20 - Alias for le. E RuntimeError: Unsupported operation for lowering from TTForge to TTIR: less_equal + ], + ) + + implemented_const_eval = TestCollection( + operators=[ + "floor_divide", + "fmod", + "remainder", + "eq", + "ne", + "le", + "gt", + "lt", + "maximum", + "minimum", + ], + ) + + alpha = TestCollection( + operators=[ + "add", # #00 + "sub", # #05 + # "subtract", # #06 - Alias for sub. + ], + ) + + rounding_mode = TestCollection( + operators=[ + "div", # #01 + # "divide", # #02 - Alias for div. + # "true_divide", # #07 - Alias for div with rounding_mode=None. + ], + ) + + no_params = TestCollection( + operators=[ + "mul", # #03 + # "multiply", # #04 - Alias for mul. + "ge", # #08 + # "greater_equal", # #09 - Alias for ge. 
+ ], + ) + + bitwise = TestCollection( + operators=[ + "bitwise_and", + "bitwise_or", + "bitwise_xor", + "bitwise_left_shift", + "bitwise_right_shift", + ], + ) + + all = TestCollection( + operators=implemented.operators, + input_sources=TestCollectionCommon.all.input_sources, + input_shapes=TestCollectionCommon.all.input_shapes, + dev_data_formats=TestCollectionCommon.all.dev_data_formats, + math_fidelities=TestCollectionCommon.all.math_fidelities, + ) + + single = TestCollection( + input_sources=TestCollectionCommon.single.input_sources, + input_shapes=TestCollectionCommon.single.input_shapes, + dev_data_formats=TestCollectionCommon.single.dev_data_formats, + math_fidelities=TestCollectionCommon.single.math_fidelities, + ) + + +class BinaryTestPlanBuilder: + """Helper class for building test plans for binary operators""" + + @classmethod + def build_test_collections( + cls, operator: str, generate_kwargs: Optional[Callable[[TestVector], List[Dict]]] = None, quick_mix=False + ) -> List[TestCollection]: + """Build test plan collections for binary operator""" + + operators = [operator] + + collections = [ + # Test plan: + # 2. Operand source(s): + # 3. Operand shapes type(s): + # 4. Operand / output size of dimensions + TestCollection( + operators=operators, + input_sources=TestCollectionData.all.input_sources, + input_shapes=TestCollectionData.all.input_shapes, + kwargs=generate_kwargs, + ), + # Test plan: + # 5. Data format + TestCollection( + operators=operators, + input_sources=TestCollectionData.single.input_sources, + input_shapes=TestCollectionData.single.input_shapes, + kwargs=generate_kwargs, + dev_data_formats=TestCollectionData.all.dev_data_formats, + math_fidelities=TestCollectionData.single.math_fidelities, + ), + # Test plan: + # 6. Math fidelity + TestCollection( + operators=operators, + input_sources=TestCollectionData.single.input_sources, + input_shapes=TestCollectionData.single.input_shapes, + kwargs=generate_kwargs, + dev_data_formats=TestCollectionData.single.dev_data_formats, + math_fidelities=TestCollectionData.all.math_fidelities, + ), + ] + + if quick_mix: + collections.append( + # Quick mix + # Extended test plan with multiple input sources, shapes and data formats + TestCollection( + operators=operators, + input_sources=TestCollectionData.all.input_sources, + input_shapes=TestCollectionCommon.quick.input_shapes, + kwargs=generate_kwargs, + dev_data_formats=TestCollectionCommon.quick.dev_data_formats, + math_fidelities=TestCollectionData.single.math_fidelities, + ) + ) + + return collections + + @classmethod + def build_test_plan( + cls, + operator: str, + value_range: ValueRanges, + generate_kwargs: Optional[Callable[[TestVector], List[Dict]]] = None, + quick_mix: bool = False, + VerifyUtils=VerifyUtils, + ) -> List[TestCollection]: + """Build test plan for a binary operator""" + + if generate_kwargs is None: + generate_kwargs = lambda test_vector: TestParamsData.no_kwargs + + failing_rules = getattr(FailingRulesData, operator) + + test_plan = TestPlan( + verify=lambda test_device, test_vector: TestVerification.verify( + test_device, + test_vector, + value_range=value_range, + VerifyUtils=VerifyUtils, + ), + collections=cls.build_test_collections(operator, generate_kwargs, quick_mix), + failing_rules=failing_rules, + ) + + return test_plan + + +class TestPlansData: + + __test__ = False # Avoid collecting TestPlansData as a pytest test + + add: TestPlan = BinaryTestPlanBuilder.build_test_plan( + "add", + value_range=ValueRanges.LARGE, + generate_kwargs=lambda 
test_vector: TestParamsData.generate_kwargs_alpha(test_vector),
+        quick_mix=False,
+    )
+
+    sub: TestPlan = BinaryTestPlanBuilder.build_test_plan(
+        "sub",
+        value_range=ValueRanges.SMALL,
+        generate_kwargs=lambda test_vector: TestParamsData.generate_kwargs_alpha(test_vector),
+        quick_mix=False,
+    )
+
+    mul: TestPlan = BinaryTestPlanBuilder.build_test_plan("mul", value_range=ValueRanges.SMALL, quick_mix=False)
+
+    div: TestPlan = BinaryTestPlanBuilder.build_test_plan(
+        "div",
+        value_range=ValueRanges.LARGE,
+        generate_kwargs=lambda test_vector: TestParamsData.kwargs_rounding_modes,
+        quick_mix=False,
+        VerifyUtils=DivVerifyUtils,
+    )
+
+    ge: TestPlan = BinaryTestPlanBuilder.build_test_plan("ge", value_range=ValueRanges.SMALL, quick_mix=False)
+
+    not_implemented: TestPlan = TestPlan(
+        verify=lambda test_device, test_vector: TestVerification.verify(
+            test_device, test_vector, value_range=ValueRanges.SMALL
+        ),
+        collections=[
+            # Unimplemented operators
+            TestCollection(
+                operators=TestCollectionData.not_implemented.operators,
+                input_sources=TestCollectionData.all.input_sources,
+                input_shapes=TestCollectionData.single.input_shapes,
+            ),
+        ],
+        failing_rules=[
+            # Not implemented operators
+            TestCollection(
+                operators=TestCollectionData.not_implemented.operators,
+                failing_reason=FailingReasons.NOT_IMPLEMENTED,
+            ),
+            # Not implemented operators for CONST_EVAL_PASS
+            # 10 operators are implemented for CONST_EVAL_PASS that are not implemented for other input sources
+            TestCollection(
+                operators=TestCollectionData.implemented_const_eval.operators,
+                input_sources=[
+                    InputSource.CONST_EVAL_PASS,
+                ],
+                failing_reason=None,
+            ),
+        ],
+    )
+
+
+def get_test_plans() -> List[Union[TestPlan, TestSuite]]:
+    return [
+        TestPlansData.add,
+        TestPlansData.sub,
+        TestPlansData.mul,
+        TestPlansData.div,
+        TestPlansData.ge,
+        TestPlansData.not_implemented,
+    ]
diff --git a/forge/test/operators/pytorch/test_inconsistency.py b/forge/test/operators/pytorch/test_inconsistency.py
new file mode 100644
index 000000000..c1165bf36
--- /dev/null
+++ b/forge/test/operators/pytorch/test_inconsistency.py
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# Tests for reproducing inconsistencies in pytorch operator tests
+
+
+# Examples
+# pytest -svv forge/test/operators/pytorch/test_inconsistency.py::test_binary_order1
+# pytest -svv forge/test/operators/pytorch/test_inconsistency.py::test_binary_order2
+# pytest -svv forge/test/operators/pytorch/test_inconsistency.py::test_binary_with_reset
+
+
+import os
+import pytest
+
+from loguru import logger
+
+from test.operators.utils import DeviceUtils
+from test.operators.utils import TestVector
+from test.operators.utils import TestCollection
+from test.operators.utils import TestPlanScanner
+
+
+class TestParamsData:
+
+    __test__ = False  # Avoid collecting TestParamsData as a pytest test
+
+    test_suite = TestPlanScanner.build_test_suite(current_directory=os.path.dirname(__file__))
+
+
+test_suite = TestParamsData.test_suite
+
+
+class TestInconsistency:
+
+    __test__ = False  # Avoid collecting TestInconsistency as a pytest test
+
+    test_ids = [
+        "div-CONST_EVAL_PASS-{}-(13, 89, 3)-None-None",
+        "mul-CONST_EVAL_PASS-{}-(2, 3, 4)-None-None",
+        "mul-CONST_EVAL_PASS-{}-(11, 45, 17)-None-None",
+    ]
+
+    # The collection defines test vectors that trigger a warm reset of the device before verification
+    # It allows specifying individual test vectors that should trigger a warm reset and skip warm reset for others
+    warm_reset_collection =
TestCollection(
+        criteria=lambda test_vector: test_vector.get_id()
+        in [
+            "div-CONST_EVAL_PASS-{}-(13, 89, 3)-None-None",  # Resetting before this test does not cause halt on step 'Running model forward on device...'
+            "mul-CONST_EVAL_PASS-{}-(2, 3, 4)-None-None",  # Uncomment to cause halt on step 'Running model forward on device...'
+            # "mul-CONST_EVAL_PASS-{}-(11, 45, 17)-None-None",  # Uncomment to cause halt on step 'Running model forward on device...'
+        ],
+    )
+
+
+# The fixture is used to set up a warm reset before the test, based on the warm_reset_collection collection
+@pytest.fixture
+def warm_reset_inconsistency(test_vector: TestVector):
+    if test_vector in TestInconsistency.warm_reset_collection:
+        logger.warning(f"Test vector {test_vector.get_id()} requires warm reset")
+        DeviceUtils.warm_reset()
+    yield
+
+
+# The fixture is used to set up a warm reset before each test
+@pytest.fixture
+def warm_reset_all():
+    DeviceUtils.warm_reset()
+    yield
+
+
+@pytest.mark.parametrize(
+    "test_vector",
+    test_suite.query_from_id_list(TestInconsistency.test_ids).to_params(),
+)
+def test_binary_order1(test_vector: TestVector, test_device):
+    test_vector.verify(test_device)
+
+
+@pytest.mark.parametrize(
+    "test_vector",
+    test_suite.query_from_id_list(TestInconsistency.test_ids).reverse().to_params(),
+)
+def test_binary_order2(test_vector: TestVector, test_device):
+    test_vector.verify(test_device)
+
+
+@pytest.mark.parametrize(
+    "test_vector",
+    test_suite.query_from_id_list(TestInconsistency.test_ids).to_params(),
+)
+# @pytest.mark.usefixtures("warm_reset_inconsistency")
+def test_binary_with_reset(test_vector: TestVector, test_device, warm_reset_inconsistency):
+    test_vector.verify(test_device)
diff --git a/forge/test/operators/utils/failing_reasons.py b/forge/test/operators/utils/failing_reasons.py
index 21e8058de..21bd96dc5 100644
--- a/forge/test/operators/utils/failing_reasons.py
+++ b/forge/test/operators/utils/failing_reasons.py
@@ -85,7 +85,8 @@ def validate_exception_message(
     FailingReasons.UNSUPPORTED_DATA_FORMAT: [
         # lambda ex: FailingReasonsValidation.validate_exception_message(ex, RuntimeError, "Unsupported data type"),
         lambda ex: isinstance(ex, RuntimeError) and f"{ex}" == "Unsupported data type",
-        lambda ex: isinstance(ex, RuntimeError) and "/forge/csrc/passes/lower_to_mlir.cpp:466: false" in f"{ex}",
+        # lambda ex: isinstance(ex, RuntimeError) and "/forge/csrc/passes/lower_to_mlir.cpp:466: false" in f"{ex}",
+        lambda ex: isinstance(ex, RuntimeError) and "/forge/csrc/passes/lower_to_mlir.cpp:473: false" in f"{ex}",
     ],
     FailingReasons.DATA_MISMATCH: [
         lambda ex: isinstance(ex, AssertionError) and f"{ex}" == "PCC check failed",
diff --git a/forge/test/operators/utils/test_data.py b/forge/test/operators/utils/test_data.py
index 78704a5dc..d71fe4fac 100644
--- a/forge/test/operators/utils/test_data.py
+++ b/forge/test/operators/utils/test_data.py
@@ -254,3 +254,44 @@ class TestCollectionCommon:
             forge.MathFidelity.HiFi4,
         ],
     )
+
+    float = TestCollection(
+        dev_data_formats=[
+            pytest.param(forge.DataFormat.Bfp2, id="Bfp2"),
+            pytest.param(forge.DataFormat.Bfp2_b, id="Bfp2_b"),
+            pytest.param(forge.DataFormat.Bfp4, id="Bfp4"),
+            pytest.param(forge.DataFormat.Bfp4_b, id="Bfp4_b"),
+            pytest.param(forge.DataFormat.Bfp8, id="Bfp8"),
+            pytest.param(forge.DataFormat.Bfp8_b, id="Bfp8_b"),
+            pytest.param(forge.DataFormat.Float16, id="Float16"),
+            pytest.param(forge.DataFormat.Float16_b, id="Float16_b"),
+            pytest.param(forge.DataFormat.Float32, id="Float32"),
+
pytest.param(forge.DataFormat.Lf8, id="Lf8"), + ], + ) + + int = TestCollection( + dev_data_formats=[ + pytest.param(forge.DataFormat.RawUInt8, id="RawUInt8"), + pytest.param(forge.DataFormat.RawUInt16, id="RawUInt16"), + pytest.param(forge.DataFormat.RawUInt32, id="RawUInt32"), + pytest.param(forge.DataFormat.Int8, id="Int8"), + pytest.param(forge.DataFormat.UInt16, id="UInt16"), + pytest.param(forge.DataFormat.Int32, id="Int32"), + ], + ) + + quick = TestCollection( + input_shapes=[] + + [shape for shape in all.input_shapes if len(shape) in (2,) and shape[0] == 1][:2] + + [shape for shape in all.input_shapes if len(shape) in (2,) and shape[0] != 1][:2] + + [shape for shape in all.input_shapes if len(shape) in (3,) and shape[0] == 1][:2] + + [shape for shape in all.input_shapes if len(shape) in (3,) and shape[0] != 1][:2] + + [shape for shape in all.input_shapes if len(shape) in (4,) and shape[0] == 1][:2] + + [shape for shape in all.input_shapes if len(shape) in (4,) and shape[0] != 1][:2], + dev_data_formats=[ + None, + forge.DataFormat.Float16_b, + forge.DataFormat.Int8, + ], + )
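Note on the quick collection above: it buckets all.input_shapes by rank (2, 3, 4) and by whether the first (microbatch) dimension equals 1, keeping at most two shapes per bucket. A small standalone sketch of that selection, using a hypothetical shape list in place of TestCollectionCommon.all.input_shapes:

# Hypothetical stand-in for TestCollectionCommon.all.input_shapes
all_shapes = [(1, 4), (1, 17), (3, 4), (45, 17), (1, 2, 3), (2, 3, 4), (1, 2, 3, 4), (2, 2, 3, 4)]

quick_shapes = []
for rank in (2, 3, 4):
    # at most two shapes with microbatch size 1, then at most two with microbatch size > 1
    quick_shapes += [s for s in all_shapes if len(s) == rank and s[0] == 1][:2]
    quick_shapes += [s for s in all_shapes if len(s) == rank and s[0] != 1][:2]

print(quick_shapes)
# [(1, 4), (1, 17), (3, 4), (45, 17), (1, 2, 3), (2, 3, 4), (1, 2, 3, 4), (2, 2, 3, 4)]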