diff --git a/forge/test/models/pytorch/multimodal/trajectron/test_trajectron.py b/forge/test/models/pytorch/multimodal/trajectron/test_trajectron.py new file mode 100644 index 000000000..c675387be --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/test_trajectron.py @@ -0,0 +1,307 @@ +# # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# # SPDX-License-Identifier: Apache-2.0 + +import sys + +sys.path.append("forge/test/models/pytorch/multimodal/trajectron/trajectron/") +import pytest +import forge +from test.models.pytorch.multimodal.trajectron.trajectron.model import Trajectron +from test.models.pytorch.multimodal.trajectron.trajectron.model.model_registrar import ModelRegistrar +from test.models.pytorch.multimodal.trajectron.trajectron.model.dataset import ( + EnvironmentDataset, + collate, + get_timesteps_data, +) +from forge.verify.compare import compare_with_golden +import os +import json +import dill +import torch +import torch.nn as nn +import numpy as np +from typing import Any +import torch.nn.utils.rnn as rnn +import pytest + + +def load_hyperparams(): + conf_path = "forge/test/models/pytorch/multimodal/trajectron/trajectron/config/config.json" + with open(conf_path, "r", encoding="utf-8") as conf_json: + hyperparams = json.load(conf_json) + + # Set Default values + hyperparams["scene_freq_mult_eval"] = False + hyperparams["node_freq_mult_eval"] = False + hyperparams["edge_encoding"] = False + hyperparams["incl_robot_node"] = False + hyperparams["use_map_encoding"] = False + + hyperparams["edge_addition_filter"] = [1, 1] + hyperparams["edge_removal_filter"] = [1, 1] + + return hyperparams + + +def load_env(): + eval_data_path = "forge/test/models/pytorch/multimodal/trajectron/trajectron/dataset_envs/eth_val.pkl" + with open(eval_data_path, "rb") as f: + eval_env = dill.load(f, encoding="latin1") + return eval_env + + +class TrajectronWrapper(nn.Module): + def __init__( + self, + model_dir: str, + hyperparams: dict[str, Any], + env: Any, + scene_index: int, + num_samples: int = 1, + z_mode: bool = True, + gmm_mode: bool = True, + all_z_sep: bool = False, + full_dist: bool = False, + ): + super().__init__() + + # Build Model registrar + if not os.path.exists(model_dir): + os.makedirs(model_dir, exist_ok=False) + model_config_path = model_dir + "/config.json" + if not os.path.exists(model_config_path): + with open(model_config_path, "w") as conf_json: + json.dump(hyperparams, conf_json) + model_registrar = ModelRegistrar(model_dir, "cpu") + + # Build Trajectron Model + self.model = Trajectron(model_registrar=model_registrar, hyperparams=hyperparams, log_writer=None, device="cpu") + self.model.set_environment(env=env) + + self.model_dir = model_dir + self.hyperparams = hyperparams + self.env = env + + assert len(self.env.NodeType) == 1 + self.node_type = self.env.NodeType[0] + + self.scene_index = scene_index + self.num_samples = num_samples + self.z_mode = z_mode + self.gmm_mode = gmm_mode + self.all_z_sep = all_z_sep + self.full_dist = full_dist + + def _build_packed_sequence( + self, + packed_sequence_data, + packed_sequence_batch_sizes, + packed_sequence_sorted_indices, + packed_sequence_unsorted_indices, + ): + packed_sequence = torch.nn.utils.rnn.PackedSequence( + data=packed_sequence_data.squeeze(), + batch_sizes=packed_sequence_batch_sizes.squeeze(), + sorted_indices=packed_sequence_sorted_indices.squeeze(), + unsorted_indices=packed_sequence_unsorted_indices.squeeze(), + ) + return packed_sequence + + def forward( + self, + x, + x_st_t, + 
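+        # The PackedSequence is passed in as its four underlying tensors so that forward()
+        # only receives plain tensors; it is rebuilt via _build_packed_sequence below.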
packed_sequence_data, + packed_sequence_batch_sizes, + packed_sequence_sorted_indices, + packed_sequence_unsorted_indices, + first_history_index, + ): + neighbors_data_st = None + neighbors_edge_value = None + robot_traj_st_t = None + map = None + + ph = self.hyperparams["prediction_horizon"] + + packed_x_st_t = self._build_packed_sequence( + packed_sequence_data, + packed_sequence_batch_sizes, + packed_sequence_sorted_indices, + packed_sequence_unsorted_indices, + ) + + model = self.model.node_models_dict[self.node_type] + predictions = model.predict( + inputs=x, + inputs_st=x_st_t, # Pack and send this + packed_inputs_st=packed_x_st_t, + first_history_indices=first_history_index, + neighbors=neighbors_data_st, + neighbors_edge_value=neighbors_edge_value, + robot=robot_traj_st_t, + map=map, + prediction_horizon=ph, + num_samples=self.num_samples, + z_mode=self.z_mode, + gmm_mode=self.gmm_mode, + full_dist=self.full_dist, + all_z_sep=self.all_z_sep, + ) + + return predictions + + def eval(self): + super().eval() + self.model.eval() + + def get_input_batch(self, scene): + ph = self.hyperparams["prediction_horizon"] + timesteps = scene.sample_timesteps(1, min_future_timesteps=ph) + + min_future_timesteps = ph + min_history_timesteps = 1 + + node_type = self.node_type + assert node_type in self.model.pred_state + model = self.model.node_models_dict[node_type] + + # Get Input data for node type and given timesteps + batch = get_timesteps_data( + env=self.env, + scene=scene, + t=timesteps, + node_type=node_type, + state=self.model.state, + pred_state=self.model.pred_state, + edge_types=model.edge_types, + min_ht=min_history_timesteps, + max_ht=self.model.max_ht, + min_ft=min_future_timesteps, + max_ft=min_future_timesteps, + hyperparams=self.hyperparams, + ) + + assert batch is not None + + ( + ( + first_history_index, + x_t, + y_t, + x_st_t, + y_st_t, + neighbors_data_st, + neighbors_edge_value, + robot_traj_st_t, + map, + ), + nodes, + timesteps_o, + ) = batch + + device = self.model.device + x = x_t.to(device) + x_st_t = x_st_t.to(device) + if robot_traj_st_t is not None: + robot_traj_st_t = robot_traj_st_t.to(device) + + if type(map) == torch.Tensor: + map = map.to(device) + + return (x, x_st_t, first_history_index, neighbors_data_st, neighbors_edge_value, robot_traj_st_t, map), ( + nodes, + timesteps_o, + ) + + +def pack_input_sequences(sequences, lower_indices=None, upper_indices=None, total_length=None): + bs, tf = sequences.shape[:2] + if lower_indices is None: + lower_indices = torch.zeros(bs, dtype=torch.int) + if upper_indices is None: + upper_indices = torch.ones(bs, dtype=torch.int) * (tf - 1) + if total_length is None: + total_length = max(upper_indices) + 1 + # This is done so that we can just pass in self.prediction_timesteps + # (which we want to INCLUDE, so this will exclude the next timestep). 
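+    # Convert the inclusive upper bound into an exclusive slice end so the last timestep is kept.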
+ inclusive_break_indices = upper_indices + 1 + + pad_list = list() + for i, seq_len in enumerate(inclusive_break_indices): + pad_list.append(sequences[i, lower_indices[i] : seq_len]) + + packed_seqs = rnn.pack_sequence(pad_list, enforce_sorted=False) + + return packed_seqs + + +def get_packed_sequence_values(packed_sequence): + values = ( + packed_sequence.data.unsqueeze(0).unsqueeze(0), + packed_sequence.batch_sizes.unsqueeze(0), + packed_sequence.sorted_indices.unsqueeze(0), + packed_sequence.unsorted_indices.unsqueeze(0), + ) + return values + + +@pytest.mark.nightly +@pytest.mark.model_analysis +def test_trajectronpp_pytorch(): + env = load_env() + hyperparams = load_hyperparams() + model_dir = "forge/test/models/pytorch/multimodal/trajectron/trajectron/model_dir" + + # Build Pytorch Model + pt_model = TrajectronWrapper(model_dir=model_dir, hyperparams=hyperparams, env=env, scene_index=0) + pt_model.eval() + + scene = env.scenes[0] + inputs_batch = pt_model.get_input_batch(scene=scene) + + (x, x_st_t, first_history_index, neighbors_data_st, neighbors_edge_value, robot_traj_st_t, map), ( + nodes, + timesteps_o, + ) = inputs_batch + + packed_x_st_t = pack_input_sequences(x_st_t, lower_indices=first_history_index) + ( + packed_sequence_data, + packed_sequence_batch_sizes, + packed_sequence_sorted_indices, + packed_sequence_unsorted_indices, + ) = get_packed_sequence_values(packed_x_st_t) + + assert neighbors_data_st is None + assert neighbors_edge_value is None + assert robot_traj_st_t is None + assert map is None + # Run CPU Inference + output = pt_model( + x, + x_st_t, + packed_sequence_data, + packed_sequence_batch_sizes, + packed_sequence_sorted_indices, + packed_sequence_unsorted_indices, + first_history_index, + ) + inputs = [ + x, + x_st_t, + packed_sequence_data, + packed_sequence_batch_sizes, + packed_sequence_sorted_indices, + packed_sequence_unsorted_indices, + first_history_index, + ] + compiled_model = forge.compile(pt_model, inputs) + co_out = compiled_model(*inputs) + fw_out = pt_model(*inputs) + + co_out = [co.to("cpu") for co in co_out] + fw_out = [fw_out] if isinstance(fw_out, torch.Tensor) else fw_out + + assert all([compare_with_golden(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)]) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/__init__.py new file mode 100644 index 000000000..e7543593d --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/__init__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from model import Trajectron diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/argument_parser.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/argument_parser.py new file mode 100644 index 000000000..526e95fb3 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/argument_parser.py @@ -0,0 +1,138 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument( + "--conf", help="path to json config file for hyperparameters", type=str, default="../config/config.json" +) + +parser.add_argument("--debug", help="disable all disk writing processes.", action="store_true") + +parser.add_argument("--preprocess_workers", help="number of processes to spawn for preprocessing", type=int, default=0) + + +# 
Model Parameters +parser.add_argument( + "--offline_scene_graph", + help="whether to precompute the scene graphs offline, options are 'no' and 'yes'", + type=str, + default="yes", +) + +parser.add_argument( + "--dynamic_edges", help="whether to use dynamic edges or not, options are 'no' and 'yes'", type=str, default="yes" +) + +parser.add_argument( + "--edge_state_combine_method", + help="the method to use for combining edges of the same type", + type=str, + default="sum", +) + +parser.add_argument( + "--edge_influence_combine_method", + help="the method to use for combining edge influences", + type=str, + default="attention", +) + +parser.add_argument( + "--edge_addition_filter", + nargs="+", + help="what scaling to use for edges as they're created", + type=float, + default=[0.25, 0.5, 0.75, 1.0], +) # We don't automatically pad left with 0.0, if you want a sharp +# and short edge addition, then you need to have a 0.0 at the +# beginning, e.g. [0.0, 1.0]. + +parser.add_argument( + "--edge_removal_filter", + nargs="+", + help="what scaling to use for edges as they're removed", + type=float, + default=[1.0, 0.0], +) # We don't automatically pad right with 0.0, if you want a sharp drop off like +# the default, then you need to have a 0.0 at the end. + +parser.add_argument( + "--override_attention_radius", + action="append", + help='Specify one attention radius to override. E.g. "PEDESTRIAN VEHICLE 10.0"', + default=[], +) + +parser.add_argument( + "--incl_robot_node", + help="whether to include a robot node in the graph or simply model all agents", + action="store_true", +) + +parser.add_argument("--map_encoding", help="Whether to use map encoding or not", action="store_true") + +parser.add_argument("--augment", help="Whether to augment the scene during training", action="store_true") + +parser.add_argument( + "--node_freq_mult_train", help="Whether to use frequency multiplying of nodes during training", action="store_true" +) + +parser.add_argument( + "--node_freq_mult_eval", help="Whether to use frequency multiplying of nodes during evaluation", action="store_true" +) + +parser.add_argument( + "--scene_freq_mult_train", help="Whether to use frequency multiplying of nodes during training", action="store_true" +) + +parser.add_argument( + "--scene_freq_mult_eval", + help="Whether to use frequency multiplying of nodes during evaluation", + action="store_true", +) + +parser.add_argument( + "--scene_freq_mult_viz", help="Whether to use frequency multiplying of nodes during evaluation", action="store_true" +) + +parser.add_argument("--no_edge_encoding", help="Whether to use neighbors edge encoding", action="store_true") + +# Data Parameters +parser.add_argument("--data_dir", help="what dir to look in for data", type=str, default="../experiments/processed") + +parser.add_argument("--train_data_dict", help="what file to load for training data", type=str, default="train.pkl") + +parser.add_argument("--eval_data_dict", help="what file to load for evaluation data", type=str, default="val.pkl") + +parser.add_argument( + "--log_dir", + help="what dir to save training information (i.e., saved models, logs, etc)", + type=str, + default="../experiments/logs", +) + +parser.add_argument("--log_tag", help="tag for the log folder", type=str, default="") + +parser.add_argument("--device", help="what device to perform training on", type=str, default="cuda:0") + +parser.add_argument("--eval_device", help="what device to use during evaluation", type=str, default=None) + +# Training Parameters 
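+# Note: the Forge test (test_trajectron.py) reads its hyperparameters from config/config.json and
+# never invokes this parser; the arguments below appear to be kept for parity with the upstream
+# Trajectron++ training code.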
+parser.add_argument("--train_epochs", help="number of iterations to train for", type=int, default=1) + +parser.add_argument("--batch_size", help="training batch size", type=int, default=256) + +parser.add_argument("--eval_batch_size", help="evaluation batch size", type=int, default=256) + +parser.add_argument("--k_eval", help="how many samples to take during evaluation", type=int, default=25) + +parser.add_argument("--seed", help="manual seed to use, default is 123", type=int, default=123) + +parser.add_argument("--eval_every", help="how often to evaluate during training, never if None", type=int, default=1) + +parser.add_argument("--vis_every", help="how often to visualize during training, never if None", type=int, default=1) + +parser.add_argument("--save_every", help="how often to save during training, never if None", type=int, default=1) +args = parser.parse_args() diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/config.json b/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/config.json new file mode 100644 index 000000000..fca815729 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/config.json @@ -0,0 +1,90 @@ +{ + + "batch_size": 256, + "grad_clip": 1.0, + + "learning_rate_style": "exp", + "learning_rate": 0.001, + "min_learning_rate": 0.00001, + "learning_decay_rate": 0.9999, + + "prediction_horizon": 12, + "minimum_history_length": 1, + "maximum_history_length": 8, + + "map_encoder": { + "PEDESTRIAN": { + "heading_state_index": 5, + "patch_size": [50, 10, 50, 90], + "map_channels": 3, + "hidden_channels": [10, 20, 10, 1], + "output_size": 32, + "masks": [5, 5, 5, 5], + "strides": [1, 1, 1, 1], + "dropout": 0.5 + } + }, + + "k": 1, + "k_eval": 1, + + "kl_min": 0.07, + "kl_weight": 100.0, + "kl_weight_start": 0, + "kl_decay_rate": 0.99995, + "kl_crossover": 400, + "kl_sigmoid_divisor": 4, + + "rnn_kwargs": { + "dropout_keep_prob": 0.75 + }, + "MLP_dropout_keep_prob": 0.9, + "enc_rnn_dim_edge": 32, + "enc_rnn_dim_edge_influence": 32, + "enc_rnn_dim_history": 32, + "enc_rnn_dim_future": 32, + "dec_rnn_dim": 128, + + "q_z_xy_MLP_dims": null, + "p_z_x_MLP_dims": 32, + "GMM_components": 1, + + "log_p_yt_xz_max": 6, + + "N": 1, + "K": 25, + + "tau_init": 2.0, + "tau_final": 0.05, + "tau_decay_rate": 0.997, + + "use_z_logit_clipping": true, + "z_logit_clip_start": 0.05, + "z_logit_clip_final": 5.0, + "z_logit_clip_crossover": 300, + "z_logit_clip_divisor": 5, + + "dynamic": { + "PEDESTRIAN": { + "name": "SingleIntegrator", + "distribution": true, + "limits": {} + } + }, + + "state": { + "PEDESTRIAN": { + "position": ["x", "y"], + "velocity": ["x", "y"], + "acceleration": ["x", "y"] + } + }, + + "pred_state": { + "PEDESTRIAN": { + "position": ["x", "y"] + } + }, + + "log_histograms": false +} diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/nuScenes.json b/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/nuScenes.json new file mode 100644 index 000000000..acebf8e1e --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/nuScenes.json @@ -0,0 +1,109 @@ +{ + + "batch_size": 256, + "grad_clip": 1.0, + + "learning_rate_style": "exp", + "learning_rate": 0.003, + "min_learning_rate": 0.00001, + "learning_decay_rate": 0.9999, + + "prediction_horizon": 6, + "minimum_history_length": 1, + "maximum_history_length": 8, + + "map_encoder": { + "VEHICLE": { + "heading_state_index": 6, + "patch_size": [50, 10, 50, 90], + "map_channels": 3, + 
"hidden_channels": [10, 20, 10, 1], + "output_size": 32, + "masks": [5, 5, 5, 3], + "strides": [2, 2, 1, 1], + "dropout": 0.5 + } + }, + + "k": 1, + "k_eval": 1, + + "kl_min": 0.07, + "kl_weight": 100.0, + "kl_weight_start": 0, + "kl_decay_rate": 0.99995, + "kl_crossover": 400, + "kl_sigmoid_divisor": 4, + + "rnn_kwargs": { + "dropout_keep_prob": 0.75 + }, + "MLP_dropout_keep_prob": 0.9, + "enc_rnn_dim_edge": 32, + "enc_rnn_dim_edge_influence": 32, + "enc_rnn_dim_history": 32, + "enc_rnn_dim_future": 32, + "dec_rnn_dim": 128, + + "q_z_xy_MLP_dims": null, + "p_z_x_MLP_dims": 32, + "GMM_components": 1, + + "log_p_yt_xz_max": 6, + + "N": 1, + "K": 25, + + "tau_init": 2.0, + "tau_final": 0.05, + "tau_decay_rate": 0.997, + + "use_z_logit_clipping": true, + "z_logit_clip_start": 0.05, + "z_logit_clip_final": 5.0, + "z_logit_clip_crossover": 300, + "z_logit_clip_divisor": 5, + + "dynamic": { + "PEDESTRIAN": { + "name": "SingleIntegrator", + "distribution": true, + "limits": {} + }, + "VEHICLE": { + "name": "Unicycle", + "distribution": true, + "limits": { + "max_a": 4, + "min_a": -5, + "max_heading_change": 0.7, + "min_heading_change": -0.7 + } + } + }, + + "state": { + "PEDESTRIAN": { + "position": ["x", "y"], + "velocity": ["x", "y"], + "acceleration": ["x", "y"] + }, + "VEHICLE": { + "position": ["x", "y"], + "velocity": ["x", "y"], + "acceleration": ["x", "y"], + "heading": ["°", "d°"] + } + }, + + "pred_state": { + "VEHICLE": { + "position": ["x", "y"] + }, + "PEDESTRIAN": { + "position": ["x", "y"] + } + }, + + "log_histograms": false +} diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/dataset_envs/eth_val.pkl b/forge/test/models/pytorch/multimodal/trajectron/trajectron/dataset_envs/eth_val.pkl new file mode 100644 index 000000000..8afc0d7c0 Binary files /dev/null and b/forge/test/models/pytorch/multimodal/trajectron/trajectron/dataset_envs/eth_val.pkl differ diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/__init__.py new file mode 100644 index 000000000..422f33fc3 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from .data_structures import RingBuffer, SingleHeaderNumpyArray, DoubleHeaderNumpyArray +from .scene import Scene +from .node import Node +from .scene_graph import TemporalSceneGraph, SceneGraph +from .environment import Environment +from .node_type import NodeTypeEnum +from .data_utils import derivative_of +from .map import GeometricMap diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_structures.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_structures.py new file mode 100644 index 000000000..20e50e83e --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_structures.py @@ -0,0 +1,282 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +import pandas as pd +from collections.abc import Sequence +from collections import OrderedDict + + +class RingBuffer(Sequence): + def __init__(self, capacity, dtype=float, allow_overwrite=True): + """ + Create a new ring buffer with the given capacity and element type. 
+ Code copy-pasted from: https://github.com/eric-wieser/numpy_ringbuffer + + Parameters + ---------- + capacity: int + The maximum capacity of the ring buffer + dtype: data-type, optional + Desired type of buffer elements. Use a type like (float, 2) to + produce a buffer with shape (N, 2) + allow_overwrite: bool + If false, throw an IndexError when trying to append to an already + full buffer + """ + self._arr = np.full(capacity, np.nan, dtype) + self._left_index = 0 + self._right_index = 0 + self._capacity = capacity + self._allow_overwrite = allow_overwrite + + def _unwrap(self): + """Copy the data from this buffer into unwrapped form""" + return np.concatenate( + ( + self._arr[self._left_index : min(self._right_index, self._capacity)], + self._arr[: max(self._right_index - self._capacity, 0)], + ) + ) + + def _fix_indices(self): + """ + Enforce our invariant that 0 <= self._left_index < self._capacity + """ + if self._left_index >= self._capacity: + self._left_index -= self._capacity + self._right_index -= self._capacity + elif self._left_index < 0: + self._left_index += self._capacity + self._right_index += self._capacity + + @property + def is_full(self): + """True if there is no more space in the buffer""" + return len(self) == self._capacity + + # numpy compatibility + def __array__(self): + return self._unwrap() + + @property + def dtype(self): + return self._arr.dtype + + @property + def shape(self): + return (len(self),) + self._arr.shape[1:] + + # these mirror methods from deque + @property + def maxlen(self): + return self._capacity + + def append(self, value): + if self.is_full: + if not self._allow_overwrite: + raise IndexError("append to a full RingBuffer with overwrite disabled") + elif not len(self): + return + else: + self._left_index += 1 + + self._arr[self._right_index % self._capacity] = value + self._right_index += 1 + self._fix_indices() + + def appendleft(self, value): + if self.is_full: + if not self._allow_overwrite: + raise IndexError("append to a full RingBuffer with overwrite disabled") + elif not len(self): + return + else: + self._right_index -= 1 + + self._left_index -= 1 + self._fix_indices() + self._arr[self._left_index] = value + + def pop(self): + if len(self) == 0: + raise IndexError("pop from an empty RingBuffer") + self._right_index -= 1 + self._fix_indices() + res = self._arr[self._right_index % self._capacity] + return res + + def popleft(self): + if len(self) == 0: + raise IndexError("pop from an empty RingBuffer") + res = self._arr[self._left_index] + self._left_index += 1 + self._fix_indices() + return res + + def extend(self, values): + lv = len(values) + if len(self) + lv > self._capacity: + if not self._allow_overwrite: + raise IndexError("extend a RingBuffer such that it would overflow, with overwrite disabled") + elif not len(self): + return + if lv >= self._capacity: + # wipe the entire array! - this may not be threadsafe + self._arr[...] 
= values[-self._capacity :] + self._right_index = self._capacity + self._left_index = 0 + return + + ri = self._right_index % self._capacity + sl1 = np.s_[ri : min(ri + lv, self._capacity)] + sl2 = np.s_[: max(ri + lv - self._capacity, 0)] + self._arr[sl1] = values[: sl1.stop - sl1.start] + self._arr[sl2] = values[sl1.stop - sl1.start :] + self._right_index += lv + + self._left_index = max(self._left_index, self._right_index - self._capacity) + self._fix_indices() + + def extendleft(self, values): + lv = len(values) + if len(self) + lv > self._capacity: + if not self._allow_overwrite: + raise IndexError("extend a RingBuffer such that it would overflow, with overwrite disabled") + elif not len(self): + return + if lv >= self._capacity: + # wipe the entire array! - this may not be threadsafe + self._arr[...] = values[: self._capacity] + self._right_index = self._capacity + self._left_index = 0 + return + + self._left_index -= lv + self._fix_indices() + li = self._left_index + sl1 = np.s_[li : min(li + lv, self._capacity)] + sl2 = np.s_[: max(li + lv - self._capacity, 0)] + self._arr[sl1] = values[: sl1.stop - sl1.start] + self._arr[sl2] = values[sl1.stop - sl1.start :] + + self._right_index = min(self._right_index, self._left_index + self._capacity) + + # implement Sequence methods + def __len__(self): + return self._right_index - self._left_index + + def __getitem__(self, item): + # handle simple (b[1]) and basic (b[np.array([1, 2, 3])]) fancy indexing specially + if not isinstance(item, tuple): + item_arr = np.asarray(item) + if issubclass(item_arr.dtype.type, np.integer): + item_arr = (item_arr + self._left_index) % self._capacity + return self._arr[item_arr] + + # for everything else, get it right at the expense of efficiency + return self._unwrap()[item] + + def __iter__(self): + # alarmingly, this is comparable in speed to using itertools.chain + return iter(self._unwrap()) + + # Everything else + def __repr__(self): + return "".format(np.asarray(self)) + + +class DoubleHeaderNumpyArray(object): + def __init__(self, data: np.ndarray, header: list): + """ + Data Structure mirroring some functionality of double indexed pandas DataFrames. + Indexing options are: + [:, (header1, header2)] + [:, [(header1, header2), (header1, header2)]] + [:, {header1: [header21, header22]}] + + A SingleHeaderNumpyArray can is returned if an element of the first header is querried as attribut: + doubleHeaderNumpyArray.position -> SingleHeaderNumpyArray + + :param data: The numpy array. + :param header: The double header structure as list of tuples [(header11, header21), (header11, header22) ...] 
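+
+        Illustrative usage, assuming ('position', 'x') and ('position', 'y') header entries:
+            arr[:, ('position', 'x')]                       # one column
+            arr[:, [('position', 'x'), ('position', 'y')]]  # several columns
+            arr[:, {'position': ['x', 'y']}]                # tree-style lookup
+            arr.position                                    # SingleHeaderNumpyArray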
+ """ + self.data = data + self.header = header + self.double_header_lookup = OrderedDict() + self.tree_header_lookup = OrderedDict() + for i, header_item in enumerate(header): + self.double_header_lookup[header_item] = i + if header_item[0] not in self.tree_header_lookup: + self.tree_header_lookup[header_item[0]] = dict() + self.tree_header_lookup[header_item[0]][header_item[1]] = i + + def __mul__(self, other): + return DoubleHeaderNumpyArray(self.data * other, self.header) + + def get_single_header_array(self, h1: str, rows=slice(None, None, None)): + data_integer_indices = list() + h2_list = list() + for h2 in self.tree_header_lookup[h1]: + data_integer_indices.append(self.tree_header_lookup[h1][h2]) + h2_list.append(h2) + return SingleHeaderNumpyArray(self.data[rows, data_integer_indices], h2_list) + + def __getitem__(self, item): + rows, columns = item + data_integer_indices = list() + if type(columns) is dict: + for h1, h2s in columns.items(): + for h2 in h2s: + data_integer_indices.append(self.double_header_lookup[(h1, h2)]) + return self.data[rows, data_integer_indices] + elif type(columns) is list: + for column in columns: + assert type(column) is tuple, "If Index is list it hast to be list of double header tuples." + data_integer_indices.append(self.double_header_lookup[column]) + return self.data[rows, data_integer_indices] + elif type(columns) is tuple: + return self.data[rows, self.double_header_lookup[columns]] + else: + assert type(item) is str, "Index must be str, list of tuples or dict of tree structure." + return self.get_single_header_array(item, rows=rows) + + def __getattr__(self, item): + if not item.startswith("_"): + if item in self.tree_header_lookup.keys(): + return self.get_single_header_array(item) + else: + try: + return self.data.__getattribute__(item) + except AttributeError: + return super().__getattribute__(item) + else: + return super().__getattribute__(item) + + +class SingleHeaderNumpyArray(object): + def __init__(self, data: np.ndarray, header: list): + self.data = data + self.header_lookup = OrderedDict({h: i for i, h in enumerate(header)}) + + def __getitem__(self, item): + rows, columns = item + data_integer_indices = list() + if type(columns) is list or type(columns) is tuple: + for column in columns: + data_integer_indices.append(self.header_lookup[column]) + else: + data_integer_indices = self.header_lookup[columns] + return self.data[rows, data_integer_indices] + + def __getattr__(self, item): + if not item.startswith("_"): + if item in self.header_lookup.keys(): + return self[:, item] + else: + try: + return self.data.__getattribute__(item) + except AttributeError: + return super().__getattribute__(item) + else: + return super().__getattribute__(item) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_utils.py new file mode 100644 index 000000000..f8c5d1ff9 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_utils.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np + + +def make_continuous_copy(alpha): + alpha = (alpha + np.pi) % (2.0 * np.pi) - np.pi + continuous_x = np.zeros_like(alpha) + continuous_x[0] = alpha[0] + for i in range(1, len(alpha)): + if not (np.sign(alpha[i]) == np.sign(alpha[i - 1])) and np.abs(alpha[i]) > np.pi / 2: + continuous_x[i] = ( + continuous_x[i - 1] + (alpha[i] - 
alpha[i - 1]) - np.sign((alpha[i] - alpha[i - 1])) * 2 * np.pi + ) + else: + continuous_x[i] = continuous_x[i - 1] + (alpha[i] - alpha[i - 1]) + + return continuous_x + + +def derivative_of(x, dt=1, radian=False): + if radian: + x = make_continuous_copy(x) + + not_nan_mask = ~np.isnan(x) + masked_x = x[not_nan_mask] + + if masked_x.shape[-1] < 2: + return np.zeros_like(x) + + dx = np.full_like(x, np.nan) + dx[not_nan_mask] = np.ediff1d(masked_x, to_begin=(masked_x[1] - masked_x[0])) / dt + + return dx diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/environment.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/environment.py new file mode 100644 index 000000000..48bf80d4f --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/environment.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import json +import numpy as np +from itertools import product +from .node_type import NodeTypeEnum + + +class Environment(object): + def __init__(self, node_type_list, standardization, scenes=None, attention_radius=None, robot_type=None): + self.scenes = scenes + self.node_type_list = node_type_list + self.attention_radius = attention_radius + self.NodeType = NodeTypeEnum(node_type_list) + self.robot_type = robot_type + + self.standardization = standardization + self.standardize_param_memo = dict() + + self._scenes_resample_prop = None + + def get_edge_types(self): + return list(product(self.NodeType, repeat=2)) + + def get_standardize_params(self, state, node_type): + memo_key = (json.dumps(state), node_type) + if memo_key in self.standardize_param_memo: + return self.standardize_param_memo[memo_key] + + standardize_mean_list = list() + standardize_std_list = list() + for entity, dims in state.items(): + for dim in dims: + standardize_mean_list.append(self.standardization[node_type][entity][dim]["mean"]) + standardize_std_list.append(self.standardization[node_type][entity][dim]["std"]) + standardize_mean = np.stack(standardize_mean_list) + standardize_std = np.stack(standardize_std_list) + + self.standardize_param_memo[memo_key] = (standardize_mean, standardize_std) + return standardize_mean, standardize_std + + def standardize(self, array, state, node_type, mean=None, std=None): + if mean is None and std is None: + mean, std = self.get_standardize_params(state, node_type) + elif mean is None and std is not None: + mean, _ = self.get_standardize_params(state, node_type) + elif mean is not None and std is None: + _, std = self.get_standardize_params(state, node_type) + return np.where(np.isnan(array), np.array(np.nan), (array - mean) / std) + + def unstandardize(self, array, state, node_type, mean=None, std=None): + if mean is None and std is None: + mean, std = self.get_standardize_params(state, node_type) + elif mean is None and std is not None: + mean, _ = self.get_standardize_params(state, node_type) + elif mean is not None and std is None: + _, std = self.get_standardize_params(state, node_type) + return array * std + mean + + @property + def scenes_resample_prop(self): + if self._scenes_resample_prop is None: + self._scenes_resample_prop = np.array([scene.resample_prob for scene in self.scenes]) + self._scenes_resample_prop = self._scenes_resample_prop / np.sum(self._scenes_resample_prop) + return self._scenes_resample_prop diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/map.py 
b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/map.py new file mode 100644 index 000000000..47cbb84ca --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/map.py @@ -0,0 +1,201 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import numpy as np +from model.dataset.homography_warper import get_rotation_matrix2d, warp_affine_crop + + +class Map(object): + def __init__(self, data, homography, description=None): + self.data = data + self.homography = homography + self.description = description + + def as_image(self): + raise NotImplementedError + + def get_cropped_maps(self, world_pts, patch_size, rotation=None, device="cpu"): + raise NotImplementedError + + def to_map_points(self, scene_pts): + raise NotImplementedError + + +class GeometricMap(Map): + """ + A Geometric Map is a int tensor of shape [layers, x, y]. The homography must transform a point in scene + coordinates to the respective point in map coordinates. + + :param data: Numpy array of shape [layers, x, y] + :param homography: Numpy array of shape [3, 3] + """ + + def __init__(self, data, homography, description=None): + # assert isinstance(data.dtype, np.floating), "Geometric Maps must be float values." + super(GeometricMap, self).__init__(data, homography, description=description) + + self._last_padding = None + self._last_padded_map = None + self._torch_map = None + + def torch_map(self, device): + if self._torch_map is not None: + return self._torch_map + self._torch_map = torch.tensor(self.data, dtype=torch.uint8, device=device) + return self._torch_map + + def as_image(self): + # We have to transpose x and y to rows and columns. Assumes origin is lower left for image + # Also we move the channels to the last dimension + return (np.transpose(self.data, (2, 1, 0))).astype(np.uint) + + def get_padded_map(self, padding_x, padding_y, device): + if self._last_padding == (padding_x, padding_y): + return self._last_padded_map + else: + self._last_padding = (padding_x, padding_y) + self._last_padded_map = torch.full( + (self.data.shape[0], self.data.shape[1] + 2 * padding_x, self.data.shape[2] + 2 * padding_y), + False, + dtype=torch.uint8, + ) + self._last_padded_map[..., padding_x:-padding_x, padding_y:-padding_y] = self.torch_map(device) + return self._last_padded_map + + @staticmethod + def batch_rotate(map_batched, centers, angles, out_height, out_width): + """ + As the input is a map and the warp_affine works on an image coordinate system we would have to + flip the y axis updown, negate the angles, and flip it back after transformation. + This, however, is the same as not flipping at and not negating the radian. + + :param map_batched: + :param centers: + :param angles: + :param out_height: + :param out_width: + :return: + """ + M = get_rotation_matrix2d(centers, angles, torch.ones_like(angles)) + rotated_map_batched = warp_affine_crop( + map_batched, centers, M, dsize=(out_height, out_width), padding_mode="zeros" + ) + + return rotated_map_batched + + @classmethod + def get_cropped_maps_from_scene_map_batch(cls, maps, scene_pts, patch_size, rotation=None, device="cpu"): + """ + Returns rotated patches of each map around the transformed scene points. 
+ ___________________ + | | | + | |ps[3] | + | | | + | | | + | o|__________| + | | ps[2] | + | | | + |_______|__________| + ps = patch_size + + :param maps: List of GeometricMap objects [bs] + :param scene_pts: Scene points: [bs, 2] + :param patch_size: Extracted Patch size after rotation: [-x, -y, +x, +y] + :param rotation: Rotations in degrees: [bs] + :param device: Device on which the rotated tensors should be returned. + :return: Rotated and cropped tensor patches. + """ + batch_size = scene_pts.shape[0] + lat_size = 2 * np.max((patch_size[0], patch_size[2])) + long_size = 2 * np.max((patch_size[1], patch_size[3])) + assert lat_size % 2 == 0, "Patch width must be divisible by 2" + assert long_size % 2 == 0, "Patch length must be divisible by 2" + lat_size_half = lat_size // 2 + long_size_half = long_size // 2 + + context_padding_x = int(np.ceil(np.sqrt(2) * lat_size)) + context_padding_y = int(np.ceil(np.sqrt(2) * long_size)) + + centers = torch.tensor( + [s_map.to_map_points(scene_pts[np.newaxis, i]) for i, s_map in enumerate(maps)], + dtype=torch.long, + device=device, + ).squeeze(dim=1) + torch.tensor([context_padding_x, context_padding_y], device=device, dtype=torch.long) + + padded_map = [s_map.get_padded_map(context_padding_x, context_padding_y, device=device) for s_map in maps] + + padded_map_batched = torch.stack( + [ + padded_map[i][ + ..., + centers[i, 0] - context_padding_x : centers[i, 0] + context_padding_x, + centers[i, 1] - context_padding_y : centers[i, 1] + context_padding_y, + ] + for i in range(centers.shape[0]) + ], + dim=0, + ) + + center_patches = torch.tensor([[context_padding_y, context_padding_x]], dtype=torch.int, device=device).repeat( + batch_size, 1 + ) + + if rotation is not None: + angles = torch.Tensor(rotation) + else: + angles = torch.zeros(batch_size) + + rotated_map_batched = cls.batch_rotate( + padded_map_batched / 255.0, center_patches.float(), angles, long_size, lat_size + ) + + del padded_map_batched + + return rotated_map_batched[ + ..., + long_size_half - patch_size[1] : (long_size_half + patch_size[3]), + lat_size_half - patch_size[0] : (lat_size_half + patch_size[2]), + ] + + def get_cropped_maps(self, scene_pts, patch_size, rotation=None, device="cpu"): + """ + Returns rotated patches of the map around the transformed scene points. + ___________________ + | | | + | |ps[3] | + | | | + | | | + | o|__________| + | | ps[2] | + | | | + |_______|__________| + ps = patch_size + + :param scene_pts: Scene points: [bs, 2] + :param patch_size: Extracted Patch size after rotation: [-lat, -long, +lat, +long] + :param rotation: Rotations in degrees: [bs] + :param device: Device on which the rotated tensors should be returned. + :return: Rotated and cropped tensor patches. 
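+
+        Illustrative call, assuming scene_pts of shape [bs, 2] and the config's pedestrian patch size:
+            patches = geometric_map.get_cropped_maps(scene_pts, patch_size=[50, 10, 50, 90])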
+ """ + return self.get_cropped_maps_from_scene_map_batch( + [self] * scene_pts.shape[0], scene_pts, patch_size, rotation=rotation, device=device + ) + + def to_map_points(self, scene_pts): + org_shape = None + if len(scene_pts.shape) > 2: + org_shape = scene_pts.shape + scene_pts = scene_pts.reshape((-1, 2)) + N, dims = scene_pts.shape + points_with_one = np.ones((dims + 1, N)) + points_with_one[:dims] = scene_pts.T + map_points = (self.homography @ points_with_one).T[..., :dims] + if org_shape is not None: + map_points = map_points.reshape(org_shape) + return map_points + + +class ImageMap(Map): # TODO Implement for image maps -> watch flipped coordinate system + def __init__(self): + raise NotImplementedError diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node.py new file mode 100644 index 000000000..b27b3ce17 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node.py @@ -0,0 +1,256 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import random +import numpy as np +import pandas as pd +from .data_structures import DoubleHeaderNumpyArray + +# from ncls import NCLS + + +class Node(object): + def __init__( + self, + node_type, + node_id, + data, + length=None, + width=None, + height=None, + first_timestep=0, + is_robot=False, + description="", + frequency_multiplier=1, + non_aug_node=None, + ): + self.type = node_type + self.id = node_id + self.length = length + self.width = width + self.height = height + self.first_timestep = first_timestep + self.non_aug_node = non_aug_node + + if data is not None: + if isinstance(data, pd.DataFrame): + self.data = DoubleHeaderNumpyArray(data.values, list(data.columns)) + elif isinstance(data, DoubleHeaderNumpyArray): + self.data = data + else: + self.data = None + + self.is_robot = is_robot + self._last_timestep = None + self.description = description + self.frequency_multiplier = frequency_multiplier + + self.forward_in_time_on_next_override = False + + def __eq__(self, other): + return ( + (isinstance(other, self.__class__) or isinstance(self, other.__class__)) + and self.id == other.id + and self.type == other.type + ) + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash((self.type, self.id)) + + def __repr__(self): + return "/".join([self.type.name, self.id]) + + def overwrite_data(self, data, header, forward_in_time_on_next_overwrite=False): + """ + This function hard overwrites the data matrix. When using it you have to make sure that the columns + in the new data matrix correspond to the old structure. As well as setting first_timestep. + + :param data: New data matrix + :param forward_in_time_on_next_overwrite: On the !!NEXT!! call of overwrite_data first_timestep will be increased. + :return: None + """ + if header is None: + self.data.data = data + else: + self.data = DoubleHeaderNumpyArray(data, header) + + self._last_timestep = None + if self.forward_in_time_on_next_override: + self.first_timestep += 1 + self.forward_in_time_on_next_override = forward_in_time_on_next_overwrite + + def scene_ts_to_node_ts(self, scene_ts) -> (np.ndarray, int, int): + """ + Transforms timestamp from scene into timeframe of node data. 
+ + :param scene_ts: Scene timesteps + :return: ts: Transformed timesteps, paddingl: Number of timesteps in scene range which are not available in + node data before data is available. paddingu: Number of timesteps in scene range which are not + available in node data after data is available. + """ + paddingl = (self.first_timestep - scene_ts[0]).clip(0) + paddingu = (scene_ts[1] - self.last_timestep).clip(0) + ts = np.array(scene_ts).clip(min=self.first_timestep, max=self.last_timestep) - self.first_timestep + return ts, paddingl, paddingu + + def history_points_at(self, ts) -> int: + """ + Number of history points in trajectory. Timestep is exclusive. + + :param ts: Scene timestep where the number of history points are queried. + :return: Number of history timesteps. + """ + return ts - self.first_timestep + + def get(self, tr_scene, state, padding=np.nan) -> np.ndarray: + """ + Returns a time range of multiple properties of the node. + + :param tr_scene: The timestep range (inklusive). + :param state: The state description for which the properties are returned. + :param padding: The value which should be used for padding if not enough information is available. + :return: Array of node property values. + """ + if tr_scene.size == 1: + tr_scene = np.array([tr_scene[0], tr_scene[0]]) + length = tr_scene[1] - tr_scene[0] + 1 # tr is inclusive + tr, paddingl, paddingu = self.scene_ts_to_node_ts(tr_scene) + data_array = self.data[tr[0] : tr[1] + 1, state] + padded_data_array = np.full((length, data_array.shape[1]), fill_value=padding) + padded_data_array[paddingl : length - paddingu] = data_array + return padded_data_array + + @property + def timesteps(self) -> int: + """ + Number of available timesteps for node. + + :return: Number of available timesteps. + """ + return self.data.shape[0] + + @property + def last_timestep(self) -> int: + """ + Nodes last timestep in the Scene. + + :return: Nodes last timestep. + """ + if self._last_timestep is None: + self._last_timestep = self.first_timestep + self.timesteps - 1 + return self._last_timestep + + +class MultiNode(Node): + def __init__(self, node_type, node_id, nodes_list, is_robot=False): + super(MultiNode, self).__init__(node_type, node_id, data=None, is_robot=is_robot) + self.nodes_list = nodes_list + for node in self.nodes_list: + node.is_robot = is_robot + + self.first_timestep = min(node.first_timestep for node in self.nodes_list) + self._last_timestep = max(node.last_timestep for node in self.nodes_list) + + starts = np.array([node.first_timestep for node in self.nodes_list], dtype=np.int64) + ends = np.array([node.last_timestep for node in self.nodes_list], dtype=np.int64) + ids = np.arange(len(self.nodes_list), dtype=np.int64) + self.interval_tree = NCLS(starts, ends, ids) + + @staticmethod + def find_non_overlapping_nodes(nodes_list, min_timesteps=1) -> list: + """ + Greedily finds a set of non-overlapping nodes in the provided scene. + + :return: A list of non-overlapping nodes. 
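+        Nodes are sorted by last_timestep and greedily kept whenever their first_timestep is at or
+        after the previously kept node's last_timestep (classic interval scheduling).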
+ """ + non_overlapping_nodes = list() + nodes = sorted(nodes_list, key=lambda n: n.last_timestep) + current_time = 0 + for node in nodes: + if node.first_timestep >= current_time and node.timesteps >= min_timesteps: + # Include the node + non_overlapping_nodes.append(node) + current_time = node.last_timestep + + return non_overlapping_nodes + + def get_node_at_timesteps(self, scene_ts) -> Node: + possible_node_ranges = list(self.interval_tree.find_overlap(scene_ts[0], scene_ts[1] + 1)) + if not possible_node_ranges: + return Node( + node_type=self.type, node_id="EMPTY", data=self.nodes_list[0].data * np.nan, is_robot=self.is_robot + ) + + node_idx = random.choice(possible_node_ranges)[2] + return self.nodes_list[node_idx] + + def scene_ts_to_node_ts(self, scene_ts) -> (Node, np.ndarray, int, int): + """ + Transforms timestamp from scene into timeframe of node data. + + :param scene_ts: Scene timesteps + :return: ts: Transformed timesteps, paddingl: Number of timesteps in scene range which are not available in + node data before data is available. paddingu: Number of timesteps in scene range which are not + available in node data after data is available. + """ + possible_node_ranges = list(self.interval_tree.find_overlap(scene_ts[0], scene_ts[1] + 1)) + if not possible_node_ranges: + return None, None, None, None + + node_idx = random.choice(possible_node_ranges)[2] + node = self.nodes_list[node_idx] + + paddingl = (node.first_timestep - scene_ts[0]).clip(0) + paddingu = (scene_ts[1] - node.last_timestep).clip(0) + ts = np.array(scene_ts).clip(min=node.first_timestep, max=node.last_timestep) - node.first_timestep + return node, ts, paddingl, paddingu + + def get(self, tr_scene, state, padding=np.nan) -> np.ndarray: + if tr_scene.size == 1: + tr_scene = np.array([tr_scene, tr_scene]) + length = tr_scene[1] - tr_scene[0] + 1 # tr is inclusive + + node, tr, paddingl, paddingu = self.scene_ts_to_node_ts(tr_scene) + if node is None: + state_length = sum([len(entity_dims) for entity_dims in state.values()]) + return np.full((length, state_length), fill_value=padding) + + data_array = node.data[tr[0] : tr[1] + 1, state] + padded_data_array = np.full((length, data_array.shape[1]), fill_value=padding) + padded_data_array[paddingl : length - paddingu] = data_array + return padded_data_array + + def get_all(self, tr_scene, state, padding=np.nan) -> np.ndarray: + # Assumption here is that the user is asking for all of the data in this MultiNode and to return it within a + # full scene-sized output array. + assert tr_scene.size == 2 and tr_scene[0] == 0 and self.last_timestep <= tr_scene[1] + length = tr_scene[1] - tr_scene[0] + 1 # tr is inclusive + state_length = sum([len(entity_dims) for entity_dims in state.values()]) + padded_data_array = np.full((length, state_length), fill_value=padding) + for node in self.nodes_list: + padded_data_array[node.first_timestep : node.last_timestep + 1] = node.data[:, state] + + return padded_data_array + + def history_points_at(self, ts) -> int: + """ + Number of history points in trajectory. Timestep is exclusive. + + :param ts: Scene timestep where the number of history points are queried. + :return: Number of history timesteps. + """ + node_idx = next(self.interval_tree.find_overlap(ts, ts + 1))[2] + node = self.nodes_list[node_idx] + return ts - node.first_timestep + + @property + def timesteps(self) -> int: + """ + Number of available timesteps for node. + + :return: Number of available timesteps. 
+ """ + return self._last_timestep - self.first_timestep + 1 diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node_type.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node_type.py new file mode 100644 index 000000000..1513e487a --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node_type.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +class NodeType(object): + def __init__(self, name, value): + self.name = name + self.value = value + + def __repr__(self): + return self.name + + def __eq__(self, other): + if type(other) == str and self.name == other: + return True + else: + return isinstance(other, self.__class__) and self.name == other.name + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash(self.name) + + def __add__(self, other): + return self.name + other + + +class NodeTypeEnum(list): + def __init__(self, node_type_list): + self.node_type_list = node_type_list + node_types = [NodeType(name, node_type_list.index(name) + 1) for name in node_type_list] + super().__init__(node_types) + + def __getattr__(self, name): + if not name.startswith("_") and name in object.__getattribute__(self, "node_type_list"): + return self[object.__getattribute__(self, "node_type_list").index(name)] + else: + return object.__getattribute__(self, name) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene.py new file mode 100644 index 000000000..38430e607 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene.py @@ -0,0 +1,218 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import copy +import numpy as np +from .scene_graph import TemporalSceneGraph, SceneGraph +from .node import MultiNode + + +class Scene(object): + def __init__(self, timesteps, map=None, dt=1, name="", frequency_multiplier=1, aug_func=None, non_aug_scene=None): + self.map = map + self.timesteps = timesteps + self.dt = dt + self.name = name + + self.nodes = [] + + self.robot = None + + self.temporal_scene_graph = None + + self.frequency_multiplier = frequency_multiplier + + self.description = "" + + self.aug_func = aug_func + self.non_aug_scene = non_aug_scene + + def add_robot_from_nodes(self, robot_type): + scenes = [self] + if hasattr(self, "augmented"): + scenes += self.augmented + + for scn in scenes: + nodes_list = [node for node in scn.nodes if node.type == robot_type] + non_overlapping_nodes = MultiNode.find_non_overlapping_nodes(nodes_list, min_timesteps=3) + scn.robot = MultiNode(robot_type, "ROBOT", non_overlapping_nodes, is_robot=True) + + for node in non_overlapping_nodes: + scn.nodes.remove(node) + scn.nodes.append(scn.robot) + + def get_clipped_input_dict(self, timestep, state): + input_dict = dict() + existing_nodes = self.get_nodes_clipped_at_time(timesteps=np.array([timestep]), state=state) + tr_scene = np.array([timestep, timestep]) + for node in existing_nodes: + input_dict[node] = node.get(tr_scene, state[node.type]) + + return input_dict + + def get_scene_graph( + self, timestep, attention_radius=None, edge_addition_filter=None, edge_removal_filter=None + ) -> SceneGraph: + """ + Returns the Scene Graph for a given timestep. 
If the Temporal Scene Graph was pre calculated, + the temporal scene graph is sliced. Otherwise the scene graph is calculated on the spot. + + :param timestep: Timestep for which the scene graph is returned. + :param attention_radius: Attention radius for each node type permutation. (Only online) + :param edge_addition_filter: Filter for adding edges (Only online) + :param edge_removal_filter: Filter for removing edges (Only online) + :return: Scene Graph for given timestep. + """ + if self.temporal_scene_graph is None: + timestep_range = np.array([timestep - len(edge_removal_filter), timestep]) + node_pos_dict = dict() + present_nodes = self.present_nodes(np.array([timestep])) + + for node in present_nodes[timestep]: + node_pos_dict[node] = np.squeeze(node.get(timestep_range, {"position": ["x", "y"]})) + tsg = TemporalSceneGraph.create_from_temp_scene_dict( + node_pos_dict, + attention_radius, + duration=(len(edge_removal_filter) + 1), + edge_addition_filter=edge_addition_filter, + edge_removal_filter=edge_removal_filter, + ) + + return tsg.to_scene_graph( + t=len(edge_removal_filter), t_hist=len(edge_removal_filter), t_fut=len(edge_addition_filter) + ) + else: + return self.temporal_scene_graph.to_scene_graph( + timestep, len(edge_removal_filter), len(edge_addition_filter) + ) + + def calculate_scene_graph(self, attention_radius, edge_addition_filter=None, edge_removal_filter=None) -> None: + """ + Calculate the Temporal Scene Graph for the entire Scene. + + :param attention_radius: Attention radius for each node type permutation. + :param edge_addition_filter: Filter for adding edges. + :param edge_removal_filter: Filter for removing edges. + :return: None + """ + timestep_range = np.array([0, self.timesteps - 1]) + node_pos_dict = dict() + + for node in self.nodes: + if type(node) is MultiNode: + node_pos_dict[node] = np.squeeze(node.get_all(timestep_range, {"position": ["x", "y"]})) + else: + node_pos_dict[node] = np.squeeze(node.get(timestep_range, {"position": ["x", "y"]})) + + self.temporal_scene_graph = TemporalSceneGraph.create_from_temp_scene_dict( + node_pos_dict, + attention_radius, + duration=self.timesteps, + edge_addition_filter=edge_addition_filter, + edge_removal_filter=edge_removal_filter, + ) + + def duration(self): + """ + Calculates the duration of the scene. + + :return: Duration of the scene in s. + """ + return self.timesteps * self.dt + + def present_nodes( + self, timesteps, type=None, min_history_timesteps=0, min_future_timesteps=0, return_robot=True + ) -> dict: + """ + Finds all present nodes in the scene at a given timestemp + + :param timesteps: Timestep(s) for which all present nodes should be returned + :param type: Node type which should be returned. If None all node types are returned. + :param min_history_timesteps: Minimum history timesteps of a node to be returned. + :param min_future_timesteps: Minimum future timesteps of a node to be returned. + :param return_robot: Return a node if it is the robot. + :return: Dictionary with timesteps as keys and list of nodes as value. 
+ """ + + present_nodes = {} + + for node in self.nodes: + if node.is_robot and not return_robot: + continue + if type is None or node.type == type: + lower_bound = timesteps - min_history_timesteps + upper_bound = timesteps + min_future_timesteps + mask = (node.first_timestep <= lower_bound) & (upper_bound <= node.last_timestep) + if mask.any(): + timestep_indices_present = np.nonzero(mask)[0] + for timestep_index_present in timestep_indices_present: + if timesteps[timestep_index_present] in present_nodes.keys(): + present_nodes[timesteps[timestep_index_present]].append(node) + else: + present_nodes[timesteps[timestep_index_present]] = [node] + + return present_nodes + + def get_nodes_clipped_at_time(self, timesteps, state): + clipped_nodes = list() + + existing_nodes = self.present_nodes(timesteps) + all_nodes = set().union(*existing_nodes.values()) + if not all_nodes: + return clipped_nodes + + tr_scene = np.array([timesteps.min(), timesteps.max()]) + data_header_memo = dict() + for node in all_nodes: + if isinstance(node, MultiNode): + copied_node = copy.deepcopy(node.get_node_at_timesteps(tr_scene)) + copied_node.id = self.robot.id + else: + copied_node = copy.deepcopy(node) + + clipped_value = node.get(tr_scene, state[node.type]) + + if node.type not in data_header_memo: + data_header = list() + for quantity, values in state[node.type].items(): + for value in values: + data_header.append((quantity, value)) + + data_header_memo[node.type] = data_header + + copied_node.overwrite_data(clipped_value, data_header_memo[node.type]) + copied_node.first_timestep = tr_scene[0] + + clipped_nodes.append(copied_node) + + return clipped_nodes + + def sample_timesteps(self, batch_size, min_future_timesteps=0) -> np.ndarray: + """ + Sample a batch size of possible timesteps for the scene. + + :param batch_size: Number of timesteps to sample. + :param min_future_timesteps: Minimum future timesteps in the scene for a timestep to be returned. + :return: Numpy Array of sampled timesteps. + """ + if batch_size > self.timesteps: + batch_size = self.timesteps + return np.random.choice(np.arange(0, self.timesteps - min_future_timesteps), size=batch_size, replace=False) + + def augment(self): + if self.aug_func is not None: + return self.aug_func(self) + else: + return self + + def get_node_by_id(self, id): + for node in self.nodes: + if node.id == id: + return node + + def __repr__(self): + return ( + f"Scene: Duration: {self.duration()}s," + f" Nodes: {len(self.nodes)}," + f" Map: {'Yes' if self.map is not None else 'No'}." 
+ ) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene_graph.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene_graph.py new file mode 100644 index 000000000..63c15bb2c --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene_graph.py @@ -0,0 +1,536 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +from scipy.spatial.distance import pdist, squareform +import scipy.signal as ss +from collections import defaultdict +import warnings +from .node import Node + + +class Edge(object): + def __init__(self, curr_node, other_node): + self.id = self.get_edge_id(curr_node, other_node) + self.type = self.get_edge_type(curr_node, other_node) + self.curr_node = curr_node + self.other_node = other_node + + @staticmethod + def get_edge_id(n1, n2): + raise NotImplementedError("Use one of the Edge subclasses!") + + @staticmethod + def get_str_from_types(nt1, nt2): + raise NotImplementedError("Use one of the Edge subclasses!") + + @staticmethod + def get_edge_type(n1, n2): + raise NotImplementedError("Use one of the Edge subclasses!") + + def __eq__(self, other): + return isinstance(other, self.__class__) and self.id == other.id + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash(self.id) + + def __repr__(self): + return self.id + + +class UndirectedEdge(Edge): + def __init__(self, curr_node, other_node): + super(UndirectedEdge, self).__init__(curr_node, other_node) + + @staticmethod + def get_edge_id(n1, n2): + return "-".join(sorted([str(n1), str(n2)])) + + @staticmethod + def get_str_from_types(nt1, nt2): + return "-".join(sorted([nt1.name, nt2.name])) + + @staticmethod + def get_edge_type(n1, n2): + return "-".join(sorted([n1.type.name, n2.type.name])) + + +class DirectedEdge(Edge): + def __init__(self, curr_node, other_node): + super(DirectedEdge, self).__init__(curr_node, other_node) + + @staticmethod + def get_edge_id(n1, n2): + return "->".join([str(n1), str(n2)]) + + @staticmethod + def get_str_from_types(nt1, nt2): + return "->".join([nt1.name, nt2.name]) + + @staticmethod + def get_edge_type(n1, n2): + return "->".join([n1.type.name, n2.type.name]) + + +class TemporalSceneGraph(object): + def __init__( + self, + edge_radius, + nodes=None, + adj_cube=np.zeros((1, 0, 0)), + weight_cube=np.zeros((1, 0, 0)), + node_type_mat=np.zeros((0, 0)), + edge_scaling=None, + ): + self.edge_radius = edge_radius + self.nodes = nodes + if nodes is None: + self.nodes = np.array([]) + self.adj_cube = adj_cube + self.weight_cube = weight_cube + self.node_type_mat = node_type_mat + self.adj_mat = np.max(self.adj_cube, axis=0).clip(max=1.0) + self.edge_scaling = edge_scaling + self.node_index_lookup = None + self.calculate_node_index_lookup() + + def calculate_node_index_lookup(self): + node_index_lookup = dict() + for i, node in enumerate(self.nodes): + node_index_lookup[node] = i + + self.node_index_lookup = node_index_lookup + + def get_num_edges(self, t=0): + return np.sum(self.adj_cube[t]) // 2 + + def get_index(self, node): + return self.node_index_lookup[node] + + @classmethod + def create_from_temp_scene_dict( + cls, + scene_temp_dict, + attention_radius, + duration=1, + edge_addition_filter=None, + edge_removal_filter=None, + online=False, + ): + """ + Construct a spatiotemporal graph from node positions in a dataset. 
+ + :param scene_temp_dict: Dict with all nodes in scene as keys and np.ndarray with positions as value + :param attention_radius: Attention radius dict. + :param duration: Temporal duration of the graph. + :param edge_addition_filter: - + :param edge_removal_filter: - + :return: TemporalSceneGraph + """ + + nodes = scene_temp_dict.keys() + N = len(nodes) + total_timesteps = duration + + if N == 0: + return TemporalSceneGraph(attention_radius) + + position_cube = np.full((total_timesteps, N, 2), np.nan) + + adj_cube = np.zeros((total_timesteps, N, N), dtype=np.int8) + dist_cube = np.zeros((total_timesteps, N, N), dtype=np.float) + + node_type_mat = np.zeros((N, N), dtype=np.int8) + node_attention_mat = np.zeros((N, N), dtype=np.float) + + for node_idx, node in enumerate(nodes): + if online: + # RingBuffers do not have a fixed constant size. Instead, they grow up to their capacity. Thus, + # we need to fill the values preceding the RingBuffer values with NaNs to make them fill the + # position_cube. + position_cube[-scene_temp_dict[node].shape[0] :, node_idx] = scene_temp_dict[node] + else: + position_cube[:, node_idx] = scene_temp_dict[node] + + node_type_mat[:, node_idx] = node.type.value + for node_idx_from, node_from in enumerate(nodes): + node_attention_mat[node_idx_from, node_idx] = attention_radius[(node_from.type, node.type)] + + np.fill_diagonal(node_type_mat, 0) + + for timestep in range(position_cube.shape[0]): + dists = squareform(pdist(position_cube[timestep], metric="euclidean")) + + # Put a 1 for all agent pairs which are closer than the edge_radius. + # Can produce a warning as dists can be nan if no data for node is available. + # This is accepted as nan <= x evaluates to False + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + adj_matrix = (dists <= node_attention_mat).astype(np.int8) * node_type_mat + + # Remove self-loops. + np.fill_diagonal(adj_matrix, 0) + + adj_cube[timestep] = adj_matrix + dist_cube[timestep] = dists + + dist_cube[np.isnan(dist_cube)] = 0.0 + weight_cube = np.divide(1.0, dist_cube, out=np.zeros_like(dist_cube), where=(dist_cube > 0.0)) + edge_scaling = None + if edge_addition_filter is not None and edge_removal_filter is not None: + edge_scaling = cls.calculate_edge_scaling(adj_cube, edge_addition_filter, edge_removal_filter) + tsg = cls( + attention_radius, np.array(list(nodes)), adj_cube, weight_cube, node_type_mat, edge_scaling=edge_scaling + ) + return tsg + + @staticmethod + def calculate_edge_scaling(adj_cube, edge_addition_filter, edge_removal_filter): + shifted_right = np.pad( + adj_cube, ((len(edge_addition_filter) - 1, 0), (0, 0), (0, 0)), "constant", constant_values=0 + ) + + new_edges = np.minimum(ss.convolve(shifted_right, np.reshape(edge_addition_filter, (-1, 1, 1)), "full"), 1.0)[ + (len(edge_addition_filter) - 1) : -(len(edge_addition_filter) - 1) + ] + + new_edges[adj_cube == 0] = 0 + + result = np.minimum(ss.convolve(new_edges, np.reshape(edge_removal_filter, (-1, 1, 1)), "full"), 1.0)[ + : -(len(edge_removal_filter) - 1) + ] + + return result + + def to_scene_graph(self, t, t_hist=0, t_fut=0): + """ + Creates a Scene Graph from a Temporal Scene Graph + + :param t: Time in Temporal Scene Graph for which Scene Graph is created. + :param t_hist: Number of history timesteps which are considered to form edges in Scene Graph. + :param t_fut: Number of future timesteps which are considered to form edges in Scene Graph. 
+ :return: SceneGraph + """ + lower_t = np.clip(t - t_hist, a_min=0, a_max=None) + higher_t = np.clip(t + t_fut + 1, a_min=None, a_max=self.adj_cube.shape[0] + 1) + adj_mat = np.max(self.adj_cube[lower_t:higher_t], axis=0) + weight_mat = np.max(self.weight_cube[lower_t:higher_t], axis=0) + return SceneGraph( + self.edge_radius, + self.nodes, + adj_mat, + weight_mat, + self.node_type_mat, + self.node_index_lookup, + edge_scaling=self.edge_scaling[t] if self.edge_scaling is not None else None, + ) + + +class SceneGraph(object): + def __init__( + self, + edge_radius, + nodes=None, + adj_mat=np.zeros((0, 0)), + weight_mat=np.zeros((0, 0)), + node_type_mat=np.zeros((0, 0)), + node_index_lookup=None, + edge_scaling=None, + ): + self.edge_radius = edge_radius + self.nodes = nodes + if nodes is None: + self.nodes = np.array([]) + self.node_type_mat = node_type_mat + self.adj_mat = adj_mat + self.weight_mat = weight_mat + self.edge_scaling = edge_scaling + self.node_index_lookup = node_index_lookup + + def get_index(self, node): + return self.node_index_lookup[node] + + def get_num_edges(self): + return np.sum(self.adj_mat) // 2 + + def get_neighbors(self, node, node_type): + """ + Get all neighbors of a node. + + :param node: Node for which all neighbors are returned. + :param node_type: Specifies node types which are returned. + :return: List of all neighbors. + """ + node_index = self.get_index(node) + connection_mask = self.get_connection_mask(node_index) + mask = (self.node_type_mat[node_index] == node_type.value) * connection_mask + return self.nodes[mask] + + def get_edge_scaling(self, node=None): + if node is None: + return self.edge_scaling + else: + node_index = self.get_index(node) + connection_mask = self.get_connection_mask(node_index) + return self.edge_scaling[node_index, connection_mask] + + def get_edge_weight(self, node=None): + if node is None: + return self.weight_mat + else: + node_index = self.get_index(node) + connection_mask = self.get_connection_mask(node_index) + return self.weight_mat[node_index, connection_mask] + + def get_connection_mask(self, node_index): + if self.edge_scaling is None: # We do not use edge scaling + return self.adj_mat[node_index] > 0.0 + else: + return self.edge_scaling[node_index] > 1e-2 + + def __sub__(self, other): + new_nodes = [node for node in self.nodes if node not in other.nodes] + removed_nodes = [node for node in other.nodes if node not in self.nodes] + + our_types = set(node.type for node in self.nodes) + other_types = set(node.type for node in other.nodes) + all_node_types = our_types | other_types + + new_neighbors = defaultdict(lambda: defaultdict(set)) + for node in self.nodes: + if node in removed_nodes: + continue + + if node in other.nodes: + for node_type in all_node_types: + new_items = set(self.get_neighbors(node, node_type)) - set(other.get_neighbors(node, node_type)) + if len(new_items) > 0: + new_neighbors[node][DirectedEdge.get_edge_type(node, Node(node_type, None, None))] = new_items + else: + for node_type in our_types: + neighbors = self.get_neighbors(node, node_type) + if len(neighbors) > 0: + new_neighbors[node][DirectedEdge.get_edge_type(node, Node(node_type, None, None))] = set( + neighbors + ) + + removed_neighbors = defaultdict(lambda: defaultdict(set)) + for node in other.nodes: + if node in removed_nodes: + continue + + if node in self.nodes: + for node_type in all_node_types: + removed_items = set(other.get_neighbors(node, node_type)) - set(self.get_neighbors(node, node_type)) + if len(removed_items) > 0: + 
removed_neighbors[node][ + DirectedEdge.get_edge_type(node, Node(node_type, None, None)) + ] = removed_items + else: + for node_type in other_types: + neighbors = other.get_neighbors(node, node_type) + if len(neighbors) > 0: + removed_neighbors[node][DirectedEdge.get_edge_type(node, Node(node_type, None, None))] = set( + neighbors + ) + + return new_nodes, removed_nodes, new_neighbors, removed_neighbors + + +if __name__ == "__main__": + from environment import NodeTypeEnum + import time + + # # # # # # # # # # # # # # # # # + # Testing edge mask calculation # + # # # # # # # # # # # # # # # # # + B = np.array( + [ + [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0], + [1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0], + ] + )[:, :, np.newaxis, np.newaxis] + print(B.shape) + + edge_addition_filter = [0.25, 0.5, 0.75, 1.0] + edge_removal_filter = [1.0, 0.5, 0.0] + for i in range(B.shape[0]): + A = B[i] # (time, N, N) + + print(A[:, 0, 0]) + + start = time.time() + new_edges = np.minimum(ss.convolve(A, np.reshape(edge_addition_filter, (-1, 1, 1)), "full"), 1.0)[ + (len(edge_addition_filter) - 1) : + ] + old_edges = np.minimum(ss.convolve(A, np.reshape(edge_removal_filter, (-1, 1, 1)), "full"), 1.0)[ + : -(len(edge_removal_filter) - 1) + ] + res = np.minimum(new_edges + old_edges, 1.0)[:, 0, 0] + end = time.time() + print(end - start) + print(res) + + start = time.time() + res = TemporalSceneGraph.calculate_edge_scaling(A, edge_addition_filter, edge_removal_filter)[:, 0, 0] + end = time.time() + print(end - start) + print(res) + + print("-" * 40) + + # # # # # # # # # # # # # # # + # Testing graph subtraction # + # # # # # # # # # # # # # # # + print("\n" + "-" * 40 + "\n") + + node_type_list = ["PEDESTRIAN", "BICYCLE", "VEHICLE"] + nte = NodeTypeEnum(node_type_list) + + attention_radius = dict() + attention_radius[(nte.PEDESTRIAN, nte.PEDESTRIAN)] = 5.0 + attention_radius[(nte.PEDESTRIAN, nte.VEHICLE)] = 20.0 + attention_radius[(nte.PEDESTRIAN, nte.BICYCLE)] = 10.0 + attention_radius[(nte.VEHICLE, nte.PEDESTRIAN)] = 20.0 + attention_radius[(nte.VEHICLE, nte.VEHICLE)] = 20.0 + attention_radius[(nte.VEHICLE, nte.BICYCLE)] = 20.0 + attention_radius[(nte.BICYCLE, nte.PEDESTRIAN)] = 10.0 + attention_radius[(nte.BICYCLE, nte.VEHICLE)] = 20.0 + attention_radius[(nte.BICYCLE, nte.BICYCLE)] = 10.0 + + scene_dict1 = { + Node(nte.PEDESTRIAN, node_id="1"): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id="2"): np.array([0, 1]), + } + sg1 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict1, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0], + ).to_scene_graph(t=0) + + scene_dict2 = { + Node(nte.PEDESTRIAN, node_id="1"): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id="2"): np.array([1, 1]), + } + sg2 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict2, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0], + ).to_scene_graph(t=0) + + new_nodes, removed_nodes, new_neighbors, removed_neighbors = sg2 - sg1 + print("New Nodes:", new_nodes) + print("Removed Nodes:", removed_nodes) + print("New Neighbors:", new_neighbors) + print("Removed Neighbors:", removed_neighbors) + + # # # # # # # # # # # # # # # + print("\n" + "-" * 40 + "\n") + + scene_dict1 = { + Node(nte.PEDESTRIAN, node_id="1"): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id="2"): np.array([0, 1]), + } 
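Alongside the demo above, a minimal standalone sketch of how an adjacency matrix falls out of pairwise distances and an attention radius may be useful; it mirrors the thresholding done in create_from_temp_scene_dict, but the positions and the single radius are hypothetical and only one node type is assumed.

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform

# Three hypothetical pedestrian positions and one attention radius.
positions = np.array([[1.0, 0.0], [0.0, 1.0], [20.0, 1.0]])
attention_radius = 5.0

# Pairwise Euclidean distances, thresholded by the radius, self-loops removed.
dists = squareform(pdist(positions, metric="euclidean"))
adj = (dists <= attention_radius).astype(np.int8)
np.fill_diagonal(adj, 0)
print(adj)
# [[0 1 0]
#  [1 0 0]
#  [0 0 0]]
```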
+ sg1 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict1, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0], + ).to_scene_graph(t=0) + + scene_dict2 = { + Node(nte.PEDESTRIAN, node_id="1"): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id="2"): np.array([1, 1]), + Node(nte.PEDESTRIAN, node_id="3"): np.array([20, 1]), + } + sg2 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict2, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0], + ).to_scene_graph(t=0) + + new_nodes, removed_nodes, new_neighbors, removed_neighbors = sg2 - sg1 + print("New Nodes:", new_nodes) + print("Removed Nodes:", removed_nodes) + print("New Neighbors:", new_neighbors) + print("Removed Neighbors:", removed_neighbors) + + # # # # # # # # # # # # # # # + print("\n" + "-" * 40 + "\n") + + scene_dict1 = { + Node(nte.PEDESTRIAN, node_id="1"): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id="2"): np.array([0, 1]), + } + sg1 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict1, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0], + ).to_scene_graph(t=0) + + scene_dict2 = { + Node(nte.PEDESTRIAN, node_id="1"): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id="2"): np.array([10, 1]), + Node(nte.PEDESTRIAN, node_id="3"): np.array([20, 1]), + } + sg2 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict2, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0], + ).to_scene_graph(t=0) + + new_nodes, removed_nodes, new_neighbors, removed_neighbors = sg2 - sg1 + print("New Nodes:", new_nodes) + print("Removed Nodes:", removed_nodes) + print("New Neighbors:", new_neighbors) + print("Removed Neighbors:", removed_neighbors) + + # # # # # # # # # # # # # # # + print("\n" + "-" * 40 + "\n") + + scene_dict1 = { + Node(nte.PEDESTRIAN, node_id="1"): np.array([0, 0]), + Node(nte.PEDESTRIAN, node_id="2"): np.array([0, 1]), + } + sg1 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict1, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0], + ).to_scene_graph(t=0) + + scene_dict2 = { + Node(nte.PEDESTRIAN, node_id="2"): np.array([10, 1]), + Node(nte.PEDESTRIAN, node_id="3"): np.array([12, 1]), + Node(nte.PEDESTRIAN, node_id="4"): np.array([13, 1]), + } + sg2 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict2, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0], + ).to_scene_graph(t=0) + + new_nodes, removed_nodes, new_neighbors, removed_neighbors = sg2 - sg1 + print("New Nodes:", new_nodes) + print("Removed Nodes:", removed_nodes) + print("New Neighbors:", new_neighbors) + print("Removed Neighbors:", removed_neighbors) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/__init__.py new file mode 100644 index 000000000..91ce29390 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/__init__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from .evaluation import compute_batch_statistics, 
log_batch_errors, print_batch_errors diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/evaluation.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/evaluation.py new file mode 100644 index 000000000..fac4a45eb --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/evaluation.py @@ -0,0 +1,142 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +from scipy.interpolate import RectBivariateSpline +from scipy.ndimage import binary_dilation +from scipy.stats import gaussian_kde +from utils import prediction_output_to_trajectories +import visualization +from matplotlib import pyplot as plt + + +def compute_ade(predicted_trajs, gt_traj): + error = np.linalg.norm(predicted_trajs - gt_traj, axis=-1) + ade = np.mean(error, axis=-1) + return ade.flatten() + + +def compute_fde(predicted_trajs, gt_traj): + final_error = np.linalg.norm(predicted_trajs[:, :, -1] - gt_traj[-1], axis=-1) + return final_error.flatten() + + +def compute_kde_nll(predicted_trajs, gt_traj): + kde_ll = 0.0 + log_pdf_lower_bound = -20 + num_timesteps = gt_traj.shape[0] + num_batches = predicted_trajs.shape[0] + + for batch_num in range(num_batches): + for timestep in range(num_timesteps): + try: + kde = gaussian_kde(predicted_trajs[batch_num, :, timestep].T) + pdf = np.clip(kde.logpdf(gt_traj[timestep].T), a_min=log_pdf_lower_bound, a_max=None)[0] + kde_ll += pdf / (num_timesteps * num_batches) + except np.linalg.LinAlgError: + kde_ll = np.nan + + return -kde_ll + + +def compute_obs_violations(predicted_trajs, map): + obs_map = map.data + + interp_obs_map = RectBivariateSpline( + range(obs_map.shape[1]), range(obs_map.shape[0]), binary_dilation(obs_map.T, iterations=4), kx=1, ky=1 + ) + + old_shape = predicted_trajs.shape + pred_trajs_map = map.to_map_points(predicted_trajs.reshape((-1, 2))) + + traj_obs_values = interp_obs_map(pred_trajs_map[:, 0], pred_trajs_map[:, 1], grid=False) + traj_obs_values = traj_obs_values.reshape((old_shape[0], old_shape[1])) + num_viol_trajs = np.sum(traj_obs_values.max(axis=1) > 0, dtype=float) + + return num_viol_trajs + + +def compute_batch_statistics( + prediction_output_dict, + dt, + max_hl, + ph, + node_type_enum, + kde=True, + obs=False, + map=None, + prune_ph_to_future=False, + best_of=False, +): + + (prediction_dict, _, futures_dict) = prediction_output_to_trajectories( + prediction_output_dict, dt, max_hl, ph, prune_ph_to_future=prune_ph_to_future + ) + + batch_error_dict = dict() + for node_type in node_type_enum: + batch_error_dict[node_type] = {"ade": list(), "fde": list(), "kde": list(), "obs_viols": list()} + + for t in prediction_dict.keys(): + for node in prediction_dict[t].keys(): + ade_errors = compute_ade(prediction_dict[t][node], futures_dict[t][node]) + fde_errors = compute_fde(prediction_dict[t][node], futures_dict[t][node]) + if kde: + kde_ll = compute_kde_nll(prediction_dict[t][node], futures_dict[t][node]) + else: + kde_ll = 0 + if obs: + obs_viols = compute_obs_violations(prediction_dict[t][node], map) + else: + obs_viols = 0 + if best_of: + ade_errors = np.min(ade_errors, keepdims=True) + fde_errors = np.min(fde_errors, keepdims=True) + kde_ll = np.min(kde_ll) + batch_error_dict[node.type]["ade"].extend(list(ade_errors)) + batch_error_dict[node.type]["fde"].extend(list(fde_errors)) + batch_error_dict[node.type]["kde"].extend([kde_ll]) + batch_error_dict[node.type]["obs_viols"].extend([obs_viols]) + + return 
batch_error_dict + + +def log_batch_errors(batch_errors_list, log_writer, namespace, curr_iter, bar_plot=[], box_plot=[]): + for node_type in batch_errors_list[0].keys(): + for metric in batch_errors_list[0][node_type].keys(): + metric_batch_error = [] + for batch_errors in batch_errors_list: + metric_batch_error.extend(batch_errors[node_type][metric]) + + if len(metric_batch_error) > 0: + log_writer.add_histogram(f"{node_type.name}/{namespace}/{metric}", metric_batch_error, curr_iter) + log_writer.add_scalar( + f"{node_type.name}/{namespace}/{metric}_mean", np.mean(metric_batch_error), curr_iter + ) + log_writer.add_scalar( + f"{node_type.name}/{namespace}/{metric}_median", np.median(metric_batch_error), curr_iter + ) + + if metric in bar_plot: + pd = {"dataset": [namespace] * len(metric_batch_error), metric: metric_batch_error} + kde_barplot_fig, ax = plt.subplots(figsize=(5, 5)) + visualization.visualization_utils.plot_barplots(ax, pd, "dataset", metric) + log_writer.add_figure(f"{node_type.name}/{namespace}/{metric}_bar_plot", kde_barplot_fig, curr_iter) + + if metric in box_plot: + mse_fde_pd = {"dataset": [namespace] * len(metric_batch_error), metric: metric_batch_error} + fig, ax = plt.subplots(figsize=(5, 5)) + visualization.visualization_utils.plot_boxplots(ax, mse_fde_pd, "dataset", metric) + log_writer.add_figure(f"{node_type.name}/{namespace}/{metric}_box_plot", fig, curr_iter) + + +def print_batch_errors(batch_errors_list, namespace, curr_iter): + for node_type in batch_errors_list[0].keys(): + for metric in batch_errors_list[0][node_type].keys(): + metric_batch_error = [] + for batch_errors in batch_errors_list: + metric_batch_error.extend(batch_errors[node_type][metric]) + + if len(metric_batch_error) > 0: + print(f"{curr_iter}: {node_type.name}/{namespace}/{metric}_mean", np.mean(metric_batch_error)) + print(f"{curr_iter}: {node_type.name}/{namespace}/{metric}_median", np.median(metric_batch_error)) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/__init__.py new file mode 100644 index 000000000..be76653b0 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/__init__.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from model.trajectron import Trajectron +from model.mgcvae import MultimodalGenerativeCVAE diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/__init__.py new file mode 100644 index 000000000..ebf3ee86c --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from .discrete_latent import DiscreteLatent +from .gmm2d import GMM2D +from .map_encoder import CNNMapEncoder +from .additive_attention import AdditiveAttention, TemporallyBatchedAdditiveAttention diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/additive_attention.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/additive_attention.py new file mode 100644 index 000000000..0d1ec7f2d --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/additive_attention.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: (c) 
2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class AdditiveAttention(nn.Module): + # Implementing the attention module of Bahdanau et al. 2015 where + # score(h_j, s_(i-1)) = v . tanh(W_1 h_j + W_2 s_(i-1)) + def __init__(self, encoder_hidden_state_dim, decoder_hidden_state_dim, internal_dim=None): + super(AdditiveAttention, self).__init__() + + if internal_dim is None: + internal_dim = int((encoder_hidden_state_dim + decoder_hidden_state_dim) / 2) + + self.w1 = nn.Linear(encoder_hidden_state_dim, internal_dim, bias=False) + self.w2 = nn.Linear(decoder_hidden_state_dim, internal_dim, bias=False) + self.v = nn.Linear(internal_dim, 1, bias=False) + + def score(self, encoder_state, decoder_state): + # encoder_state is of shape (batch, enc_dim) + # decoder_state is of shape (batch, dec_dim) + # return value should be of shape (batch, 1) + return self.v(torch.tanh(self.w1(encoder_state) + self.w2(decoder_state))) + + def forward(self, encoder_states, decoder_state): + # encoder_states is of shape (batch, num_enc_states, enc_dim) + # decoder_state is of shape (batch, dec_dim) + score_vec = torch.cat( + [self.score(encoder_states[:, i], decoder_state) for i in range(encoder_states.shape[1])], dim=1 + ) + # score_vec is of shape (batch, num_enc_states) + + attention_probs = torch.unsqueeze(F.softmax(score_vec, dim=1), dim=2) + # attention_probs is of shape (batch, num_enc_states, 1) + + final_context_vec = torch.sum(attention_probs * encoder_states, dim=1) + # final_context_vec is of shape (batch, enc_dim) + + return final_context_vec, attention_probs + + +class TemporallyBatchedAdditiveAttention(AdditiveAttention): + # Implementing the attention module of Bahdanau et al. 2015 where + # score(h_j, s_(i-1)) = v . 
tanh(W_1 h_j + W_2 s_(i-1)) + def __init__(self, encoder_hidden_state_dim, decoder_hidden_state_dim, internal_dim=None): + super(TemporallyBatchedAdditiveAttention, self).__init__( + encoder_hidden_state_dim, decoder_hidden_state_dim, internal_dim + ) + + def score(self, encoder_state, decoder_state): + # encoder_state is of shape (batch, num_enc_states, max_time, enc_dim) + # decoder_state is of shape (batch, max_time, dec_dim) + # return value should be of shape (batch, num_enc_states, max_time, 1) + return self.v(torch.tanh(self.w1(encoder_state) + torch.unsqueeze(self.w2(decoder_state), dim=1))) + + def forward(self, encoder_states, decoder_state): + # encoder_states is of shape (batch, num_enc_states, max_time, enc_dim) + # decoder_state is of shape (batch, max_time, dec_dim) + score_vec = self.score(encoder_states, decoder_state) + # score_vec is of shape (batch, num_enc_states, max_time, 1) + + attention_probs = F.softmax(score_vec, dim=1) + # attention_probs is of shape (batch, num_enc_states, max_time, 1) + + final_context_vec = torch.sum(attention_probs * encoder_states, dim=1) + # final_context_vec is of shape (batch, max_time, enc_dim) + + return final_context_vec, torch.squeeze(torch.transpose(attention_probs, 1, 2), dim=3) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/discrete_latent.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/discrete_latent.py new file mode 100644 index 000000000..222d826b1 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/discrete_latent.py @@ -0,0 +1,122 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.distributions as td +import numpy as np +from ..model_utils import ModeKeys + + +class DiscreteLatent(object): + def __init__(self, hyperparams, device): + self.hyperparams = hyperparams + self.z_dim = hyperparams["N"] * hyperparams["K"] + self.N = hyperparams["N"] + self.K = hyperparams["K"] + self.kl_min = hyperparams["kl_min"] + self.device = device + self.temp = None # filled in by MultimodalGenerativeCVAE.set_annealing_params + self.z_logit_clip = None # filled in by MultimodalGenerativeCVAE.set_annealing_params + self.p_dist = None # filled in by MultimodalGenerativeCVAE.encoder + self.q_dist = None # filled in by MultimodalGenerativeCVAE.encoder + + def dist_from_h(self, h, mode): + logits_separated = torch.reshape(h, (-1, self.N, self.K)) + logits_separated_mean_zero = logits_separated - torch.mean(logits_separated, dim=-1, keepdim=True) + if self.z_logit_clip is not None and mode == ModeKeys.TRAIN: + c = self.z_logit_clip + logits = torch.clamp(logits_separated_mean_zero, min=-c, max=c) + else: + logits = logits_separated_mean_zero + + return td.OneHotCategorical(logits=logits) + + def sample_q(self, num_samples, mode): + bs = self.p_dist.probs.size()[0] + num_components = self.N * self.K + z_NK = ( + torch.from_numpy(self.all_one_hot_combinations(self.N, self.K)) + .float() + .to(self.device) + .repeat(num_samples, bs) + ) + return torch.reshape(z_NK, (num_samples * num_components, -1, self.z_dim)) + + def sample_p(self, num_samples, mode, most_likely_z=False, full_dist=True, all_z_sep=False): + num_components = 1 + if full_dist: + bs = self.p_dist.probs.size()[0] + z_NK = ( + torch.from_numpy(self.all_one_hot_combinations(self.N, self.K)) + .float() + .to(self.device) + .repeat(num_samples, bs) + ) + num_components = self.K**self.N + k = 
num_samples * num_components + elif all_z_sep: + bs = self.p_dist.probs.size()[0] + z_NK = torch.from_numpy(self.all_one_hot_combinations(self.N, self.K)).float().to(self.device).repeat(1, bs) + k = self.K**self.N + num_samples = k + elif most_likely_z: + # Sampling the most likely z from p(z|x). + eye_mat = torch.eye(self.p_dist.event_shape[-1], device=self.device) + argmax_idxs = torch.argmax(self.p_dist.probs, dim=2) + z_NK = torch.unsqueeze(eye_mat[argmax_idxs], dim=0).expand(num_samples, -1, -1, -1) + k = num_samples + else: + z_NK = self.p_dist.sample((num_samples,)) + k = num_samples + + if mode == ModeKeys.PREDICT: + return torch.reshape(z_NK, (k, -1, self.N * self.K)), num_samples, num_components + else: + return torch.reshape(z_NK, (k, -1, self.N * self.K)) + + def kl_q_p(self, log_writer=None, prefix=None, curr_iter=None): + kl_separated = td.kl_divergence(self.q_dist, self.p_dist) + if len(kl_separated.size()) < 2: + kl_separated = torch.unsqueeze(kl_separated, dim=0) + + kl_minibatch = torch.mean(kl_separated, dim=0, keepdim=True) + + if log_writer is not None: + log_writer.add_scalar(prefix + "/true_kl", torch.sum(kl_minibatch), curr_iter) + + if self.kl_min > 0: + kl_lower_bounded = torch.clamp(kl_minibatch, min=self.kl_min) + kl = torch.sum(kl_lower_bounded) + else: + kl = torch.sum(kl_minibatch) + + return kl + + def q_log_prob(self, z): + k = z.size()[0] + z_NK = torch.reshape(z, [k, -1, self.N, self.K]) + return torch.sum(self.q_dist.log_prob(z_NK), dim=2) + + def p_log_prob(self, z): + k = z.size()[0] + z_NK = torch.reshape(z, [k, -1, self.N, self.K]) + return torch.sum(self.p_dist.log_prob(z_NK), dim=2) + + def get_p_dist_probs(self): + return self.p_dist.probs + + @staticmethod + def all_one_hot_combinations(N, K): + return np.eye(K).take(np.reshape(np.indices([K] * N), [N, -1]).T, axis=0).reshape(-1, N * K) # [K**N, N*K] + + def summarize_for_tensorboard(self, log_writer, prefix, curr_iter): + log_writer.add_histogram(prefix + "/latent/p_z_x", self.p_dist.probs, curr_iter) + log_writer.add_histogram(prefix + "/latent/q_z_xy", self.q_dist.probs, curr_iter) + log_writer.add_histogram(prefix + "/latent/p_z_x_logits", self.p_dist.logits, curr_iter) + log_writer.add_histogram(prefix + "/latent/q_z_xy_logits", self.q_dist.logits, curr_iter) + if self.z_dim <= 9: + for i in range(self.N): + for j in range(self.K): + log_writer.add_histogram( + prefix + "/latent/q_z_xy_logit{0}{1}".format(i, j), self.q_dist.logits[:, i, j], curr_iter + ) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/gmm2d.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/gmm2d.py new file mode 100644 index 000000000..999c0a303 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/gmm2d.py @@ -0,0 +1,187 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.distributions as td +import numpy as np +from ..model_utils import to_one_hot + + +class GMM2D(td.Distribution): + r""" + Gaussian Mixture Model using 2D Multivariate Gaussians each of as N components: + Cholesky decompesition and affine transformation for sampling: + + .. math:: Z \sim N(0, I) + + .. math:: S = \mu + LZ + + .. math:: S \sim N(\mu, \Sigma) \rightarrow N(\mu, LL^T) + + where :math:`L = chol(\Sigma)` and + + .. 
math:: \Sigma = \left[ {\begin{array}{cc} \sigma^2_x & \rho \sigma_x \sigma_y \\ \rho \sigma_x \sigma_y & \sigma^2_y \\ \end{array} } \right] + + such that + + .. math:: L = chol(\Sigma) = \left[ {\begin{array}{cc} \sigma_x & 0 \\ \rho \sigma_y & \sigma_y \sqrt{1-\rho^2} \\ \end{array} } \right] + + :param log_pis: Log Mixing Proportions :math:`log(\pi)`. [..., N] + :param mus: Mixture Components mean :math:`\mu`. [..., N * 2] + :param log_sigmas: Log Standard Deviations :math:`log(\sigma_d)`. [..., N * 2] + :param corrs: Cholesky factor of correlation :math:`\rho`. [..., N] + :param clip_lo: Clips the lower end of the standard deviation. + :param clip_hi: Clips the upper end of the standard deviation. + """ + + def __init__(self, log_pis, mus, log_sigmas, corrs): + super(GMM2D, self).__init__(batch_shape=log_pis.shape[0], event_shape=log_pis.shape[1:]) + self.components = log_pis.shape[-1] + self.dimensions = 2 + self.device = log_pis.device + + log_pis = torch.clamp(log_pis, min=-1e5) + self.log_pis = log_pis - torch.logsumexp(log_pis, dim=-1, keepdim=True) # [..., N] + self.mus = self.reshape_to_components(mus) # [..., N, 2] + self.log_sigmas = self.reshape_to_components(log_sigmas) # [..., N, 2] + self.sigmas = torch.exp(self.log_sigmas) # [..., N, 2] + self.one_minus_rho2 = 1 - corrs**2 # [..., N] + self.one_minus_rho2 = torch.clamp(self.one_minus_rho2, min=1e-5, max=1) # otherwise log can be nan + self.corrs = corrs # [..., N] + + self.L = torch.stack( + [ + torch.stack([self.sigmas[..., 0], torch.zeros_like(self.log_pis)], dim=-1), + torch.stack( + [self.sigmas[..., 1] * self.corrs, self.sigmas[..., 1] * torch.sqrt(self.one_minus_rho2)], dim=-1 + ), + ], + dim=-2, + ) + + self.pis_cat_dist = td.Categorical(logits=log_pis) + + @classmethod + def from_log_pis_mus_cov_mats(cls, log_pis, mus, cov_mats): + corrs_sigma12 = cov_mats[..., 0, 1] + sigma_1 = torch.clamp(cov_mats[..., 0, 0], min=1e-8) + sigma_2 = torch.clamp(cov_mats[..., 1, 1], min=1e-8) + sigmas = torch.stack([torch.sqrt(sigma_1), torch.sqrt(sigma_2)], dim=-1) + log_sigmas = torch.log(sigmas) + corrs = corrs_sigma12 / (torch.prod(sigmas, dim=-1)) + return cls(log_pis, mus, log_sigmas, corrs) + + def rsample(self, sample_shape=torch.Size()): + """ + Generates a sample_shape shaped reparameterized sample or sample_shape + shaped batch of reparameterized samples if the distribution parameters + are batched. + + :param sample_shape: Shape of the samples + :return: Samples from the GMM. + """ + mvn_samples = self.mus + torch.squeeze( + torch.matmul( + self.L, torch.unsqueeze(torch.randn(size=sample_shape + self.mus.shape, device=self.device), dim=-1) + ), + dim=-1, + ) + component_cat_samples = self.pis_cat_dist.sample(sample_shape) + selector = torch.unsqueeze(to_one_hot(component_cat_samples, self.components), dim=-1) + return torch.sum(mvn_samples * selector, dim=-2) + + def log_prob(self, value): + r""" + Calculates the log probability of a value using the PDF for bivariate normal distributions: + + .. math:: + f(x | \mu, \sigma, \rho)={\frac {1}{2\pi \sigma _{x}\sigma _{y}{\sqrt {1-\rho ^{2}}}}}\exp + \left(-{\frac {1}{2(1-\rho ^{2})}}\left[{\frac {(x-\mu _{x})^{2}}{\sigma _{x}^{2}}}+ + {\frac {(y-\mu _{y})^{2}}{\sigma _{y}^{2}}}-{\frac {2\rho (x-\mu _{x})(y-\mu _{y})} + {\sigma _{x}\sigma _{y}}}\right]\right) + + :param value: The log probability density function is evaluated at those values. 
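A quick numeric cross-check of the bivariate-normal density quoted in this docstring: the sketch below evaluates the closed form against the equivalent covariance-matrix form, using illustrative values for the mean, standard deviations, and correlation (plain NumPy, not the GMM2D API itself).

```python
import numpy as np

# Hypothetical single-component parameters and an evaluation point.
mu = np.array([0.0, 0.0])
sigma_x, sigma_y, rho = 1.0, 2.0, 0.5
x = np.array([1.0, 1.0])

# Closed-form bivariate normal log-density, as in the docstring formula.
dx, dy = x - mu
quad = (dx / sigma_x) ** 2 + (dy / sigma_y) ** 2 - 2 * rho * dx * dy / (sigma_x * sigma_y)
log_p = -np.log(2 * np.pi * sigma_x * sigma_y * np.sqrt(1 - rho ** 2)) - quad / (2 * (1 - rho ** 2))

# Same density via the covariance matrix, as a cross-check.
cov = np.array([[sigma_x ** 2, rho * sigma_x * sigma_y],
                [rho * sigma_x * sigma_y, sigma_y ** 2]])
log_p_ref = (-np.log(2 * np.pi) - 0.5 * np.log(np.linalg.det(cov))
             - 0.5 * (x - mu) @ np.linalg.inv(cov) @ (x - mu))
print(np.isclose(log_p, log_p_ref))  # True
```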
+ :return: Log probability + """ + # x: [..., 2] + value = torch.unsqueeze(value, dim=-2) # [..., 1, 2] + dx = value - self.mus # [..., N, 2] + + exp_nominator = torch.sum( + (dx / self.sigmas) ** 2, dim=-1 + ) - 2 * self.corrs * torch.prod( # first and second term of exp nominator + dx, dim=-1 + ) / torch.prod( + self.sigmas, dim=-1 + ) # [..., N] + + component_log_p = ( + -( + 2 * np.log(2 * np.pi) + + torch.log(self.one_minus_rho2) + + 2 * torch.sum(self.log_sigmas, dim=-1) + + exp_nominator / self.one_minus_rho2 + ) + / 2 + ) + + return torch.logsumexp(self.log_pis + component_log_p, dim=-1) + + def get_for_node_at_time(self, n, t): + return self.__class__( + self.log_pis[:, n : n + 1, t : t + 1], + self.mus[:, n : n + 1, t : t + 1], + self.log_sigmas[:, n : n + 1, t : t + 1], + self.corrs[:, n : n + 1, t : t + 1], + ) + + def mode(self): + """ + Calculates the mode of the GMM by calculating probabilities of a 2D mesh grid + + :param required_accuracy: Accuracy of the meshgrid + :return: Mode of the GMM + """ + if self.mus.shape[-2] > 1: + samp, bs, time, comp, _ = self.mus.shape + assert samp == 1, "For taking the mode only one sample makes sense." + mode_node_list = [] + for n in range(bs): + mode_t_list = [] + for t in range(time): + nt_gmm = self.get_for_node_at_time(n, t) + x_min = self.mus[:, n, t, :, 0].min() + x_max = self.mus[:, n, t, :, 0].max() + y_min = self.mus[:, n, t, :, 1].min() + y_max = self.mus[:, n, t, :, 1].max() + search_grid = ( + torch.stack( + torch.meshgrid([torch.arange(x_min, x_max, 0.01), torch.arange(y_min, y_max, 0.01)]), dim=2 + ) + .view(-1, 2) + .float() + .to(self.device) + ) + + ll_score = nt_gmm.log_prob(search_grid) + argmax = torch.argmax(ll_score.squeeze(), dim=0) + mode_t_list.append(search_grid[argmax]) + mode_node_list.append(torch.stack(mode_t_list, dim=0)) + return torch.stack(mode_node_list, dim=0).unsqueeze(dim=0) + return torch.squeeze(self.mus, dim=-2) + + def reshape_to_components(self, tensor): + if len(tensor.shape) == 5: + return tensor + return torch.reshape(tensor, list(tensor.shape[:-1]) + [self.components, self.dimensions]) + + def get_covariance_matrix(self): + cov = self.corrs * torch.prod(self.sigmas, dim=-1) + E = torch.stack( + [ + torch.stack([self.sigmas[..., 0] ** 2, cov], dim=-1), + torch.stack([cov, self.sigmas[..., 1] ** 2], dim=-1), + ], + dim=-2, + ) + return E diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/graph_attention.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/graph_attention.py new file mode 100644 index 000000000..6c9516753 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/graph_attention.py @@ -0,0 +1,61 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import warnings +import math +import numbers +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import init, Parameter + + +class GraphMultiTypeAttention(nn.Module): + def __init__(self, in_features, hidden_features, out_features, bias=True, types=1): + super(GraphMultiTypeAttention, self).__init__() + self.types = types + self.in_features = in_features + self.out_features = out_features + self.node_self_loop_weight = Parameter(torch.Tensor(hidden_features, in_features[0])) + + self.weight_per_type = nn.ParameterList() + for i in range(types): + self.weight_per_type.append(Parameter(torch.Tensor(hidden_features, in_features[i]))) + if bias: + self.bias 
= Parameter(torch.Tensor(hidden_features)) + else: + self.register_parameter("bias", None) + + self.linear_to_out = nn.Linear(hidden_features, out_features, bias=bias) + + self.reset_parameters() + + def reset_parameters(self): + for weight in self.weight_per_type: + bound = 1 / math.sqrt(weight.size(1)) + init.uniform_(weight, -bound, bound) + bound = 1 / math.sqrt(self.node_self_loop_weight.size(1)) + init.uniform_(self.node_self_loop_weight, -bound, bound) + if self.bias is not None: + init.uniform_(self.bias, -bound, bound) + + def forward(self, inputs, types, edge_weights): + weight_list = list() + for i, type in enumerate(types): + weight_list.append((edge_weights[i] / len(edge_weights)) * self.weight_per_type[type].T) + weight_list.append(self.node_self_loop_weight.T) + weight = torch.cat(weight_list, dim=0) + stacked_input = torch.cat(inputs, dim=-1) + output = stacked_input.matmul(weight) + + output = output + + if self.bias is not None: + output += self.bias + + return torch.relu(self.linear_to_out(torch.relu(output))) + + def extra_repr(self): + return "in_features={}, hidden_features={},, out_features={}, types={}, bias={}".format( + self.in_features, self.hidden_features, self.out_features, self.types, self.bias is not None + ) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/map_encoder.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/map_encoder.py new file mode 100644 index 000000000..369be7db4 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/map_encoder.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class CNNMapEncoder(nn.Module): + def __init__(self, map_channels, hidden_channels, output_size, masks, strides, patch_size): + super(CNNMapEncoder, self).__init__() + self.convs = nn.ModuleList() + patch_size_x = patch_size[0] + patch_size[2] + patch_size_y = patch_size[1] + patch_size[3] + input_size = (map_channels, patch_size_x, patch_size_y) + x_dummy = torch.ones(input_size).unsqueeze(0) * torch.tensor(float("nan")) + + for i, hidden_size in enumerate(hidden_channels): + self.convs.append( + nn.Conv2d( + map_channels if i == 0 else hidden_channels[i - 1], hidden_channels[i], masks[i], stride=strides[i] + ) + ) + x_dummy = self.convs[i](x_dummy) + + self.fc = nn.Linear(x_dummy.numel(), output_size) + + def forward(self, x, training): + for conv in self.convs: + x = F.leaky_relu(conv(x), 0.2) + x = torch.flatten(x, start_dim=1) + x = self.fc(x) + return x diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/__init__.py new file mode 100644 index 000000000..e0d2ee4c0 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/__init__.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from .dataset import EnvironmentDataset, NodeTypeDataset +from .preprocessing import collate, get_node_timestep_data, get_timesteps_data, restore, get_relative_robot_traj diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/dataset.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/dataset.py new file mode 100644 index 000000000..4769eae88 --- /dev/null +++ 
b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/dataset.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from torch.utils import data +import numpy as np +from .preprocessing import get_node_timestep_data + + +class EnvironmentDataset(object): + def __init__(self, env, state, pred_state, node_freq_mult, scene_freq_mult, hyperparams, **kwargs): + self.env = env + self.state = state + self.pred_state = pred_state + self.hyperparams = hyperparams + self.max_ht = self.hyperparams["maximum_history_length"] + self.max_ft = kwargs["min_future_timesteps"] + self.node_type_datasets = list() + self._augment = False + for node_type in env.NodeType: + if node_type not in hyperparams["pred_state"]: + continue + self.node_type_datasets.append( + NodeTypeDataset( + env, node_type, state, pred_state, node_freq_mult, scene_freq_mult, hyperparams, **kwargs + ) + ) + + @property + def augment(self): + return self._augment + + @augment.setter + def augment(self, value): + self._augment = value + for node_type_dataset in self.node_type_datasets: + node_type_dataset.augment = value + + def __iter__(self): + return iter(self.node_type_datasets) + + +class NodeTypeDataset(data.Dataset): + def __init__( + self, env, node_type, state, pred_state, node_freq_mult, scene_freq_mult, hyperparams, augment=False, **kwargs + ): + self.env = env + self.state = state + self.pred_state = pred_state + self.hyperparams = hyperparams + self.max_ht = self.hyperparams["maximum_history_length"] + self.max_ft = kwargs["min_future_timesteps"] + + self.augment = augment + + self.node_type = node_type + self.index = self.index_env(node_freq_mult, scene_freq_mult, **kwargs) + self.len = len(self.index) + self.edge_types = [edge_type for edge_type in env.get_edge_types() if edge_type[0] is node_type] + + def index_env(self, node_freq_mult, scene_freq_mult, **kwargs): + index = list() + for scene in self.env.scenes: + present_node_dict = scene.present_nodes(np.arange(0, scene.timesteps), type=self.node_type, **kwargs) + for t, nodes in present_node_dict.items(): + for node in nodes: + index += ( + [(scene, t, node)] + * (scene.frequency_multiplier if scene_freq_mult else 1) + * (node.frequency_multiplier if node_freq_mult else 1) + ) + + return index + + def __len__(self): + return self.len + + def __getitem__(self, i): + (scene, t, node) = self.index[i] + + if self.augment: + scene = scene.augment() + node = scene.get_node_by_id(node.id) + + return get_node_timestep_data( + self.env, + scene, + t, + node, + self.state, + self.pred_state, + self.edge_types, + self.max_ht, + self.max_ft, + self.hyperparams, + ) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/homography_warper.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/homography_warper.py new file mode 100644 index 000000000..5cc3a5f4d --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/homography_warper.py @@ -0,0 +1,467 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple, Optional + + +pi = torch.tensor(3.14159265358979323846) + + +def deg2rad(tensor: torch.Tensor) -> torch.Tensor: + r"""Function that converts angles from degrees to radians. + Args: + tensor (torch.Tensor): Tensor of arbitrary shape. 
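As a small aside, the degree-to-radian conversion and the 2x2 rotation construction used by angle_to_rotation_matrix can be sanity-checked with a 90 degree angle; the snippet below is a standalone sketch with an illustrative input, not a call into this module.

```python
import math
import torch

# 90 degrees -> pi/2 radians -> rotation matrix ~ [[0, 1], [-1, 0]],
# stacked in the same [cos, sin, -sin, cos] order as angle_to_rotation_matrix.
angle = torch.tensor([90.0])
ang_rad = angle * math.pi / 180.0
cos_a, sin_a = torch.cos(ang_rad), torch.sin(ang_rad)
rot = torch.stack([cos_a, sin_a, -sin_a, cos_a], dim=-1).view(1, 2, 2)
print(rot)
```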
+ Returns: + torch.Tensor: tensor with same shape as input. + """ + if not isinstance(tensor, torch.Tensor): + raise TypeError("Input type is not a torch.Tensor. Got {}".format(type(tensor))) + + return tensor * pi.to(tensor.device).type(tensor.dtype) / 180.0 + + +def angle_to_rotation_matrix(angle: torch.Tensor) -> torch.Tensor: + """ + Creates a rotation matrix out of angles in degrees + Args: + angle: (torch.Tensor): tensor of angles in degrees, any shape. + Returns: + torch.Tensor: tensor of *x2x2 rotation matrices. + Shape: + - Input: :math:`(*)` + - Output: :math:`(*, 2, 2)` + Example: + >>> input = torch.rand(1, 3) # Nx3 + >>> output = kornia.angle_to_rotation_matrix(input) # Nx3x2x2 + """ + ang_rad = deg2rad(angle) + cos_a: torch.Tensor = torch.cos(ang_rad) + sin_a: torch.Tensor = torch.sin(ang_rad) + return torch.stack([cos_a, sin_a, -sin_a, cos_a], dim=-1).view(*angle.shape, 2, 2) + + +def get_rotation_matrix2d(center: torch.Tensor, angle: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: + r"""Calculates an affine matrix of 2D rotation. + The function calculates the following matrix: + .. math:: + \begin{bmatrix} + \alpha & \beta & (1 - \alpha) \cdot \text{x} + - \beta \cdot \text{y} \\ + -\beta & \alpha & \beta \cdot \text{x} + + (1 - \alpha) \cdot \text{y} + \end{bmatrix} + where + .. math:: + \alpha = \text{scale} \cdot cos(\text{radian}) \\ + \beta = \text{scale} \cdot sin(\text{radian}) + The transformation maps the rotation center to itself + If this is not the target, adjust the shift. + Args: + center (Tensor): center of the rotation in the source image. + angle (Tensor): rotation radian in degrees. Positive values mean + counter-clockwise rotation (the coordinate origin is assumed to + be the top-left corner). + scale (Tensor): isotropic scale factor. + Returns: + Tensor: the affine matrix of 2D rotation. + Shape: + - Input: :math:`(B, 2)`, :math:`(B)` and :math:`(B)` + - Output: :math:`(B, 2, 3)` + Example: + >>> center = torch.zeros(1, 2) + >>> scale = torch.ones(1) + >>> radian = 45. * torch.ones(1) + >>> M = kornia.get_rotation_matrix2d(center, radian, scale) + tensor([[[ 0.7071, 0.7071, 0.0000], + [-0.7071, 0.7071, 0.0000]]]) + """ + if not torch.is_tensor(center): + raise TypeError("Input center type is not a torch.Tensor. Got {}".format(type(center))) + if not torch.is_tensor(angle): + raise TypeError("Input radian type is not a torch.Tensor. Got {}".format(type(angle))) + if not torch.is_tensor(scale): + raise TypeError("Input scale type is not a torch.Tensor. Got {}".format(type(scale))) + if not (len(center.shape) == 2 and center.shape[1] == 2): + raise ValueError("Input center must be a Bx2 tensor. Got {}".format(center.shape)) + if not len(angle.shape) == 1: + raise ValueError("Input radian must be a B tensor. Got {}".format(angle.shape)) + if not len(scale.shape) == 1: + raise ValueError("Input scale must be a B tensor. Got {}".format(scale.shape)) + if not (center.shape[0] == angle.shape[0] == scale.shape[0]): + raise ValueError( + "Inputs must have same batch size dimension. 
Got {}".format(center.shape, angle.shape, scale.shape) + ) + # convert radian and apply scale + scaled_rotation: torch.Tensor = angle_to_rotation_matrix(angle) * scale.view(-1, 1, 1) + alpha: torch.Tensor = scaled_rotation[:, 0, 0] + beta: torch.Tensor = scaled_rotation[:, 0, 1] + + # unpack the center to x, y coordinates + x: torch.Tensor = center[..., 0] + y: torch.Tensor = center[..., 1] + + # create output tensor + batch_size: int = center.shape[0] + M: torch.Tensor = torch.zeros(batch_size, 2, 3, device=center.device, dtype=center.dtype) + M[..., 0:2, 0:2] = scaled_rotation + M[..., 0, 2] = (torch.tensor(1.0) - alpha) * x - beta * y + M[..., 1, 2] = beta * x + (torch.tensor(1.0) - alpha) * y + return M + + +def convert_points_to_homogeneous(points: torch.Tensor) -> torch.Tensor: + r"""Function that converts points from Euclidean to homogeneous space. + Examples:: + >>> input = torch.rand(2, 4, 3) # BxNx3 + >>> output = kornia.convert_points_to_homogeneous(input) # BxNx4 + """ + if not isinstance(points, torch.Tensor): + raise TypeError("Input type is not a torch.Tensor. Got {}".format(type(points))) + if len(points.shape) < 2: + raise ValueError("Input must be at least a 2D tensor. Got {}".format(points.shape)) + + return torch.nn.functional.pad(points, [0, 1], "constant", 1.0) + + +def convert_points_from_homogeneous(points: torch.Tensor, eps: float = 1e-8) -> torch.Tensor: + r"""Function that converts points from homogeneous to Euclidean space. + Examples:: + >>> input = torch.rand(2, 4, 3) # BxNx3 + >>> output = kornia.convert_points_from_homogeneous(input) # BxNx2 + """ + if not isinstance(points, torch.Tensor): + raise TypeError("Input type is not a torch.Tensor. Got {}".format(type(points))) + + if len(points.shape) < 2: + raise ValueError("Input must be at least a 2D tensor. Got {}".format(points.shape)) + + # we check for points at infinity + z_vec: torch.Tensor = points[..., -1:] + + # set the results of division by zeror/near-zero to 1.0 + # follow the convention of opencv: + # https://github.com/opencv/opencv/pull/14411/files + mask: torch.Tensor = torch.abs(z_vec) > eps + scale: torch.Tensor = torch.ones_like(z_vec).masked_scatter_( + mask, torch.tensor(1.0).to(points.device) / z_vec[mask] + ) + + return scale * points[..., :-1] + + +def transform_points(trans_01: torch.Tensor, points_1: torch.Tensor) -> torch.Tensor: + r"""Function that applies transformations to a set of points. + Args: + trans_01 (torch.Tensor): tensor for transformations of shape + :math:`(B, D+1, D+1)`. + points_1 (torch.Tensor): tensor of points of shape :math:`(B, N, D)`. + Returns: + torch.Tensor: tensor of N-dimensional points. 
+ Shape: + - Output: :math:`(B, N, D)` + Examples: + >>> points_1 = torch.rand(2, 4, 3) # BxNx3 + >>> trans_01 = torch.eye(4).view(1, 4, 4) # Bx4x4 + >>> points_0 = kornia.transform_points(trans_01, points_1) # BxNx3 + """ + if not torch.is_tensor(trans_01) or not torch.is_tensor(points_1): + raise TypeError("Input type is not a torch.Tensor") + if not trans_01.device == points_1.device: + raise TypeError("Tensor must be in the same device") + if not trans_01.shape[0] == points_1.shape[0] and trans_01.shape[0] != 1: + raise ValueError("Input batch size must be the same for both tensors or 1") + if not trans_01.shape[-1] == (points_1.shape[-1] + 1): + raise ValueError("Last input dimensions must differe by one unit") + # to homogeneous + points_1_h = convert_points_to_homogeneous(points_1) # BxNxD+1 + # transform coordinates + points_0_h = torch.matmul(trans_01.unsqueeze(1), points_1_h.unsqueeze(-1)) + points_0_h = torch.squeeze(points_0_h, dim=-1) + # to euclidean + points_0 = convert_points_from_homogeneous(points_0_h) # BxNxD + return points_0 + + +def multi_linspace(a, b, num, endpoint=True, device="cpu", dtype=torch.float): + """This function is just like np.linspace, but will create linearly + spaced vectors from a start to end vector. + Inputs: + a - Start vector. + b - End vector. + num - Number of samples to generate. Default is 50. Must be above 0. + endpoint - If True, b is the last sample. + Otherwise, it is not included. Default is True. + """ + + return a[..., None] + (b - a)[..., None] / (num - endpoint) * torch.arange(num, device=device, dtype=dtype) + + +def create_batched_meshgrid( + x_min: torch.Tensor, + y_min: torch.Tensor, + x_max: torch.Tensor, + y_max: torch.Tensor, + height: int, + width: int, + device: Optional[torch.device] = torch.device("cpu"), +) -> torch.Tensor: + """Generates a coordinate grid for an image. + When the flag `normalized_coordinates` is set to True, the grid is + normalized to be in the range [-1,1] to be consistent with the pytorch + function grid_sample. + http://pytorch.org/docs/master/nn.html#torch.nn.functional.grid_sample + Args: + height (int): the image height (rows). + width (int): the image width (cols). + normalized_coordinates (Optional[bool]): whether to normalize + coordinates in the range [-1, 1] in order to be consistent with the + PyTorch function grid_sample. + Return: + torch.Tensor: returns a grid tensor with shape :math:`(1, H, W, 2)`. + """ + # generate coordinates + xs = multi_linspace(x_min, x_max, width, device=device, dtype=torch.float) + ys = multi_linspace(y_min, y_max, height, device=device, dtype=torch.float) + + # generate grid by stacking coordinates + bs = x_min.shape[0] + batched_grid_i_list = list() + for i in range(bs): + batched_grid_i_list.append(torch.stack(torch.meshgrid([xs[i], ys[i]])).transpose(1, 2)) # 2xHxW + batched_grid: torch.Tensor = torch.stack(batched_grid_i_list, dim=0) + return batched_grid.permute(0, 2, 3, 1) # BxHxWx2 + + +def homography_warp( + patch_src: torch.Tensor, + centers: torch.Tensor, + dst_homo_src: torch.Tensor, + dsize: Tuple[int, int], + mode: str = "bilinear", + padding_mode: str = "zeros", +) -> torch.Tensor: + r"""Function that warps image patchs or tensors by homographies. + See :class:`~kornia.geometry.warp.HomographyWarper` for details. + Args: + patch_src (torch.Tensor): The image or tensor to warp. Should be from + source of shape :math:`(N, C, H, W)`. 
+ dst_homo_src (torch.Tensor): The homography or stack of homographies + from source to destination of shape + :math:`(N, 3, 3)`. + dsize (Tuple[int, int]): The height and width of the image to warp. + mode (str): interpolation mode to calculate output values + 'bilinear' | 'nearest'. Default: 'bilinear'. + padding_mode (str): padding mode for outside grid values + 'zeros' | 'border' | 'reflection'. Default: 'zeros'. + Return: + torch.Tensor: Patch sampled at locations from source to destination. + Example: + >>> input = torch.rand(1, 3, 32, 32) + >>> homography = torch.eye(3).view(1, 3, 3) + >>> output = kornia.homography_warp(input, homography, (32, 32)) + """ + + out_height, out_width = dsize + image_height, image_width = patch_src.shape[-2:] + x_min = 2.0 * (centers[..., 0] - out_width / 2) / image_width - 1.0 + y_min = 2.0 * (centers[..., 1] - out_height / 2) / image_height - 1.0 + x_max = 2.0 * (centers[..., 0] + out_width / 2) / image_width - 1.0 + y_max = 2.0 * (centers[..., 1] + out_height / 2) / image_height - 1.0 + warper = HomographyWarper(x_min, y_min, x_max, y_max, out_height, out_width, mode, padding_mode) + return warper(patch_src, dst_homo_src) + + +def normal_transform_pixel(height, width): + + tr_mat = torch.Tensor([[1.0, 0.0, -1.0], [0.0, 1.0, -1.0], [0.0, 0.0, 1.0]]) # 1x3x3 + + tr_mat[0, 0] = tr_mat[0, 0] * 2.0 / (width - 1.0) + tr_mat[1, 1] = tr_mat[1, 1] * 2.0 / (height - 1.0) + + tr_mat = tr_mat.unsqueeze(0) + + return tr_mat + + +def src_norm_to_dst_norm( + dst_pix_trans_src_pix: torch.Tensor, dsize_src: Tuple[int, int], dsize_dst: Tuple[int, int] +) -> torch.Tensor: + # source and destination sizes + src_h, src_w = dsize_src + dst_h, dst_w = dsize_dst + # the devices and types + device: torch.device = dst_pix_trans_src_pix.device + dtype: torch.dtype = dst_pix_trans_src_pix.dtype + # compute the transformation pixel/norm for src/dst + src_norm_trans_src_pix: torch.Tensor = normal_transform_pixel(src_h, src_w).to(device, dtype) + src_pix_trans_src_norm = torch.inverse(src_norm_trans_src_pix) + dst_norm_trans_dst_pix: torch.Tensor = normal_transform_pixel(dst_h, dst_w).to(device, dtype) + # compute chain transformations + dst_norm_trans_src_norm: torch.Tensor = dst_norm_trans_dst_pix @ (dst_pix_trans_src_pix @ src_pix_trans_src_norm) + return dst_norm_trans_src_norm + + +def transform_warp_impl( + src: torch.Tensor, + centers: torch.Tensor, + dst_pix_trans_src_pix: torch.Tensor, + dsize_src: Tuple[int, int], + dsize_dst: Tuple[int, int], + grid_mode: str, + padding_mode: str, +) -> torch.Tensor: + """Compute the transform in normalized cooridnates and perform the warping.""" + dst_norm_trans_src_norm: torch.Tensor = src_norm_to_dst_norm(dst_pix_trans_src_pix, dsize_src, dsize_src) + + src_norm_trans_dst_norm = torch.inverse(dst_norm_trans_src_norm) + return homography_warp(src, centers, src_norm_trans_dst_norm, dsize_dst, grid_mode, padding_mode) + + +class HomographyWarper(nn.Module): + r"""Warps image patches or tensors by homographies. + .. math:: + X_{dst} = H_{src}^{\{dst\}} * X_{src} + Args: + height (int): The height of the image to warp. + width (int): The width of the image to warp. + mode (str): interpolation mode to calculate output values + 'bilinear' | 'nearest'. Default: 'bilinear'. + padding_mode (str): padding mode for outside grid values + 'zeros' | 'border' | 'reflection'. Default: 'zeros'. 
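Conceptually, the warper builds a normalized sampling grid, pushes it through the homography, and hands the result to grid_sample. The sketch below walks through that pipeline with an identity homography on a tiny patch; it assumes a recent PyTorch (torch.meshgrid with indexing=...), uses illustrative shapes, and is not the exact code path of this class.

```python
import torch
import torch.nn.functional as F

src = torch.arange(16.0).view(1, 1, 4, 4)     # hypothetical 1x1x4x4 patch
H = torch.eye(3).view(1, 3, 3)                # identity homography

# Normalized [-1, 1] sampling grid in (x, y) order, as grid_sample expects.
ys, xs = torch.meshgrid(torch.linspace(-1, 1, 4), torch.linspace(-1, 1, 4), indexing="ij")
grid = torch.stack([xs, ys], dim=-1).unsqueeze(0)             # (1, H, W, 2)

# Warp the grid with the homography (homogeneous coords), then sample.
grid_h = F.pad(grid, [0, 1], "constant", 1.0)                 # (1, H, W, 3)
warped = (H.view(1, 1, 1, 3, 3) @ grid_h.unsqueeze(-1)).squeeze(-1)
warped = warped[..., :2] / warped[..., 2:]
out = F.grid_sample(src, warped, mode="bilinear", padding_mode="zeros", align_corners=True)
print(torch.allclose(out, src))  # True: identity homography leaves the patch unchanged
```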
+ """ + + def __init__( + self, + x_min: torch.Tensor, + y_min: torch.Tensor, + x_max: torch.Tensor, + y_max: torch.Tensor, + height: int, + width: int, + mode: str = "bilinear", + padding_mode: str = "zeros", + ) -> None: + super(HomographyWarper, self).__init__() + self.width: int = width + self.height: int = height + self.mode: str = mode + self.padding_mode: str = padding_mode + + # create base grid to compute the flow + self.grid: torch.Tensor = create_batched_meshgrid(x_min, y_min, x_max, y_max, height, width) + + def warp_grid(self, dst_homo_src: torch.Tensor) -> torch.Tensor: + r"""Computes the grid to warp the coordinates grid by an homography. + Args: + dst_homo_src (torch.Tensor): Homography or homographies (stacked) to + transform all points in the grid. Shape of the + homography has to be :math:`(N, 3, 3)`. + Returns: + torch.Tensor: the transformed grid of shape :math:`(N, H, W, 2)`. + """ + batch_size: int = dst_homo_src.shape[0] + device: torch.device = dst_homo_src.device + dtype: torch.dtype = dst_homo_src.dtype + # expand grid to match the input batch size + grid: torch.Tensor = self.grid + if len(dst_homo_src.shape) == 3: # local homography case + dst_homo_src = dst_homo_src.view(batch_size, 1, 3, 3) # NxHxWx3x3 + # perform the actual grid transformation, + # the grid is copied to input device and casted to the same type + flow: torch.Tensor = transform_points(dst_homo_src, grid.to(device).to(dtype)) # NxHxWx2 + return flow.view(batch_size, self.height, self.width, 2) # NxHxWx2 + + def forward(self, patch_src: torch.Tensor, dst_homo_src: torch.Tensor) -> torch.Tensor: # type: ignore + r"""Warps an image or tensor from source into reference frame. + Args: + patch_src (torch.Tensor): The image or tensor to warp. + Should be from source. + dst_homo_src (torch.Tensor): The homography or stack of homographies + from source to destination. The homography assumes normalized + coordinates [-1, 1]. + Return: + torch.Tensor: Patch sampled at locations from source to destination. + Shape: + - Input: :math:`(N, C, H, W)` and :math:`(N, 3, 3)` + - Output: :math:`(N, C, H, W)` + Example: + >>> input = torch.rand(1, 3, 32, 32) + >>> homography = torch.eye(3).view(1, 3, 3) + >>> warper = kornia.HomographyWarper(32, 32) + >>> output = warper(input, homography) # NxCxHxW + """ + if not dst_homo_src.device == patch_src.device: + raise TypeError( + "Patch and homography must be on the same device. \ + Got patch.device: {} dst_H_src.device: {}.".format( + patch_src.device, dst_homo_src.device + ) + ) + + return F.grid_sample( + patch_src, + self.warp_grid(dst_homo_src), # type: ignore + mode=self.mode, + padding_mode=self.padding_mode, + align_corners=True, + ) + + +def warp_affine_crop( + src: torch.Tensor, + centers: torch.Tensor, + M: torch.Tensor, + dsize: Tuple[int, int], + flags: str = "bilinear", + padding_mode: str = "zeros", +) -> torch.Tensor: + r"""Applies an affine transformation to a tensor. + + The function warp_affine transforms the source tensor using + the specified matrix: + + .. math:: + \text{dst}(x, y) = \text{src} \left( M_{11} x + M_{12} y + M_{13} , + M_{21} x + M_{22} y + M_{23} \right ) + + Args: + src (torch.Tensor): input tensor of shape :math:`(B, C, H, W)`. + M (torch.Tensor): affine transformation of shape :math:`(B, 2, 3)`. + dsize (Tuple[int, int]): size of the output image (height, width). + mode (str): interpolation mode to calculate output values + 'bilinear' | 'nearest'. Default: 'bilinear'. 
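The forward pass above ultimately reduces to F.grid_sample over a warped grid. As a sanity sketch (shapes and values are illustrative, not taken from the patch), sampling with an identity grid in normalized coordinates returns the original patch:

import torch
import torch.nn.functional as F

patch = torch.rand(1, 3, 8, 8)
ys, xs = torch.meshgrid(torch.linspace(-1, 1, 8), torch.linspace(-1, 1, 8), indexing="ij")
grid = torch.stack((xs, ys), dim=-1).unsqueeze(0)  # 1xHxWx2, (x, y) ordering as grid_sample expects
warped = F.grid_sample(patch, grid, mode="bilinear", padding_mode="zeros", align_corners=True)
assert torch.allclose(warped, patch, atol=1e-5)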
+ padding_mode (str): padding mode for outside grid values + 'zeros' | 'border' | 'reflection'. Default: 'zeros'. + + Returns: + torch.Tensor: the warped tensor. + + Shape: + - Output: :math:`(B, C, H, W)` + + .. note:: + See a working example `here `__. + """ + if not torch.is_tensor(src): + raise TypeError("Input src type is not a torch.Tensor. Got {}".format(type(src))) + + if not torch.is_tensor(M): + raise TypeError("Input M type is not a torch.Tensor. Got {}".format(type(M))) + + if not len(src.shape) == 4: + raise ValueError("Input src must be a BxCxHxW tensor. Got {}".format(src.shape)) + + if not (len(M.shape) == 3 or M.shape[-2:] == (2, 3)): + raise ValueError("Input M must be a Bx2x3 tensor. Got {}".format(src.shape)) + + # we generate a 3x3 transformation matrix from 2x3 affine + M_3x3: torch.Tensor = F.pad(M, [0, 0, 0, 1, 0, 0], mode="constant", value=0) + M_3x3[:, 2, 2] += 1.0 + + # launches the warper + h, w = src.shape[-2:] + return transform_warp_impl(src, centers, M_3x3, (h, w), dsize, flags, padding_mode) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/preprocessing.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/preprocessing.py new file mode 100644 index 000000000..21d42d9b6 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/preprocessing.py @@ -0,0 +1,261 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import numpy as np +import collections.abc +from torch.utils.data._utils.collate import default_collate +import dill + +container_abcs = collections.abc + + +def restore(data): + """ + In case we dilled some structures to share between multiple process this function will restore them. + If the data input are not bytes we assume it was not dilled in the first place + + :param data: Possibly dilled data structure + :return: Un-dilled data structure + """ + if type(data) is bytes: + return dill.loads(data) + return data + + +def collate(batch): + if len(batch) == 0: + return batch + elem = batch[0] + if elem is None: + return None + elif isinstance(elem, container_abcs.Sequence): + if len(elem) == 4: # We assume those are the maps, map points, headings and patch_size + scene_map, scene_pts, heading_angle, patch_size = zip(*batch) + if heading_angle[0] is None: + heading_angle = None + else: + heading_angle = torch.Tensor(heading_angle) + map = scene_map[0].get_cropped_maps_from_scene_map_batch( + scene_map, scene_pts=torch.Tensor(scene_pts), patch_size=patch_size[0], rotation=heading_angle + ) + return map + transposed = zip(*batch) + return [collate(samples) for samples in transposed] + elif isinstance(elem, container_abcs.Mapping): + # We have to dill the neighbors structures. 
Otherwise each tensor is put into + # shared memory separately -> slow, file pointer overhead + # we only do this in multiprocessing + neighbor_dict = {key: [d[key] for d in batch] for key in elem} + return dill.dumps(neighbor_dict) if torch.utils.data.get_worker_info() else neighbor_dict + return default_collate(batch) + + +def get_relative_robot_traj(env, state, node_traj, robot_traj, node_type, robot_type): + # TODO: We will have to make this more generic if robot_type != node_type + # Make Robot State relative to node + _, std = env.get_standardize_params(state[robot_type], node_type=robot_type) + std[0:2] = env.attention_radius[(node_type, robot_type)] + robot_traj_st = env.standardize(robot_traj, state[robot_type], node_type=robot_type, mean=node_traj, std=std) + robot_traj_st_t = torch.tensor(robot_traj_st, dtype=torch.float) + + return robot_traj_st_t + + +def get_node_timestep_data( + env, scene, t, node, state, pred_state, edge_types, max_ht, max_ft, hyperparams, scene_graph=None +): + """ + Pre-processes the data for a single batch element: node state over time for a specific time in a specific scene + as well as the neighbour data for it. + + :param env: Environment + :param scene: Scene + :param t: Timestep in scene + :param node: Node + :param state: Specification of the node state + :param pred_state: Specification of the prediction state + :param edge_types: List of all Edge Types for which neighbours are pre-processed + :param max_ht: Maximum history timesteps + :param max_ft: Maximum future timesteps (prediction horizon) + :param hyperparams: Model hyperparameters + :param scene_graph: If scene graph was already computed for this scene and time you can pass it here + :return: Batch Element + """ + + # Node + timestep_range_x = np.array([t - max_ht, t]) + timestep_range_y = np.array([t + 1, t + max_ft]) + + x = node.get(timestep_range_x, state[node.type]) + y = node.get(timestep_range_y, pred_state[node.type]) + first_history_index = (max_ht - node.history_points_at(t)).clip(0) + + _, std = env.get_standardize_params(state[node.type], node.type) + std[0:2] = env.attention_radius[(node.type, node.type)] + rel_state = np.zeros_like(x[0]) + rel_state[0:2] = np.array(x)[-1, 0:2] + x_st = env.standardize(x, state[node.type], node.type, mean=rel_state, std=std) + if list(pred_state[node.type].keys())[0] == "position": # If we predict position we do it relative to current pos + y_st = env.standardize(y, pred_state[node.type], node.type, mean=rel_state[0:2]) + else: + y_st = env.standardize(y, pred_state[node.type], node.type) + + x_t = torch.tensor(x, dtype=torch.float) + y_t = torch.tensor(y, dtype=torch.float) + x_st_t = torch.tensor(x_st, dtype=torch.float) + y_st_t = torch.tensor(y_st, dtype=torch.float) + + # Neighbors + neighbors_data_st = None + neighbors_edge_value = None + if hyperparams["edge_encoding"]: + # Scene Graph + scene_graph = ( + scene.get_scene_graph( + t, env.attention_radius, hyperparams["edge_addition_filter"], hyperparams["edge_removal_filter"] + ) + if scene_graph is None + else scene_graph + ) + + neighbors_data_st = dict() + neighbors_edge_value = dict() + for edge_type in edge_types: + neighbors_data_st[edge_type] = list() + # We get all nodes which are connected to the current node for the current timestep + connected_nodes = scene_graph.get_neighbors(node, edge_type[1]) + + if hyperparams["dynamic_edges"] == "yes": + # We get the edge masks for the current node at the current timestep + edge_masks = torch.tensor(scene_graph.get_edge_scaling(node), 
dtype=torch.float) + neighbors_edge_value[edge_type] = edge_masks + + for connected_node in connected_nodes: + neighbor_state_np = connected_node.get( + np.array([t - max_ht, t]), state[connected_node.type], padding=0.0 + ) + + # Make State relative to node where neighbor and node have same state + _, std = env.get_standardize_params(state[connected_node.type], node_type=connected_node.type) + std[0:2] = env.attention_radius[edge_type] + equal_dims = np.min((neighbor_state_np.shape[-1], x.shape[-1])) + rel_state = np.zeros_like(neighbor_state_np) + rel_state[:, ..., :equal_dims] = x[-1, ..., :equal_dims] + neighbor_state_np_st = env.standardize( + neighbor_state_np, + state[connected_node.type], + node_type=connected_node.type, + mean=rel_state, + std=std, + ) + + neighbor_state = torch.tensor(neighbor_state_np_st, dtype=torch.float) + neighbors_data_st[edge_type].append(neighbor_state) + + # Robot + robot_traj_st_t = None + if hyperparams["incl_robot_node"]: + timestep_range_r = np.array([t, t + max_ft]) + if scene.non_aug_scene is not None: + robot = scene.get_node_by_id(scene.non_aug_scene.robot.id) + else: + robot = scene.robot + robot_type = robot.type + robot_traj = robot.get(timestep_range_r, state[robot_type], padding=0.0) + node_state = np.zeros_like(robot_traj[0]) + node_state[: x.shape[1]] = x[-1] + robot_traj_st_t = get_relative_robot_traj(env, state, node_state, robot_traj, node.type, robot_type) + + # Map + map_tuple = None + if hyperparams["use_map_encoding"]: + if node.type in hyperparams["map_encoder"]: + if node.non_aug_node is not None: + x = node.non_aug_node.get(np.array([t]), state[node.type]) + me_hyp = hyperparams["map_encoder"][node.type] + if "heading_state_index" in me_hyp: + heading_state_index = me_hyp["heading_state_index"] + # We have to rotate the map in the opposit direction of the agent to match them + if type(heading_state_index) is list: # infer from velocity or heading vector + heading_angle = ( + -np.arctan2(x[-1, heading_state_index[1]], x[-1, heading_state_index[0]]) * 180 / np.pi + ) + else: + heading_angle = -x[-1, heading_state_index] * 180 / np.pi + else: + heading_angle = None + + scene_map = scene.map[node.type] + map_point = x[-1, :2] + + patch_size = hyperparams["map_encoder"][node.type]["patch_size"] + map_tuple = (scene_map, map_point, heading_angle, patch_size) + + return ( + first_history_index, + x_t, + y_t, + x_st_t, + y_st_t, + neighbors_data_st, + neighbors_edge_value, + robot_traj_st_t, + map_tuple, + ) + + +def get_timesteps_data( + env, scene, t, node_type, state, pred_state, edge_types, min_ht, max_ht, min_ft, max_ft, hyperparams +): + """ + Puts together the inputs for ALL nodes in a given scene and timestep in it. 
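The heading computation above turns a velocity vector into a rotation in degrees with the sign flipped so the map can be rotated back against the agent's heading; a tiny numeric sketch (the velocity values are made up):

import numpy as np

vx, vy = 1.0, 1.0  # agent moving diagonally towards +x, +y
heading_angle = -np.arctan2(vy, vx) * 180 / np.pi
# -> -45.0 degrees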
+ + :param env: Environment + :param scene: Scene + :param t: Timestep in scene + :param node_type: Node Type of nodes for which the data shall be pre-processed + :param state: Specification of the node state + :param pred_state: Specification of the prediction state + :param edge_types: List of all Edge Types for which neighbors are pre-processed + :param max_ht: Maximum history timesteps + :param max_ft: Maximum future timesteps (prediction horizon) + :param hyperparams: Model hyperparameters + :return: + """ + nodes_per_ts = scene.present_nodes( + t, + type=node_type, + min_history_timesteps=min_ht, + min_future_timesteps=max_ft, + return_robot=not hyperparams["incl_robot_node"], + ) + batch = list() + nodes = list() + out_timesteps = list() + for timestep in nodes_per_ts.keys(): + scene_graph = scene.get_scene_graph( + timestep, env.attention_radius, hyperparams["edge_addition_filter"], hyperparams["edge_removal_filter"] + ) + present_nodes = nodes_per_ts[timestep] + for node in present_nodes: + nodes.append(node) + out_timesteps.append(timestep) + batch.append( + get_node_timestep_data( + env, + scene, + timestep, + node, + state, + pred_state, + edge_types, + max_ht, + max_ft, + hyperparams, + scene_graph=scene_graph, + ) + ) + if len(out_timesteps) == 0: + return None + return collate(batch), nodes, out_timesteps diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/__init__.py new file mode 100644 index 000000000..968fc5653 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from model.dynamics.dynamic import Dynamic +from model.dynamics.single_integrator import SingleIntegrator +from model.dynamics.unicycle import Unicycle +from model.dynamics.linear import Linear diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/dynamic.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/dynamic.py new file mode 100644 index 000000000..4fd3b6b0f --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/dynamic.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + + +class Dynamic(object): + def __init__(self, dt, dyn_limits, device, model_registrar, xz_size, node_type): + self.dt = dt + self.device = device + self.dyn_limits = dyn_limits + self.initial_conditions = None + self.model_registrar = model_registrar + self.node_type = node_type + self.init_constants() + self.create_graph(xz_size) + + def set_initial_condition(self, init_con): + self.initial_conditions = init_con + + def init_constants(self): + pass + + def create_graph(self, xz_size): + pass + + def integrate_samples(self, s, x): + raise NotImplementedError + + def integrate_distribution(self, dist, x): + raise NotImplementedError + + def create_graph(self, xz_size): + pass diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/linear.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/linear.py new file mode 100644 index 000000000..228df8008 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/linear.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# 
SPDX-License-Identifier: Apache-2.0 +from ..dynamics import Dynamic + + +class Linear(Dynamic): + def init_constants(self): + pass + + def integrate_samples(self, v, x): + return v + + def integrate_distribution(self, v_dist, x): + return v_dist diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/single_integrator.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/single_integrator.py new file mode 100644 index 000000000..cb2cfeb2e --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/single_integrator.py @@ -0,0 +1,67 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +from model.dynamics import Dynamic +from utils import block_diag +from model.components import GMM2D + + +class SingleIntegrator(Dynamic): + def init_constants(self): + self.F = torch.eye(4, device=self.device, dtype=torch.float32) + self.F[0:2, 2:] = torch.eye(2, device=self.device, dtype=torch.float32) * self.dt + self.F_t = self.F.transpose(-2, -1) + + def integrate_samples(self, v, x=None): + """ + Integrates deterministic samples of velocity. + + :param v: Velocity samples + :param x: Not used for SI. + :return: Position samples + """ + p_0 = self.initial_conditions["pos"].unsqueeze(1) + return torch.cumsum(v, dim=2) * self.dt + p_0 + + def integrate_distribution(self, v_dist, x=None): + r""" + Integrates the GMM velocity distribution to a distribution over position. + The Kalman Equations are used. + + .. math:: \mu_{t+1} =\textbf{F} \mu_{t} + + .. math:: \mathbf{\Sigma}_{t+1}={\textbf {F}} \mathbf{\Sigma}_{t} {\textbf {F}}^{T} + + .. math:: + \textbf{F} = \left[ + \begin{array}{cccc} + \sigma_x^2 & \rho_p \sigma_x \sigma_y & 0 & 0 \\ + \rho_p \sigma_x \sigma_y & \sigma_y^2 & 0 & 0 \\ + 0 & 0 & \sigma_{v_x}^2 & \rho_v \sigma_{v_x} \sigma_{v_y} \\ + 0 & 0 & \rho_v \sigma_{v_x} \sigma_{v_y} & \sigma_{v_y}^2 \\ + \end{array} + \right]_{t} + + :param v_dist: Joint GMM Distribution over velocity in x and y direction. + :param x: Not used for SI. + :return: Joint GMM Distribution over position in x and y direction. 
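integrate_samples above boils down to a cumulative sum of velocities scaled by dt and offset by the current position. A minimal numeric sketch (shapes simplified, no extra sample dimension; all values are made up):

import torch

dt = 0.4
p_0 = torch.tensor([[0.0, 0.0]])                           # (bs=1, 2) current position
v = torch.tensor([[[1.0, 0.0], [1.0, 0.0], [0.0, 2.0]]])   # (bs=1, ph=3, 2) velocity samples
positions = torch.cumsum(v, dim=1) * dt + p_0.unsqueeze(1)
# -> [[[0.4, 0.0], [0.8, 0.0], [0.8, 0.8]]]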
+ """ + p_0 = self.initial_conditions["pos"].unsqueeze(1) + ph = v_dist.mus.shape[-3] + sample_batch_dim = list(v_dist.mus.shape[0:2]) + pos_dist_sigma_matrix_list = [] + + pos_mus = p_0[:, None] + torch.cumsum(v_dist.mus, dim=2) * self.dt + + vel_dist_sigma_matrix = v_dist.get_covariance_matrix() + pos_dist_sigma_matrix_t = torch.zeros(sample_batch_dim + [v_dist.components, 2, 2], device=self.device) + + for t in range(ph): + vel_sigma_matrix_t = vel_dist_sigma_matrix[:, :, t] + full_sigma_matrix_t = block_diag([pos_dist_sigma_matrix_t, vel_sigma_matrix_t]) + pos_dist_sigma_matrix_t = self.F[..., :2, :].matmul(full_sigma_matrix_t.matmul(self.F_t)[..., :2]) + pos_dist_sigma_matrix_list.append(pos_dist_sigma_matrix_t) + + pos_dist_sigma_matrix = torch.stack(pos_dist_sigma_matrix_list, dim=2) + return GMM2D.from_log_pis_mus_cov_mats(v_dist.log_pis, pos_mus, pos_dist_sigma_matrix) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/unicycle.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/unicycle.py new file mode 100644 index 000000000..b46820063 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/unicycle.py @@ -0,0 +1,239 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.nn as nn +from model.dynamics import Dynamic +from utils import block_diag +from model.components import GMM2D + + +class Unicycle(Dynamic): + def init_constants(self): + self.F_s = torch.eye(4, device=self.device, dtype=torch.float32) + self.F_s[0:2, 2:] = torch.eye(2, device=self.device, dtype=torch.float32) * self.dt + self.F_s_t = self.F_s.transpose(-2, -1) + + def create_graph(self, xz_size): + model_if_absent = nn.Linear(xz_size + 1, 1) + self.p0_model = self.model_registrar.get_model(f"{self.node_type}/unicycle_initializer", model_if_absent) + + def dynamic(self, x, u): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + x_p = x[0] + y_p = x[1] + phi = x[2] + v = x[3] + dphi = u[0] + a = u[1] + + mask = torch.abs(dphi) <= 1e-2 + dphi = ~mask * dphi + (mask) * 1 + + phi_p_omega_dt = phi + dphi * self.dt + dsin_domega = (torch.sin(phi_p_omega_dt) - torch.sin(phi)) / dphi + dcos_domega = (torch.cos(phi_p_omega_dt) - torch.cos(phi)) / dphi + + d1 = torch.stack( + [ + (x_p + (a / dphi) * dcos_domega + v * dsin_domega + (a / dphi) * torch.sin(phi_p_omega_dt) * self.dt), + (y_p - v * dcos_domega + (a / dphi) * dsin_domega - (a / dphi) * torch.cos(phi_p_omega_dt) * self.dt), + phi + dphi * self.dt, + v + a * self.dt, + ], + dim=0, + ) + d2 = torch.stack( + [ + x_p + v * torch.cos(phi) * self.dt + (a / 2) * torch.cos(phi) * self.dt**2, + y_p + v * torch.sin(phi) * self.dt + (a / 2) * torch.sin(phi) * self.dt**2, + phi * torch.ones_like(a), + v + a * self.dt, + ], + dim=0, + ) + return torch.where(~mask, d1, d2) + + def integrate_samples(self, control_samples, x=None): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + ph = control_samples.shape[-2] + p_0 = self.initial_conditions["pos"].unsqueeze(1) + v_0 = self.initial_conditions["vel"].unsqueeze(1) + + # In case the input is batched because of the robot in online use we repeat this to match the batch size of x. 
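The dynamic step above guards against division by a near-zero turn rate by substituting 1 before dividing and letting torch.where pick the straight-line branch afterwards; a standalone sketch of that pattern (the values and stand-in branches are illustrative):

import torch

dphi = torch.tensor([0.0, 0.5])
mask = torch.abs(dphi) <= 1e-2           # True where the turn rate is effectively zero
safe_dphi = ~mask * dphi + mask * 1      # -> [1.0, 0.5], no division by zero below
turning = 1.0 / safe_dphi                # only meaningful where mask is False
straight = torch.full_like(dphi, 99.0)   # stand-in for the straight-line expression
result = torch.where(~mask, turning, straight)
# -> [99.0, 2.0]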
+ if p_0.size()[0] != x.size()[0]: + p_0 = p_0.repeat(x.size()[0], 1, 1) + v_0 = v_0.repeat(x.size()[0], 1, 1) + + phi_0 = torch.atan2(v_0[..., 1], v_0[..., 0]) + + phi_0 = phi_0 + torch.tanh(self.p0_model(torch.cat((x, phi_0), dim=-1))) + + u = torch.stack([control_samples[..., 0], control_samples[..., 1]], dim=0) + x = torch.stack([p_0[..., 0], p_0[..., 1], phi_0, torch.norm(v_0, dim=-1)], dim=0).squeeze(dim=-1) + + mus_list = [] + for t in range(ph): + x = self.dynamic(x, u[..., t]) + mus_list.append(torch.stack((x[0], x[1]), dim=-1)) + + pos_mus = torch.stack(mus_list, dim=2) + return pos_mus + + def compute_control_jacobian(self, sample_batch_dim, components, x, u): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + F = torch.zeros(sample_batch_dim + [components, 4, 2], device=self.device, dtype=torch.float32) + + phi = x[2] + v = x[3] + dphi = u[0] + a = u[1] + + mask = torch.abs(dphi) <= 1e-2 + dphi = ~mask * dphi + (mask) * 1 + + phi_p_omega_dt = phi + dphi * self.dt + dsin_domega = (torch.sin(phi_p_omega_dt) - torch.sin(phi)) / dphi + dcos_domega = (torch.cos(phi_p_omega_dt) - torch.cos(phi)) / dphi + + F[..., 0, 0] = ( + (v / dphi) * torch.cos(phi_p_omega_dt) * self.dt + - (v / dphi) * dsin_domega + - (2 * a / dphi**2) * torch.sin(phi_p_omega_dt) * self.dt + - (2 * a / dphi**2) * dcos_domega + + (a / dphi) * torch.cos(phi_p_omega_dt) * self.dt**2 + ) + F[..., 0, 1] = (1 / dphi) * dcos_domega + (1 / dphi) * torch.sin(phi_p_omega_dt) * self.dt + + F[..., 1, 0] = ( + (v / dphi) * dcos_domega + - (2 * a / dphi**2) * dsin_domega + + (2 * a / dphi**2) * torch.cos(phi_p_omega_dt) * self.dt + + (v / dphi) * torch.sin(phi_p_omega_dt) * self.dt + + (a / dphi) * torch.sin(phi_p_omega_dt) * self.dt**2 + ) + F[..., 1, 1] = (1 / dphi) * dsin_domega - (1 / dphi) * torch.cos(phi_p_omega_dt) * self.dt + + F[..., 2, 0] = self.dt + + F[..., 3, 1] = self.dt + + F_sm = torch.zeros(sample_batch_dim + [components, 4, 2], device=self.device, dtype=torch.float32) + + F_sm[..., 0, 1] = (torch.cos(phi) * self.dt**2) / 2 + + F_sm[..., 1, 1] = (torch.sin(phi) * self.dt**2) / 2 + + F_sm[..., 3, 1] = self.dt + + return torch.where(~mask.unsqueeze(-1).unsqueeze(-1), F, F_sm) + + def compute_jacobian(self, sample_batch_dim, components, x, u): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + one = torch.tensor(1) + F = torch.zeros(sample_batch_dim + [components, 4, 4], device=self.device, dtype=torch.float32) + + phi = x[2] + v = x[3] + dphi = u[0] + a = u[1] + + mask = torch.abs(dphi) <= 1e-2 + dphi = ~mask * dphi + (mask) * 1 + + phi_p_omega_dt = phi + dphi * self.dt + dsin_domega = (torch.sin(phi_p_omega_dt) - torch.sin(phi)) / dphi + dcos_domega = (torch.cos(phi_p_omega_dt) - torch.cos(phi)) / dphi + + F[..., 0, 0] = one + F[..., 1, 1] = one + F[..., 2, 2] = one + F[..., 3, 3] = one + + F[..., 0, 2] = v * dcos_domega - (a / dphi) * dsin_domega + (a / dphi) * torch.cos(phi_p_omega_dt) * self.dt + F[..., 0, 3] = dsin_domega + + F[..., 1, 2] = v * dsin_domega + (a / dphi) * dcos_domega + (a / dphi) * torch.sin(phi_p_omega_dt) * self.dt + F[..., 1, 3] = -dcos_domega + + F_sm = torch.zeros(sample_batch_dim + [components, 4, 4], device=self.device, dtype=torch.float32) + + F_sm[..., 0, 0] = one + F_sm[..., 1, 1] = one + F_sm[..., 2, 2] = one + F_sm[..., 3, 3] = one + + F_sm[..., 0, 2] = -v * torch.sin(phi) * self.dt - (a * torch.sin(phi) * self.dt**2) / 2 + F_sm[..., 0, 3] = torch.cos(phi) * self.dt + + F_sm[..., 1, 2] = v * torch.cos(phi) * self.dt + (a * 
torch.cos(phi) * self.dt**2) / 2 + F_sm[..., 1, 3] = torch.sin(phi) * self.dt + + return torch.where(~mask.unsqueeze(-1).unsqueeze(-1), F, F_sm) + + def integrate_distribution(self, control_dist_dphi_a, x): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + sample_batch_dim = list(control_dist_dphi_a.mus.shape[0:2]) + ph = control_dist_dphi_a.mus.shape[-3] + p_0 = self.initial_conditions["pos"].unsqueeze(1) + v_0 = self.initial_conditions["vel"].unsqueeze(1) + + # In case the input is batched because of the robot in online use we repeat this to match the batch size of x. + if p_0.size()[0] != x.size()[0]: + p_0 = p_0.repeat(x.size()[0], 1, 1) + v_0 = v_0.repeat(x.size()[0], 1, 1) + + phi_0 = torch.atan2(v_0[..., 1], v_0[..., 0]) + + phi_0 = phi_0 + torch.tanh(self.p0_model(torch.cat((x, phi_0), dim=-1))) + + dist_sigma_matrix = control_dist_dphi_a.get_covariance_matrix() + pos_dist_sigma_matrix_t = torch.zeros( + sample_batch_dim + [control_dist_dphi_a.components, 4, 4], device=self.device + ) + + u = torch.stack([control_dist_dphi_a.mus[..., 0], control_dist_dphi_a.mus[..., 1]], dim=0) + x = torch.stack([p_0[..., 0], p_0[..., 1], phi_0, torch.norm(v_0, dim=-1)], dim=0) + + pos_dist_sigma_matrix_list = [] + mus_list = [] + for t in range(ph): + F_t = self.compute_jacobian(sample_batch_dim, control_dist_dphi_a.components, x, u[:, :, :, t]) + G_t = self.compute_control_jacobian(sample_batch_dim, control_dist_dphi_a.components, x, u[:, :, :, t]) + dist_sigma_matrix_t = dist_sigma_matrix[:, :, t] + pos_dist_sigma_matrix_t = F_t.matmul(pos_dist_sigma_matrix_t.matmul(F_t.transpose(-2, -1))) + G_t.matmul( + dist_sigma_matrix_t.matmul(G_t.transpose(-2, -1)) + ) + pos_dist_sigma_matrix_list.append(pos_dist_sigma_matrix_t[..., :2, :2]) + + x = self.dynamic(x, u[:, :, :, t]) + mus_list.append(torch.stack((x[0], x[1]), dim=-1)) + + pos_dist_sigma_matrix = torch.stack(pos_dist_sigma_matrix_list, dim=2) + pos_mus = torch.stack(mus_list, dim=2) + return GMM2D.from_log_pis_mus_cov_mats(control_dist_dphi_a.log_pis, pos_mus, pos_dist_sigma_matrix) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/mgcvae.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/mgcvae.py new file mode 100644 index 000000000..c05e86229 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/mgcvae.py @@ -0,0 +1,1240 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import warnings +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from model.components import * +from model.model_utils import * +import model.dynamics as dynamic_module +from environment.scene_graph import DirectedEdge + + +class MultimodalGenerativeCVAE(torch.nn.Module): + def __init__(self, env, node_type, model_registrar, hyperparams, device, edge_types, log_writer=None): + super().__init__() + self.hyperparams = hyperparams + self.env = env + self.node_type = node_type + self.model_registrar = model_registrar + self.log_writer = log_writer + self.device = device + self.edge_types = [edge_type for edge_type in edge_types if edge_type[0] is node_type] + self.curr_iter = 0 + + self.node_modules = dict() + self.node_modules = torch.nn.ModuleDict() + + self.min_hl = self.hyperparams["minimum_history_length"] + self.max_hl = self.hyperparams["maximum_history_length"] + self.ph = self.hyperparams["prediction_horizon"] + self.state = self.hyperparams["state"] + self.pred_state = 
self.hyperparams["pred_state"][node_type] + self.state_length = int(np.sum([len(entity_dims) for entity_dims in self.state[node_type].values()])) + if self.hyperparams["incl_robot_node"]: + self.robot_state_length = int( + np.sum([len(entity_dims) for entity_dims in self.state[env.robot_type].values()]) + ) + self.pred_state_length = int(np.sum([len(entity_dims) for entity_dims in self.pred_state.values()])) + + edge_types_str = [DirectedEdge.get_str_from_types(*edge_type) for edge_type in self.edge_types] + self.create_graphical_model(edge_types_str) + + dynamic_class = getattr(dynamic_module, hyperparams["dynamic"][self.node_type]["name"]) + dyn_limits = hyperparams["dynamic"][self.node_type]["limits"] + self.dynamic = dynamic_class( + self.env.scenes[0].dt, dyn_limits, device, self.model_registrar, self.x_size, self.node_type + ) + + def eval(self): + super().eval() + for key in self.node_modules.keys(): + self.node_modules[key].eval() + + def set_curr_iter(self, curr_iter): + self.curr_iter = curr_iter + + def add_submodule(self, name, model_if_absent): + self.node_modules[name] = self.model_registrar.get_model(name, model_if_absent) + + def clear_submodules(self): + self.node_modules.clear() + + def create_node_models(self): + ############################ + # Node History Encoder # + ############################ + self.add_submodule( + self.node_type + "/node_history_encoder", + model_if_absent=nn.LSTM( + input_size=self.state_length, hidden_size=self.hyperparams["enc_rnn_dim_history"], batch_first=True + ), + ) + + ########################### + # Node Future Encoder # + ########################### + # We'll create this here, but then later check if in training mode. + # Based on that, we'll factor this into the computation graph (or not). + self.add_submodule( + self.node_type + "/node_future_encoder", + model_if_absent=nn.LSTM( + input_size=self.pred_state_length, + hidden_size=self.hyperparams["enc_rnn_dim_future"], + bidirectional=True, + batch_first=True, + ), + ) + # These are related to how you initialize states for the node future encoder. + self.add_submodule( + self.node_type + "/node_future_encoder/initial_h", + model_if_absent=nn.Linear(self.state_length, self.hyperparams["enc_rnn_dim_future"]), + ) + self.add_submodule( + self.node_type + "/node_future_encoder/initial_c", + model_if_absent=nn.Linear(self.state_length, self.hyperparams["enc_rnn_dim_future"]), + ) + + ############################ + # Robot Future Encoder # + ############################ + # We'll create this here, but then later check if we're next to the robot. + # Based on that, we'll factor this into the computation graph (or not). + if self.hyperparams["incl_robot_node"]: + self.add_submodule( + "robot_future_encoder", + model_if_absent=nn.LSTM( + input_size=self.robot_state_length, + hidden_size=self.hyperparams["enc_rnn_dim_future"], + bidirectional=True, + batch_first=True, + ), + ) + # These are related to how you initialize states for the robot future encoder. 
+ self.add_submodule( + "robot_future_encoder/initial_h", + model_if_absent=nn.Linear(self.robot_state_length, self.hyperparams["enc_rnn_dim_future"]), + ) + self.add_submodule( + "robot_future_encoder/initial_c", + model_if_absent=nn.Linear(self.robot_state_length, self.hyperparams["enc_rnn_dim_future"]), + ) + + if self.hyperparams["edge_encoding"]: + ############################## + # Edge Influence Encoder # + ############################## + # NOTE: The edge influence encoding happens during calls + # to forward or incremental_forward, so we don't create + # a model for it here for the max and sum variants. + if self.hyperparams["edge_influence_combine_method"] == "bi-rnn": + self.add_submodule( + self.node_type + "/edge_influence_encoder", + model_if_absent=nn.LSTM( + input_size=self.hyperparams["enc_rnn_dim_edge"], + hidden_size=self.hyperparams["enc_rnn_dim_edge_influence"], + bidirectional=True, + batch_first=True, + ), + ) + + # Four times because we're trying to mimic a bi-directional + # LSTM's output (which, here, is c and h from both ends). + self.eie_output_dims = 4 * self.hyperparams["enc_rnn_dim_edge_influence"] + + elif self.hyperparams["edge_influence_combine_method"] == "attention": + # Chose additive attention because of https://arxiv.org/pdf/1703.03906.pdf + # We calculate an attention context vector using the encoded edges as the "encoder" + # (that we attend _over_) + # and the node history encoder representation as the "decoder state" (that we attend _on_). + self.add_submodule( + self.node_type + "/edge_influence_encoder", + model_if_absent=AdditiveAttention( + encoder_hidden_state_dim=self.hyperparams["enc_rnn_dim_edge_influence"], + decoder_hidden_state_dim=self.hyperparams["enc_rnn_dim_history"], + ), + ) + + self.eie_output_dims = self.hyperparams["enc_rnn_dim_edge_influence"] + + ################### + # Map Encoder # + ################### + if self.hyperparams["use_map_encoding"]: + if self.node_type in self.hyperparams["map_encoder"]: + me_params = self.hyperparams["map_encoder"][self.node_type] + self.add_submodule( + self.node_type + "/map_encoder", + model_if_absent=CNNMapEncoder( + me_params["map_channels"], + me_params["hidden_channels"], + me_params["output_size"], + me_params["masks"], + me_params["strides"], + me_params["patch_size"], + ), + ) + + ################################ + # Discrete Latent Variable # + ################################ + self.latent = DiscreteLatent(self.hyperparams, self.device) + + ###################################################################### + # Various Fully-Connected Layers from Encoder to Latent Variable # + ###################################################################### + # Node History Encoder + x_size = self.hyperparams["enc_rnn_dim_history"] + if self.hyperparams["edge_encoding"]: + # Edge Encoder + x_size += self.eie_output_dims + if self.hyperparams["incl_robot_node"]: + # Future Conditional Encoder + x_size += 4 * self.hyperparams["enc_rnn_dim_future"] + if self.hyperparams["use_map_encoding"] and self.node_type in self.hyperparams["map_encoder"]: + # Map Encoder + x_size += self.hyperparams["map_encoder"][self.node_type]["output_size"] + + z_size = self.hyperparams["N"] * self.hyperparams["K"] + + if self.hyperparams["p_z_x_MLP_dims"] is not None: + self.add_submodule( + self.node_type + "/p_z_x", model_if_absent=nn.Linear(x_size, self.hyperparams["p_z_x_MLP_dims"]) + ) + hx_size = self.hyperparams["p_z_x_MLP_dims"] + else: + hx_size = x_size + + self.add_submodule(self.node_type + "/hx_to_z", 
model_if_absent=nn.Linear(hx_size, self.latent.z_dim)) + + if self.hyperparams["q_z_xy_MLP_dims"] is not None: + self.add_submodule( + self.node_type + "/q_z_xy", + # Node Future Encoder + model_if_absent=nn.Linear( + x_size + 4 * self.hyperparams["enc_rnn_dim_future"], self.hyperparams["q_z_xy_MLP_dims"] + ), + ) + hxy_size = self.hyperparams["q_z_xy_MLP_dims"] + else: + # Node Future Encoder + hxy_size = x_size + 4 * self.hyperparams["enc_rnn_dim_future"] + + self.add_submodule(self.node_type + "/hxy_to_z", model_if_absent=nn.Linear(hxy_size, self.latent.z_dim)) + + #################### + # Decoder LSTM # + #################### + if self.hyperparams["incl_robot_node"]: + decoder_input_dims = self.pred_state_length + self.robot_state_length + z_size + x_size + else: + decoder_input_dims = self.pred_state_length + z_size + x_size + + self.add_submodule( + self.node_type + "/decoder/state_action", + model_if_absent=nn.Sequential(nn.Linear(self.state_length, self.pred_state_length)), + ) + + self.add_submodule( + self.node_type + "/decoder/rnn_cell", + model_if_absent=nn.GRUCell(decoder_input_dims, self.hyperparams["dec_rnn_dim"]), + ) + self.add_submodule( + self.node_type + "/decoder/initial_h", + model_if_absent=nn.Linear(z_size + x_size, self.hyperparams["dec_rnn_dim"]), + ) + + ################### + # Decoder GMM # + ################### + self.add_submodule( + self.node_type + "/decoder/proj_to_GMM_log_pis", + model_if_absent=nn.Linear(self.hyperparams["dec_rnn_dim"], self.hyperparams["GMM_components"]), + ) + self.add_submodule( + self.node_type + "/decoder/proj_to_GMM_mus", + model_if_absent=nn.Linear( + self.hyperparams["dec_rnn_dim"], self.hyperparams["GMM_components"] * self.pred_state_length + ), + ) + self.add_submodule( + self.node_type + "/decoder/proj_to_GMM_log_sigmas", + model_if_absent=nn.Linear( + self.hyperparams["dec_rnn_dim"], self.hyperparams["GMM_components"] * self.pred_state_length + ), + ) + self.add_submodule( + self.node_type + "/decoder/proj_to_GMM_corrs", + model_if_absent=nn.Linear(self.hyperparams["dec_rnn_dim"], self.hyperparams["GMM_components"]), + ) + + self.x_size = x_size + self.z_size = z_size + + def create_edge_models(self, edge_types): + for edge_type in edge_types: + neighbor_state_length = int( + np.sum([len(entity_dims) for entity_dims in self.state[edge_type.split("->")[1]].values()]) + ) + if self.hyperparams["edge_state_combine_method"] == "pointnet": + self.add_submodule( + edge_type + "/pointnet_encoder", + model_if_absent=nn.Sequential( + nn.Linear(self.state_length, 2 * self.state_length), + nn.ReLU(), + nn.Linear(2 * self.state_length, 2 * self.state_length), + nn.ReLU(), + ), + ) + + edge_encoder_input_size = 2 * self.state_length + self.state_length + + elif self.hyperparams["edge_state_combine_method"] == "attention": + self.add_submodule( + self.node_type + "/edge_attention_combine", + model_if_absent=TemporallyBatchedAdditiveAttention( + encoder_hidden_state_dim=self.state_length, decoder_hidden_state_dim=self.state_length + ), + ) + edge_encoder_input_size = self.state_length + neighbor_state_length + + else: + edge_encoder_input_size = self.state_length + neighbor_state_length + + self.add_submodule( + edge_type + "/edge_encoder", + model_if_absent=nn.LSTM( + input_size=edge_encoder_input_size, + hidden_size=self.hyperparams["enc_rnn_dim_edge"], + batch_first=True, + ), + ) + + def create_graphical_model(self, edge_types): + """ + Creates or queries all trainable components. 
+ + :param edge_types: List containing strings for all possible edge types for the node type. + :return: None + """ + self.clear_submodules() + + ############################ + # Everything but Edges # + ############################ + self.create_node_models() + + ##################### + # Edge Encoders # + ##################### + if self.hyperparams["edge_encoding"]: + self.create_edge_models(edge_types) + + for name, module in self.node_modules.items(): + module.to(self.device) + + def create_new_scheduler(self, name, annealer, annealer_kws, creation_condition=True): + value_scheduler = None + rsetattr(self, name + "_scheduler", value_scheduler) + if creation_condition: + annealer_kws["device"] = self.device + value_annealer = annealer(annealer_kws) + rsetattr(self, name + "_annealer", value_annealer) + + # This is the value that we'll update on each call of + # step_annealers(). + rsetattr(self, name, value_annealer(0).clone().detach()) + dummy_optimizer = optim.Optimizer([rgetattr(self, name)], {"lr": value_annealer(0).clone().detach()}) + rsetattr(self, name + "_optimizer", dummy_optimizer) + + value_scheduler = CustomLR(dummy_optimizer, value_annealer) + rsetattr(self, name + "_scheduler", value_scheduler) + + self.schedulers.append(value_scheduler) + self.annealed_vars.append(name) + + def set_annealing_params(self): + self.schedulers = list() + self.annealed_vars = list() + + self.create_new_scheduler( + name="kl_weight", + annealer=sigmoid_anneal, + annealer_kws={ + "start": self.hyperparams["kl_weight_start"], + "finish": self.hyperparams["kl_weight"], + "center_step": self.hyperparams["kl_crossover"], + "steps_lo_to_hi": self.hyperparams["kl_crossover"] / self.hyperparams["kl_sigmoid_divisor"], + }, + ) + + self.create_new_scheduler( + name="latent.temp", + annealer=exp_anneal, + annealer_kws={ + "start": self.hyperparams["tau_init"], + "finish": self.hyperparams["tau_final"], + "rate": self.hyperparams["tau_decay_rate"], + }, + ) + + self.create_new_scheduler( + name="latent.z_logit_clip", + annealer=sigmoid_anneal, + annealer_kws={ + "start": self.hyperparams["z_logit_clip_start"], + "finish": self.hyperparams["z_logit_clip_final"], + "center_step": self.hyperparams["z_logit_clip_crossover"], + "steps_lo_to_hi": self.hyperparams["z_logit_clip_crossover"] / self.hyperparams["z_logit_clip_divisor"], + }, + creation_condition=self.hyperparams["use_z_logit_clipping"], + ) + + def step_annealers(self): + # This should manage all of the step-wise changed + # parameters automatically. + for idx, annealed_var in enumerate(self.annealed_vars): + if rgetattr(self, annealed_var + "_scheduler") is not None: + # First we step the scheduler. + with warnings.catch_warnings(): # We use a dummy optimizer: Warning because no .step() was called on it + warnings.simplefilter("ignore") + rgetattr(self, annealed_var + "_scheduler").step() + + # Then we set the annealed vars' value. 
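create_new_scheduler above stores each annealed scalar as the learning rate of a dummy optimizer so that a standard LR scheduler can update it each step. A rough equivalent with stock PyTorch classes (SGD and LambdaLR stand in for the project's dummy Optimizer and CustomLR; the schedule itself is made up):

import torch

dummy_param = torch.nn.Parameter(torch.zeros(1))
dummy_opt = torch.optim.SGD([dummy_param], lr=1.0)
sched = torch.optim.lr_scheduler.LambdaLR(dummy_opt, lr_lambda=lambda step: min(1.0, step / 100.0))

for step in range(3):
    sched.step()  # torch warns that optimizer.step() was never called, hence the warning filter above
    kl_weight = dummy_opt.param_groups[0]["lr"]  # read the annealed value back, as step_annealers() does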
+ rsetattr(self, annealed_var, rgetattr(self, annealed_var + "_optimizer").param_groups[0]["lr"]) + + self.summarize_annealers() + + def summarize_annealers(self): + if self.log_writer is not None: + for annealed_var in self.annealed_vars: + if rgetattr(self, annealed_var) is not None: + self.log_writer.add_scalar( + "%s/%s" % (str(self.node_type), annealed_var.replace(".", "/")), + rgetattr(self, annealed_var), + self.curr_iter, + ) + + def obtain_encoded_tensors( + self, + mode, + inputs, + inputs_st, + packed_inputs_st, + labels, + labels_st, + first_history_indices, + neighbors, + neighbors_edge_value, + robot, + map, + ) -> (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor): + """ + Encodes input and output tensors for node and robot. + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param inputs: Input tensor including the state for each agent over time [bs, t, state]. + :param inputs_st: Standardized input tensor. + :param labels: Label tensor including the label output for each agent over time [bs, t, pred_state]. + :param labels_st: Standardized label tensor. + :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs] + :param neighbors: Preprocessed dict (indexed by edge type) of list of neighbor states over time. + [[bs, t, neighbor state]] + :param neighbors_edge_value: Preprocessed edge values for all neighbor nodes [[N]] + :param robot: Standardized robot state over time. [bs, t, robot_state] + :param map: Tensor of Map information. [bs, channels, x, y] + :return: tuple(x, x_nr_t, y_e, y_r, y, n_s_t0) + WHERE + - x: Encoded input / condition tensor to the CVAE x_e. + - x_r_t: Robot state (if robot is in scene). + - y_e: Encoded label / future of the node. + - y_r: Encoded future of the robot. + - y: Label / future of the node. + - n_s_t0: Standardized current state of the node. 
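The body below slices the last timestep of the [bs, t, state] input into position and velocity, assuming the first four state dimensions are (x, y, x_dot, y_dot). An illustrative shape-only sketch (dimensions made up):

import torch

bs, t, state_dim = 2, 8, 6
inputs = torch.rand(bs, t, state_dim)
node_present_state = inputs[:, -1]   # (bs, state_dim) current full state
node_pos = inputs[:, -1, 0:2]        # (bs, 2) current position
node_vel = inputs[:, -1, 2:4]        # (bs, 2) current velocity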
+ """ + + x, x_r_t, y_e, y_r, y = None, None, None, None, None + initial_dynamics = dict() + + batch_size = inputs.shape[0] + + ######################################### + # Provide basic information to encoders # + ######################################### + node_history = inputs + node_present_state = inputs[:, -1] + node_pos = inputs[:, -1, 0:2] + node_vel = inputs[:, -1, 2:4] + + node_history_st = packed_inputs_st + node_present_state_st = inputs_st[:, -1] + node_pos_st = inputs_st[:, -1, 0:2] + node_vel_st = inputs_st[:, -1, 2:4] + + n_s_t0 = node_present_state_st + + initial_dynamics["pos"] = node_pos + initial_dynamics["vel"] = node_vel + + self.dynamic.set_initial_condition(initial_dynamics) + + if self.hyperparams["incl_robot_node"]: + x_r_t, y_r = robot[..., 0, :], robot[..., 1:, :] + + ################## + # Encode History # + ################## + node_history_encoded = self.encode_node_history(mode, node_history_st, first_history_indices) + + return node_history_encoded + + ################## + # Encode Present # + ################## + node_present = node_present_state_st # [bs, state_dim] + + ################## + # Encode Future # + ################## + if mode != ModeKeys.PREDICT: + y = labels_st + + ############################## + # Encode Node Edges per Type # + ############################## + if self.hyperparams["edge_encoding"]: + node_edges_encoded = list() + for edge_type in self.edge_types: + # Encode edges for given edge type + encoded_edges_type = self.encode_edge( + mode, + node_history, + node_history_st, + edge_type, + neighbors[edge_type], + neighbors_edge_value[edge_type], + first_history_indices, + ) + node_edges_encoded.append(encoded_edges_type) # List of [bs/nbs, enc_rnn_dim] + ##################### + # Encode Node Edges # + ##################### + total_edge_influence = self.encode_total_edge_influence( + mode, node_edges_encoded, node_history_encoded, batch_size + ) + + ################ + # Map Encoding # + ################ + if self.hyperparams["use_map_encoding"] and self.node_type in self.hyperparams["map_encoder"]: + if self.log_writer and (self.curr_iter + 1) % 500 == 0: + map_clone = map.clone() + map_patch = self.hyperparams["map_encoder"][self.node_type]["patch_size"] + map_clone[:, :, map_patch[1] - 5 : map_patch[1] + 5, map_patch[0] - 5 : map_patch[0] + 5] = 1.0 + self.log_writer.add_images( + f"{self.node_type}/cropped_maps", map_clone, self.curr_iter, dataformats="NCWH" + ) + + encoded_map = self.node_modules[self.node_type + "/map_encoder"](map * 2.0 - 1.0, (mode == ModeKeys.TRAIN)) + do = self.hyperparams["map_encoder"][self.node_type]["dropout"] + encoded_map = F.dropout(encoded_map, do, training=(mode == ModeKeys.TRAIN)) + + ###################################### + # Concatenate Encoder Outputs into x # + ###################################### + x_concat_list = list() + + # Every node has an edge-influence encoder (which could just be zero). + if self.hyperparams["edge_encoding"]: + x_concat_list.append(total_edge_influence) # [bs/nbs, 4*enc_rnn_dim] + + # Every node has a history encoder. 
+ x_concat_list.append(node_history_encoded) # [bs/nbs, enc_rnn_dim_history] + + if self.hyperparams["incl_robot_node"]: + robot_future_encoder = self.encode_robot_future(mode, x_r_t, y_r) + x_concat_list.append(robot_future_encoder) + + if self.hyperparams["use_map_encoding"] and self.node_type in self.hyperparams["map_encoder"]: + if self.log_writer: + self.log_writer.add_scalar( + f"{self.node_type}/encoded_map_max", torch.max(torch.abs(encoded_map)), self.curr_iter + ) + x_concat_list.append(encoded_map) + + x = torch.cat(x_concat_list, dim=1) + + if mode == ModeKeys.TRAIN or mode == ModeKeys.EVAL: + y_e = self.encode_node_future(mode, node_present, y) + + return x, x_r_t, y_e, y_r, y, n_s_t0 + + def encode_node_history(self, mode, node_hist, first_history_indices): + """ + Encodes the nodes history. + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param node_hist: Historic and current state of the node. [bs, mhl, state] + :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs] + :return: Encoded node history tensor. [bs, enc_rnn_dim] + """ + outputs = run_lstm_on_variable_length_seqs( + self.node_modules[self.node_type + "/node_history_encoder"], + # outputs, _ = run_lstm_on_variable_length_seqs(self.node_modules[self.node_type + '/node_history_encoder'], + original_seqs=node_hist, + lower_indices=first_history_indices, + ) + + return outputs + + outputs = F.dropout( + outputs, p=1.0 - self.hyperparams["rnn_kwargs"]["dropout_keep_prob"], training=(mode == ModeKeys.TRAIN) + ) # [bs, max_time, enc_rnn_dim] + + last_index_per_sequence = -(first_history_indices + 1) + + return outputs[torch.arange(first_history_indices.shape[0]), last_index_per_sequence] + + def encode_edge( + self, mode, node_history, node_history_st, edge_type, neighbors, neighbors_edge_value, first_history_indices + ): + + max_hl = self.hyperparams["maximum_history_length"] + + edge_states_list = list() # list of [#of neighbors, max_ht, state_dim] + for i, neighbor_states in enumerate(neighbors): # Get neighbors for timestep in batch + if len(neighbor_states) == 0: # There are no neighbors for edge type # TODO necessary? + neighbor_state_length = int( + np.sum([len(entity_dims) for entity_dims in self.state[edge_type[1]].values()]) + ) + edge_states_list.append(torch.zeros((1, max_hl + 1, neighbor_state_length), device=self.device)) + else: + edge_states_list.append(torch.stack(neighbor_states, dim=0).to(self.device)) + + if self.hyperparams["edge_state_combine_method"] == "sum": + # Used in Structural-RNN to combine edges as well. + op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.sum(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams["dynamic_edges"] == "yes": + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_value in neighbors_edge_value: + op_applied_edge_mask_list.append( + torch.clamp(torch.sum(edge_value.to(self.device), dim=0, keepdim=True), max=1.0) + ) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + elif self.hyperparams["edge_state_combine_method"] == "max": + # Used in NLP, e.g. max over word embeddings in a sentence. 
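The history and edge encoders in this class pick each sequence's last valid LSTM output by counting back from the padded end with -(first_history_index + 1). A standalone sketch of that gather (the toy tensors are illustrative):

import torch

bs, max_time, enc_dim = 3, 5, 4
outputs = torch.arange(bs * max_time * enc_dim, dtype=torch.float).view(bs, max_time, enc_dim)
first_history_indices = torch.tensor([0, 2, 4])   # number of missing leading timesteps per sequence
last_index_per_sequence = -(first_history_indices + 1)
last_hidden = outputs[torch.arange(bs), last_index_per_sequence]  # (bs, enc_dim)
# row 0 reads timestep -1, row 1 timestep -3, row 2 timestep -5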
+ op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.max(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams["dynamic_edges"] == "yes": + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_value in neighbors_edge_value: + op_applied_edge_mask_list.append( + torch.clamp(torch.max(edge_value.to(self.device), dim=0, keepdim=True), max=1.0) + ) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + elif self.hyperparams["edge_state_combine_method"] == "mean": + # Used in NLP, e.g. mean over word embeddings in a sentence. + op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.mean(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams["dynamic_edges"] == "yes": + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_value in neighbors_edge_value: + op_applied_edge_mask_list.append( + torch.clamp(torch.mean(edge_value.to(self.device), dim=0, keepdim=True), max=1.0) + ) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + joint_history = torch.cat([combined_neighbors, node_history_st], dim=-1) + + outputs, _ = run_lstm_on_variable_length_seqs( + self.node_modules[DirectedEdge.get_str_from_types(*edge_type) + "/edge_encoder"], + original_seqs=joint_history, + lower_indices=first_history_indices, + ) + + outputs = F.dropout( + outputs, p=1.0 - self.hyperparams["rnn_kwargs"]["dropout_keep_prob"], training=(mode == ModeKeys.TRAIN) + ) # [bs, max_time, enc_rnn_dim] + + last_index_per_sequence = -(first_history_indices + 1) + ret = outputs[torch.arange(last_index_per_sequence.shape[0]), last_index_per_sequence] + if self.hyperparams["dynamic_edges"] == "yes": + return ret * combined_edge_masks + else: + return ret + + def encode_total_edge_influence(self, mode, encoded_edges, node_history_encoder, batch_size): + if self.hyperparams["edge_influence_combine_method"] == "sum": + stacked_encoded_edges = torch.stack(encoded_edges, dim=0) + combined_edges = torch.sum(stacked_encoded_edges, dim=0) + + elif self.hyperparams["edge_influence_combine_method"] == "mean": + stacked_encoded_edges = torch.stack(encoded_edges, dim=0) + combined_edges = torch.mean(stacked_encoded_edges, dim=0) + + elif self.hyperparams["edge_influence_combine_method"] == "max": + stacked_encoded_edges = torch.stack(encoded_edges, dim=0) + combined_edges = torch.max(stacked_encoded_edges, dim=0) + + elif self.hyperparams["edge_influence_combine_method"] == "bi-rnn": + if len(encoded_edges) == 0: + combined_edges = torch.zeros((batch_size, self.eie_output_dims), device=self.device) + + else: + # axis=1 because then we get size [batch_size, max_time, depth] + encoded_edges = torch.stack(encoded_edges, dim=1) + + _, state = self.node_modules[self.node_type + "/edge_influence_encoder"](encoded_edges) + combined_edges = unpack_RNN_state(state) + combined_edges = F.dropout( + combined_edges, + p=1.0 - self.hyperparams["rnn_kwargs"]["dropout_keep_prob"], + training=(mode == ModeKeys.TRAIN), + ) + + elif self.hyperparams["edge_influence_combine_method"] == "attention": + # Used in Social Attention (https://arxiv.org/abs/1710.04689) + if len(encoded_edges) == 0: + combined_edges = torch.zeros((batch_size, self.eie_output_dims), device=self.device) + + else: + # axis=1 because then we get 
size [batch_size, max_time, depth] + encoded_edges = torch.stack(encoded_edges, dim=1) + combined_edges, _ = self.node_modules[self.node_type + "/edge_influence_encoder"]( + encoded_edges, node_history_encoder + ) + combined_edges = F.dropout( + combined_edges, + p=1.0 - self.hyperparams["rnn_kwargs"]["dropout_keep_prob"], + training=(mode == ModeKeys.TRAIN), + ) + + return combined_edges + + def encode_node_future(self, mode, node_present, node_future) -> torch.Tensor: + """ + Encodes the node future (during training) using a bi-directional LSTM + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param node_present: Current state of the node. [bs, state] + :param node_future: Future states of the node. [bs, ph, state] + :return: Encoded future. + """ + initial_h_model = self.node_modules[self.node_type + "/node_future_encoder/initial_h"] + initial_c_model = self.node_modules[self.node_type + "/node_future_encoder/initial_c"] + + # Here we're initializing the forward hidden states, + # but zeroing the backward ones. + initial_h = initial_h_model(node_present) + initial_h = torch.stack([initial_h, torch.zeros_like(initial_h, device=self.device)], dim=0) + + initial_c = initial_c_model(node_present) + initial_c = torch.stack([initial_c, torch.zeros_like(initial_c, device=self.device)], dim=0) + + initial_state = (initial_h, initial_c) + + _, state = self.node_modules[self.node_type + "/node_future_encoder"](node_future, initial_state) + state = unpack_RNN_state(state) + state = F.dropout( + state, p=1.0 - self.hyperparams["rnn_kwargs"]["dropout_keep_prob"], training=(mode == ModeKeys.TRAIN) + ) + + return state + + def encode_robot_future(self, mode, robot_present, robot_future) -> torch.Tensor: + """ + Encodes the robot future (during training) using a bi-directional LSTM + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param robot_present: Current state of the robot. [bs, state] + :param robot_future: Future states of the robot. [bs, ph, state] + :return: Encoded future. + """ + initial_h_model = self.node_modules["robot_future_encoder/initial_h"] + initial_c_model = self.node_modules["robot_future_encoder/initial_c"] + + # Here we're initializing the forward hidden states, + # but zeroing the backward ones. + initial_h = initial_h_model(robot_present) + initial_h = torch.stack([initial_h, torch.zeros_like(initial_h, device=self.device)], dim=0) + + initial_c = initial_c_model(robot_present) + initial_c = torch.stack([initial_c, torch.zeros_like(initial_c, device=self.device)], dim=0) + + initial_state = (initial_h, initial_c) + + _, state = self.node_modules["robot_future_encoder"](robot_future, initial_state) + state = unpack_RNN_state(state) + state = F.dropout( + state, p=1.0 - self.hyperparams["rnn_kwargs"]["dropout_keep_prob"], training=(mode == ModeKeys.TRAIN) + ) + + return state + + def q_z_xy(self, mode, x, y_e) -> torch.Tensor: + r""" + .. math:: q_\phi(z \mid \mathbf{x}_i, \mathbf{y}_i) + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param x: Input / Condition tensor. + :param y_e: Encoded future tensor. + :return: Latent distribution of the CVAE. 
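Mechanically, q_z_xy below concatenates the condition x with the encoded future y_e and projects to the latent logits. A minimal sketch with made-up sizes (N=5, K=5 giving z_dim=25 is only an example):

import torch
import torch.nn as nn

bs, x_size, future_enc, z_dim = 4, 32, 16, 25
x = torch.rand(bs, x_size)
y_e = torch.rand(bs, future_enc)
hxy_to_z = nn.Linear(x_size + future_enc, z_dim)
latent_logits = hxy_to_z(torch.cat([x, y_e], dim=1))  # (bs, z_dim) logits for the discrete latent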
+ """ + xy = torch.cat([x, y_e], dim=1) + + if self.hyperparams["q_z_xy_MLP_dims"] is not None: + dense = self.node_modules[self.node_type + "/q_z_xy"] + h = F.dropout( + F.relu(dense(xy)), p=1.0 - self.hyperparams["MLP_dropout_keep_prob"], training=(mode == ModeKeys.TRAIN) + ) + + else: + h = xy + + to_latent = self.node_modules[self.node_type + "/hxy_to_z"] + return self.latent.dist_from_h(to_latent(h), mode) + + def p_z_x(self, mode, x): + r""" + .. math:: p_\theta(z \mid \mathbf{x}_i) + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param x: Input / Condition tensor. + :return: Latent distribution of the CVAE. + """ + if self.hyperparams["p_z_x_MLP_dims"] is not None: + dense = self.node_modules[self.node_type + "/p_z_x"] + h = F.dropout( + F.relu(dense(x)), p=1.0 - self.hyperparams["MLP_dropout_keep_prob"], training=(mode == ModeKeys.TRAIN) + ) + + else: + h = x + + to_latent = self.node_modules[self.node_type + "/hx_to_z"] + return self.latent.dist_from_h(to_latent(h), mode) + + def project_to_GMM_params(self, tensor) -> (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor): + """ + Projects tensor to parameters of a GMM with N components and D dimensions. + + :param tensor: Input tensor. + :return: tuple(log_pis, mus, log_sigmas, corrs) + WHERE + - log_pis: Weight (logarithm) of each GMM component. [N] + - mus: Mean of each GMM component. [N, D] + - log_sigmas: Standard Deviation (logarithm) of each GMM component. [N, D] + - corrs: Correlation between the GMM components. [N] + """ + log_pis = self.node_modules[self.node_type + "/decoder/proj_to_GMM_log_pis"](tensor) + mus = self.node_modules[self.node_type + "/decoder/proj_to_GMM_mus"](tensor) + log_sigmas = self.node_modules[self.node_type + "/decoder/proj_to_GMM_log_sigmas"](tensor) + corrs = torch.tanh(self.node_modules[self.node_type + "/decoder/proj_to_GMM_corrs"](tensor)) + return log_pis, mus, log_sigmas, corrs + + def p_y_xz( + self, mode, x, x_nr_t, y_r, n_s_t0, z_stacked, prediction_horizon, num_samples, num_components=1, gmm_mode=False + ): + r""" + .. math:: p_\psi(\mathbf{y}_i \mid \mathbf{x}_i, z) + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param x: Input / Condition tensor. + :param x_nr_t: Joint state of node and robot (if robot is in scene). + :param y: Future tensor. + :param y_r: Encoded future tensor. + :param n_s_t0: Standardized current state of the node. + :param z_stacked: Stacked latent state. [num_samples_z * num_samples_gmm, bs, latent_state] + :param prediction_horizon: Number of prediction timesteps. + :param num_samples: Number of samples from the latent space. + :param num_components: Number of GMM components. + :param gmm_mode: If True: The mode of the GMM is sampled. + :return: GMM2D. If mode is Predict, also samples from the GMM. 
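The decoder below rolls a GRUCell forward over the prediction horizon and projects each hidden state to distribution parameters, feeding a sample back in as the next action. A rough sketch with illustrative dimensions and a plain diagonal Gaussian in place of GMM2D:

import torch
import torch.nn as nn

bs, zx_dim, rnn_dim, ph, pred_dim = 4, 40, 64, 6, 2
zx = torch.rand(bs, zx_dim)
a_t = torch.zeros(bs, pred_dim)           # last action; the real model seeds this from the current state
cell = nn.GRUCell(zx_dim + pred_dim, rnn_dim)
proj_mu = nn.Linear(rnn_dim, pred_dim)
proj_log_sigma = nn.Linear(rnn_dim, pred_dim)

state = torch.zeros(bs, rnn_dim)
mus = []
for _ in range(ph):
    state = cell(torch.cat([zx, a_t], dim=1), state)
    mu_t, log_sigma_t = proj_mu(state), proj_log_sigma(state)
    a_t = mu_t + torch.randn_like(mu_t) * log_sigma_t.exp()  # reparameterized sample fed back in
    mus.append(mu_t)
mus = torch.stack(mus, dim=1)             # (bs, ph, pred_dim) per-step means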
+ """ + ph = prediction_horizon + pred_dim = self.pred_state_length + + z = torch.reshape(z_stacked, (-1, self.latent.z_dim)) + zx = torch.cat([z, x.repeat(num_samples * num_components, 1)], dim=1) + + cell = self.node_modules[self.node_type + "/decoder/rnn_cell"] + initial_h_model = self.node_modules[self.node_type + "/decoder/initial_h"] + + initial_state = initial_h_model(zx) + + log_pis, mus, log_sigmas, corrs, a_sample = [], [], [], [], [] + + # Infer initial action state for node from current state + a_0 = self.node_modules[self.node_type + "/decoder/state_action"](n_s_t0) + + state = initial_state + if self.hyperparams["incl_robot_node"]: + input_ = torch.cat( + [zx, a_0.repeat(num_samples * num_components, 1), x_nr_t.repeat(num_samples * num_components, 1)], dim=1 + ) + else: + input_ = torch.cat([zx, a_0.repeat(num_samples * num_components, 1)], dim=1) + + for j in range(ph): + h_state = cell(input_, state) + log_pi_t, mu_t, log_sigma_t, corr_t = self.project_to_GMM_params(h_state) + + gmm = GMM2D(log_pi_t, mu_t, log_sigma_t, corr_t) # [k;bs, pred_dim] + + if mode == ModeKeys.PREDICT and gmm_mode: + a_t = gmm.mode() + else: + a_t = gmm.rsample() + + if num_components > 1: + if mode == ModeKeys.PREDICT: + log_pis.append(self.latent.p_dist.logits.repeat(num_samples, 1, 1)) + else: + log_pis.append(self.latent.q_dist.logits.repeat(num_samples, 1, 1)) + else: + log_pis.append( + torch.ones_like(corr_t.reshape(num_samples, num_components, -1).permute(0, 2, 1).reshape(-1, 1)) + ) + + mus.append( + mu_t.reshape(num_samples, num_components, -1, 2).permute(0, 2, 1, 3).reshape(-1, 2 * num_components) + ) + log_sigmas.append( + log_sigma_t.reshape(num_samples, num_components, -1, 2) + .permute(0, 2, 1, 3) + .reshape(-1, 2 * num_components) + ) + corrs.append(corr_t.reshape(num_samples, num_components, -1).permute(0, 2, 1).reshape(-1, num_components)) + + if self.hyperparams["incl_robot_node"]: + dec_inputs = [zx, a_t, y_r[:, j].repeat(num_samples * num_components, 1)] + else: + dec_inputs = [zx, a_t] + input_ = torch.cat(dec_inputs, dim=1) + state = h_state + + log_pis = torch.stack(log_pis, dim=1) + mus = torch.stack(mus, dim=1) + log_sigmas = torch.stack(log_sigmas, dim=1) + corrs = torch.stack(corrs, dim=1) + + a_dist = GMM2D( + torch.reshape(log_pis, [num_samples, -1, ph, num_components]), + torch.reshape(mus, [num_samples, -1, ph, num_components * pred_dim]), + torch.reshape(log_sigmas, [num_samples, -1, ph, num_components * pred_dim]), + torch.reshape(corrs, [num_samples, -1, ph, num_components]), + ) + + if self.hyperparams["dynamic"][self.node_type]["distribution"]: + y_dist = self.dynamic.integrate_distribution(a_dist, x) + else: + y_dist = a_dist + + if mode == ModeKeys.PREDICT: + if gmm_mode: + a_sample = a_dist.mode() + else: + a_sample = a_dist.rsample() + sampled_future = self.dynamic.integrate_samples(a_sample, x) + return y_dist, sampled_future + else: + return y_dist + + def encoder(self, mode, x, y_e, num_samples=None): + """ + Encoder of the CVAE. + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param x: Input / Condition tensor. + :param y_e: Encoded future tensor. + :param num_samples: Number of samples from the latent space during Prediction. + :return: tuple(z, kl_obj) + WHERE + - z: Samples from the latent space. 
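+                 (``hyperparams['k']`` samples in TRAIN mode, ``hyperparams['k_eval']`` in EVAL mode,
+                 and the caller-supplied ``num_samples`` in PREDICT mode)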
+ - kl_obj: KL Divergence between q and p
+ """
+ if mode == ModeKeys.TRAIN:
+ sample_ct = self.hyperparams["k"]
+ elif mode == ModeKeys.EVAL:
+ sample_ct = self.hyperparams["k_eval"]
+ elif mode == ModeKeys.PREDICT:
+ sample_ct = num_samples
+ if num_samples is None:
+ raise ValueError("num_samples cannot be None with mode == PREDICT.")
+
+ self.latent.q_dist = self.q_z_xy(mode, x, y_e)
+ self.latent.p_dist = self.p_z_x(mode, x)
+
+ z = self.latent.sample_q(sample_ct, mode)
+
+ if mode == ModeKeys.TRAIN:
+ kl_obj = self.latent.kl_q_p(self.log_writer, "%s" % str(self.node_type), self.curr_iter)
+ if self.log_writer is not None:
+ self.log_writer.add_scalar("%s/%s" % (str(self.node_type), "kl"), kl_obj, self.curr_iter)
+ else:
+ kl_obj = None
+
+ return z, kl_obj
+
+ def decoder(self, mode, x, x_nr_t, y, y_r, n_s_t0, z, labels, prediction_horizon, num_samples):
+ """
+ Decoder of the CVAE.
+
+ :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict.
+ :param x: Input / Condition tensor.
+ :param x_nr_t: Joint state of node and robot (if robot is in scene).
+ :param y: Future tensor.
+ :param y_r: Encoded future tensor.
+ :param n_s_t0: Standardized current state of the node.
+ :param z: Stacked latent state.
+ :param labels: Label tensor against which the log probability is evaluated. [bs, t, pred_state]
+ :param prediction_horizon: Number of prediction timesteps.
+ :param num_samples: Number of samples from the latent space.
+ :return: Log probability of y over p.
+ """
+
+ num_components = self.hyperparams["N"] * self.hyperparams["K"]
+ y_dist = self.p_y_xz(
+ mode, x, x_nr_t, y_r, n_s_t0, z, prediction_horizon, num_samples, num_components=num_components
+ )
+ log_p_yt_xz = torch.clamp(y_dist.log_prob(labels), max=self.hyperparams["log_p_yt_xz_max"])
+ if self.hyperparams["log_histograms"] and self.log_writer is not None:
+ self.log_writer.add_histogram("%s/%s" % (str(self.node_type), "log_p_yt_xz"), log_p_yt_xz, self.curr_iter)
+
+ log_p_y_xz = torch.sum(log_p_yt_xz, dim=2)
+ return log_p_y_xz
+
+ def train_loss(
+ self,
+ inputs,
+ inputs_st,
+ first_history_indices,
+ labels,
+ labels_st,
+ neighbors,
+ neighbors_edge_value,
+ robot,
+ map,
+ prediction_horizon,
+ ) -> torch.Tensor:
+ """
+ Calculates the training loss for a batch.
+
+ :param inputs: Input tensor including the state for each agent over time [bs, t, state].
+ :param inputs_st: Standardized input tensor.
+ :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs]
+ :param labels: Label tensor including the label output for each agent over time [bs, t, pred_state].
+ :param labels_st: Standardized label tensor.
+ :param neighbors: Preprocessed dict (indexed by edge type) of list of neighbor states over time.
+ [[bs, t, neighbor state]]
+ :param neighbors_edge_value: Preprocessed edge values for all neighbor nodes [[N]]
+ :param robot: Standardized robot state over time. [bs, t, robot_state]
+ :param map: Tensor of Map information. [bs, channels, x, y]
+ :param prediction_horizon: Number of prediction timesteps.
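+        The returned loss is the negative ELBO: -(E[log p(y | x, z)] - kl_weight * KL(q(z | x, y) || p(z | x)) + I_p),
+        where I_p is the Monte-Carlo mutual-information estimate computed from the prior p(z | x) via mutual_inf_mc.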
+ :return: Scalar tensor -> nll loss + """ + mode = ModeKeys.TRAIN + + x, x_nr_t, y_e, y_r, y, n_s_t0 = self.obtain_encoded_tensors( + mode=mode, + inputs=inputs, + inputs_st=inputs_st, + labels=labels, + labels_st=labels_st, + first_history_indices=first_history_indices, + neighbors=neighbors, + neighbors_edge_value=neighbors_edge_value, + robot=robot, + map=map, + ) + + z, kl = self.encoder(mode, x, y_e) + log_p_y_xz = self.decoder( + mode, + x, + x_nr_t, + y, + y_r, + n_s_t0, + z, + labels, # Loss is calculated on unstandardized label + prediction_horizon, + self.hyperparams["k"], + ) + + log_p_y_xz_mean = torch.mean(log_p_y_xz, dim=0) # [nbs] + log_likelihood = torch.mean(log_p_y_xz_mean) + + mutual_inf_q = mutual_inf_mc(self.latent.q_dist) + mutual_inf_p = mutual_inf_mc(self.latent.p_dist) + + ELBO = log_likelihood - self.kl_weight * kl + 1.0 * mutual_inf_p + loss = -ELBO + + if self.hyperparams["log_histograms"] and self.log_writer is not None: + self.log_writer.add_histogram( + "%s/%s" % (str(self.node_type), "log_p_y_xz"), log_p_y_xz_mean, self.curr_iter + ) + + if self.log_writer is not None: + self.log_writer.add_scalar( + "%s/%s" % (str(self.node_type), "mutual_information_q"), mutual_inf_q, self.curr_iter + ) + self.log_writer.add_scalar( + "%s/%s" % (str(self.node_type), "mutual_information_p"), mutual_inf_p, self.curr_iter + ) + self.log_writer.add_scalar( + "%s/%s" % (str(self.node_type), "log_likelihood"), log_likelihood, self.curr_iter + ) + self.log_writer.add_scalar("%s/%s" % (str(self.node_type), "loss"), loss, self.curr_iter) + if self.hyperparams["log_histograms"]: + self.latent.summarize_for_tensorboard(self.log_writer, str(self.node_type), self.curr_iter) + return loss + + def eval_loss( + self, + inputs, + inputs_st, + first_history_indices, + labels, + labels_st, + neighbors, + neighbors_edge_value, + robot, + map, + prediction_horizon, + ) -> torch.Tensor: + """ + Calculates the evaluation loss for a batch. + + :param inputs: Input tensor including the state for each agent over time [bs, t, state]. + :param inputs_st: Standardized input tensor. + :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs] + :param labels: Label tensor including the label output for each agent over time [bs, t, pred_state]. + :param labels_st: Standardized label tensor. + :param neighbors: Preprocessed dict (indexed by edge type) of list of neighbor states over time. + [[bs, t, neighbor state]] + :param neighbors_edge_value: Preprocessed edge values for all neighbor nodes [[N]] + :param robot: Standardized robot state over time. [bs, t, robot_state] + :param map: Tensor of Map information. [bs, channels, x, y] + :param prediction_horizon: Number of prediction timesteps. 
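+        The NLL is estimated from a single latent sample drawn from the full distribution p(z | x).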
+ :return: Scalar negative log-likelihood (NLL) estimate for the batch.
+ """
+
+ mode = ModeKeys.EVAL
+
+ x, x_nr_t, y_e, y_r, y, n_s_t0 = self.obtain_encoded_tensors(
+ mode=mode,
+ inputs=inputs,
+ inputs_st=inputs_st,
+ labels=labels,
+ labels_st=labels_st,
+ first_history_indices=first_history_indices,
+ neighbors=neighbors,
+ neighbors_edge_value=neighbors_edge_value,
+ robot=robot,
+ map=map,
+ )
+
+ num_components = self.hyperparams["N"] * self.hyperparams["K"]
+ ### Importance sampled NLL estimate
+ z, _ = self.encoder(mode, x, y_e) # [k_eval, nbs, N*K]
+ z = self.latent.sample_p(1, mode, full_dist=True)
+ y_dist, _ = self.p_y_xz(
+ ModeKeys.PREDICT,
+ x,
+ x_nr_t,
+ y_r,
+ n_s_t0,
+ z,
+ prediction_horizon,
+ num_samples=1,
+ num_components=num_components,
+ )
+ # We use unstandardized labels to compute the loss
+ log_p_yt_xz = torch.clamp(y_dist.log_prob(labels), max=self.hyperparams["log_p_yt_xz_max"])
+ log_p_y_xz = torch.sum(log_p_yt_xz, dim=2)
+ log_p_y_xz_mean = torch.mean(log_p_y_xz, dim=0) # [nbs]
+ log_likelihood = torch.mean(log_p_y_xz_mean)
+ nll = -log_likelihood
+
+ return nll
+
+ def predict(
+ self,
+ inputs,
+ inputs_st,
+ packed_inputs_st,
+ first_history_indices,
+ neighbors,
+ neighbors_edge_value,
+ robot,
+ map,
+ prediction_horizon,
+ num_samples,
+ z_mode=False,
+ gmm_mode=False,
+ full_dist=True,
+ all_z_sep=False,
+ ):
+ """
+ Predicts the future of a batch of nodes.
+
+ :param inputs: Input tensor including the state for each agent over time [bs, t, state].
+ :param inputs_st: Standardized input tensor.
+ :param packed_inputs_st: PackedSequence built from the standardized inputs by the caller and passed through to obtain_encoded_tensors.
+ :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs]
+ :param neighbors: Preprocessed dict (indexed by edge type) of list of neighbor states over time.
+ [[bs, t, neighbor state]]
+ :param neighbors_edge_value: Preprocessed edge values for all neighbor nodes [[N]]
+ :param robot: Standardized robot state over time. [bs, t, robot_state]
+ :param map: Tensor of Map information. [bs, channels, x, y]
+ :param prediction_horizon: Number of prediction timesteps.
+ :param num_samples: Number of samples from the latent space.
+ :param z_mode: If True: Select the most likely latent state.
+ :param gmm_mode: If True: The mode of the GMM is sampled.
+ :param all_z_sep: Samples each latent mode individually without merging them into a GMM.
+ :param full_dist: Samples all latent states and merges them into a GMM as output.
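+        Note: in this test variant the tuple returned by ``obtain_encoded_tensors`` is returned directly,
+        and the latent sampling / decoding code further below is left unreachable.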
+ :return: + """ + mode = ModeKeys.PREDICT + + # x, x_nr_t, _, y_r, _, n_s_t0 = self.obtain_encoded_tensors(mode=mode, + out = self.obtain_encoded_tensors( + mode=mode, + inputs=inputs, + inputs_st=inputs_st, + packed_inputs_st=packed_inputs_st, + labels=None, + labels_st=None, + first_history_indices=first_history_indices, + neighbors=neighbors, + neighbors_edge_value=neighbors_edge_value, + robot=robot, + map=map, + ) + # return x, n_s_t0 + return out + + self.latent.p_dist = self.p_z_x(mode, x) + z, num_samples, num_components = self.latent.sample_p( + num_samples, mode, most_likely_z=z_mode, full_dist=full_dist, all_z_sep=all_z_sep + ) + + _, our_sampled_future = self.p_y_xz( + mode, x, x_nr_t, y_r, n_s_t0, z, prediction_horizon, num_samples, num_components, gmm_mode + ) + + return our_sampled_future diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_registrar.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_registrar.py new file mode 100644 index 000000000..d5aaf1966 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_registrar.py @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import os +import torch +import torch.nn as nn + + +def get_model_device(model): + return next(model.parameters()).device + + +class ModelRegistrar(nn.Module): + def __init__(self, model_dir, device): + super(ModelRegistrar, self).__init__() + self.model_dict = nn.ModuleDict() + self.model_dir = model_dir + self.device = device + + def forward(self): + raise NotImplementedError("Although ModelRegistrar is a nn.Module, it is only to store parameters.") + + def get_model(self, name, model_if_absent=None): + # 4 cases: name in self.model_dict and model_if_absent is None (OK) + # name in self.model_dict and model_if_absent is not None (OK) + # name not in self.model_dict and model_if_absent is not None (OK) + # name not in self.model_dict and model_if_absent is None (NOT OK) + + if name in self.model_dict: + return self.model_dict[name] + + elif model_if_absent is not None: + self.model_dict[name] = model_if_absent.to(self.device) + return self.model_dict[name] + + else: + raise ValueError(f"{name} was never initialized in this Registrar!") + + def get_name_match(self, name): + ret_model_list = nn.ModuleList() + for key in self.model_dict.keys(): + if name in key: + ret_model_list.append(self.model_dict[key]) + return ret_model_list + + def get_all_but_name_match(self, name): + ret_model_list = nn.ModuleList() + for key in self.model_dict.keys(): + if name not in key: + ret_model_list.append(self.model_dict[key]) + return ret_model_list + + def print_model_names(self): + print(self.model_dict.keys()) + + def save_models(self, curr_iter): + # Create the model directiory if it's not present. 
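+        # Minimal sketch of the directory creation the comment above describes (assumes model_dir may not
+        # exist yet; exist_ok makes this a no-op when it already does).
+        os.makedirs(self.model_dir, exist_ok=True)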
+ save_path = os.path.join(self.model_dir, "model_registrar-%d.pt" % curr_iter) + + torch.save(self.model_dict, save_path) + + def load_models(self, iter_num): + self.model_dict.clear() + + save_path = os.path.join(self.model_dir, "model_registrar-%d.pt" % iter_num) + + print("") + print("Loading from " + save_path) + self.model_dict = torch.load(save_path, map_location=self.device) + print("Loaded!") + print("") + + def to(self, device): + for name, model in self.model_dict.items(): + if get_model_device(model) != device: + model.to(device) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_utils.py new file mode 100644 index 000000000..19c4cf3ec --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_utils.py @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.nn.utils.rnn as rnn +from enum import Enum +import functools +import numpy as np +import math + + +class ModeKeys(Enum): + TRAIN = 1 + EVAL = 2 + PREDICT = 3 + + +def cyclical_lr(stepsize, min_lr=3e-4, max_lr=3e-3, decay=1.0): + # Lambda function to calculate the LR + lr_lambda = lambda it: min_lr + (max_lr - min_lr) * relative(it, stepsize) * decay**it + + # Additional function to see where on the cycle we are + def relative(it, stepsize): + cycle = math.floor(1 + it / (2 * stepsize)) + x = abs(it / stepsize - 2 * cycle + 1) + return max(0, (1 - x)) + + return lr_lambda + + +def to_one_hot(labels, n_labels): + return torch.eye(n_labels, device=labels.device)[labels] + + +def exp_anneal(anneal_kws): + device = anneal_kws["device"] + start = torch.tensor(anneal_kws["start"], device=device) + finish = torch.tensor(anneal_kws["finish"], device=device) + rate = torch.tensor(anneal_kws["rate"], device=device) + return lambda step: finish - (finish - start) * torch.pow( + rate, torch.tensor(step, dtype=torch.float, device=device) + ) + + +def sigmoid_anneal(anneal_kws): + device = anneal_kws["device"] + start = torch.tensor(anneal_kws["start"], device=device) + finish = torch.tensor(anneal_kws["finish"], device=device) + center_step = torch.tensor(anneal_kws["center_step"], device=device, dtype=torch.float) + steps_lo_to_hi = torch.tensor(anneal_kws["steps_lo_to_hi"], device=device, dtype=torch.float) + return lambda step: start + (finish - start) * torch.sigmoid( + (torch.tensor(float(step), device=device) - center_step) * (1.0 / steps_lo_to_hi) + ) + + +class CustomLR(torch.optim.lr_scheduler.LambdaLR): + def __init__(self, optimizer, lr_lambda, last_epoch=-1): + super(CustomLR, self).__init__(optimizer, lr_lambda, last_epoch) + + def get_lr(self): + return [lmbda(self.last_epoch) for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)] + + +def mutual_inf_mc(x_dist): + dist = x_dist.__class__ + H_y = dist(probs=x_dist.probs.mean(dim=0)).entropy() + return (H_y - x_dist.entropy().mean(dim=0)).sum() + + +def run_lstm_on_variable_length_seqs( + lstm_module, original_seqs, lower_indices=None, upper_indices=None, total_length=None +): + # breakpoint() + # bs, tf = original_seqs.shape[:2] + # if lower_indices is None: + # lower_indices = torch.zeros(bs, dtype=torch.int) + # if upper_indices is None: + # upper_indices = torch.ones(bs, dtype=torch.int) * (tf - 1) + # if total_length is None: + # total_length = max(upper_indices) + 1 + # # This is done so that we can just pass in self.prediction_timesteps 
+ # # (which we want to INCLUDE, so this will exclude the next timestep). + # inclusive_break_indices = upper_indices + 1 + + # pad_list = list() + # for i, seq_len in enumerate(inclusive_break_indices): + # pad_list.append(original_seqs[i, lower_indices[i]:seq_len]) + + # packed_seqs = rnn.pack_sequence(pad_list, enforce_sorted=False) + # return packed_seqs # TypeError: int() argument must be a string, a bytes-like object or a real number, not 'Any' + + packed_seqs = original_seqs + packed_output, (h_n, c_n) = lstm_module(packed_seqs) + return packed_output # TypeError: object of type 'Call' has no len() + output, _ = rnn.pad_packed_sequence(packed_output, batch_first=True, total_length=total_length) + + return output, (h_n, c_n) + + +def extract_subtensor_per_batch_element(tensor, indices): + batch_idxs = torch.arange(start=0, end=len(indices)) + + batch_idxs = batch_idxs[~torch.isnan(indices)] + indices = indices[~torch.isnan(indices)] + if indices.size == 0: + return None + else: + indices = indices.long() + if tensor.is_cuda: + batch_idxs = batch_idxs.to(tensor.get_device()) + indices = indices.to(tensor.get_device()) + return tensor[batch_idxs, indices] + + +def unpack_RNN_state(state_tuple): + # PyTorch returned LSTM states have 3 dims: + # (num_layers * num_directions, batch, hidden_size) + + state = torch.cat(state_tuple, dim=0).permute(1, 0, 2) + # Now state is (batch, 2 * num_layers * num_directions, hidden_size) + + state_size = state.size() + return torch.reshape(state, (-1, state_size[1] * state_size[2])) + + +def rsetattr(obj, attr, val): + pre, _, post = attr.rpartition(".") + return setattr(rgetattr(obj, pre) if pre else obj, post, val) + + +# using wonder's beautiful simplification: +# https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427 +def rgetattr(obj, attr, *args): + def _getattr(obj, attr): + return getattr(obj, attr, *args) + + return functools.reduce(_getattr, [obj] + attr.split(".")) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/__init__.py new file mode 100644 index 000000000..e8fa6b337 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/__init__.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from .online_trajectron import OnlineTrajectron +from .online_mgcvae import OnlineMultimodalGenerativeCVAE diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_mgcvae.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_mgcvae.py new file mode 100644 index 000000000..624ebf426 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_mgcvae.py @@ -0,0 +1,428 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from collections import defaultdict, Counter +from model.components import * +from model.model_utils import * +from model.dataset import get_relative_robot_traj +import model.dynamics as dynamic_module +from model.mgcvae import MultimodalGenerativeCVAE +from environment.scene_graph import DirectedEdge +from environment.node_type import NodeType + + +class 
OnlineMultimodalGenerativeCVAE(MultimodalGenerativeCVAE): + def __init__(self, env, node, model_registrar, hyperparams, device): + self.hyperparams = hyperparams + self.node = node + self.node_type = self.node.type + + if len(env.scenes) != 1: + raise ValueError("Passed in Environment has number of scenes != 1") + self.robot = env.scenes[0].robot + self.model_registrar = model_registrar + self.device = device + + self.node_modules = dict() + self.env = env + self.scene_graph = None + + self.state = self.hyperparams["state"] + self.pred_state = self.hyperparams["pred_state"][self.node.type] + self.state_length = int(np.sum([len(entity_dims) for entity_dims in self.state[self.node.type].values()])) + if self.hyperparams["incl_robot_node"]: + self.robot_state_length = int( + np.sum([len(entity_dims) for entity_dims in self.state[self.robot.type].values()]) + ) + self.pred_state_length = int(np.sum([len(entity_dims) for entity_dims in self.pred_state.values()])) + + self.curr_hidden_states = dict() + self.edge_types = Counter() + + self.create_graphical_model() + + dynamic_class = getattr(dynamic_module, self.hyperparams["dynamic"][self.node_type]["name"]) + dyn_limits = hyperparams["dynamic"][self.node_type]["limits"] + self.dynamic = dynamic_class( + self.env.scenes[0].dt, dyn_limits, device, self.model_registrar, self.x_size, self.node_type + ) + + def create_graphical_model(self): + """ + Creates or queries all trainable components. + + :return: None + """ + self.clear_submodules() + + ############################ + # Everything but Edges # + ############################ + self.create_node_models() + + for name, module in self.node_modules.items(): + module.to(self.device) + + def update_graph(self, new_scene_graph, new_neighbors, removed_neighbors): + self.scene_graph = new_scene_graph + + if self.node in new_neighbors: + for edge_type, new_neighbor_nodes in new_neighbors[self.node].items(): + self.add_edge_model(edge_type) + self.edge_types += Counter({edge_type: len(new_neighbor_nodes)}) + + if self.node in removed_neighbors: + for edge_type, removed_neighbor_nodes in removed_neighbors[self.node].items(): + self.remove_edge_model(edge_type) + self.edge_types -= Counter({edge_type: len(removed_neighbor_nodes)}) + + def get_edge_to(self, other_node): + return DirectedEdge(self.node, other_node) + + def add_edge_model(self, edge_type): + if self.hyperparams["edge_encoding"]: + if edge_type + "/edge_encoder" not in self.node_modules: + neighbor_state_length = int( + np.sum( + [ + len(entity_dims) + for entity_dims in self.state[self._get_other_node_type_from_edge(edge_type)].values() + ] + ) + ) + if self.hyperparams["edge_state_combine_method"] == "pointnet": + self.add_submodule( + edge_type + "/pointnet_encoder", + model_if_absent=nn.Sequential( + nn.Linear(self.state_length, 2 * self.state_length), + nn.ReLU(), + nn.Linear(2 * self.state_length, 2 * self.state_length), + nn.ReLU(), + ), + ) + + edge_encoder_input_size = 2 * self.state_length + self.state_length + + elif self.hyperparams["edge_state_combine_method"] == "attention": + self.add_submodule( + self.node.type + "/edge_attention_combine", + model_if_absent=TemporallyBatchedAdditiveAttention( + encoder_hidden_state_dim=self.state_length, decoder_hidden_state_dim=self.state_length + ), + ) + edge_encoder_input_size = self.state_length + neighbor_state_length + + else: + edge_encoder_input_size = self.state_length + neighbor_state_length + + self.add_submodule( + edge_type + "/edge_encoder", + model_if_absent=nn.LSTM( + 
input_size=edge_encoder_input_size, + hidden_size=self.hyperparams["enc_rnn_dim_edge"], + batch_first=True, + ), + ) + + def _get_other_node_type_from_edge(self, edge_type_str): + n2_type_str = edge_type_str.split("->")[1] + return NodeType(n2_type_str, self.env.node_type_list.index(n2_type_str) + 1) + + def _get_edge_type_from_str(self, edge_type_str): + n1_type_str, n2_type_str = edge_type_str.split("->") + return ( + NodeType(n1_type_str, self.env.node_type_list.index(n1_type_str) + 1), + NodeType(n2_type_str, self.env.node_type_list.index(n2_type_str) + 1), + ) + + def remove_edge_model(self, edge_type): + if self.hyperparams["edge_encoding"]: + if len(self.scene_graph.get_neighbors(self.node, self._get_other_node_type_from_edge(edge_type))) == 0: + del self.node_modules[edge_type + "/edge_encoder"] + + def obtain_encoded_tensors(self, mode, inputs, inputs_st, inputs_np, robot_present_and_future, maps): + x, x_r_t, y_r = None, None, None + batch_size = 1 + + our_inputs = inputs[self.node] + our_inputs_st = inputs_st[self.node] + + initial_dynamics = dict() + initial_dynamics["pos"] = our_inputs[:, 0:2] # TODO: Generalize + initial_dynamics["vel"] = our_inputs[:, 2:4] # TODO: Generalize + self.dynamic.set_initial_condition(initial_dynamics) + + ######################################### + # Provide basic information to encoders # + ######################################### + if self.hyperparams["incl_robot_node"] and self.robot is not None: + robot_present_and_future_st = get_relative_robot_traj( + self.env, self.state, our_inputs, robot_present_and_future, self.node.type, self.robot.type + ) + x_r_t = robot_present_and_future_st[..., 0, :] + y_r = robot_present_and_future_st[..., 1:, :] + + ################## + # Encode History # + ################## + node_history_encoded = self.encode_node_history(our_inputs_st) + + ############################## + # Encode Node Edges per Type # + ############################## + total_edge_influence = None + if self.hyperparams["edge_encoding"]: + node_edges_encoded = list() + for edge_type in self.edge_types: + connected_nodes_batched = list() + edge_masks_batched = list() + + # We get all nodes which are connected to the current node for the current timestep + connected_nodes_batched.append( + self.scene_graph.get_neighbors(self.node, self._get_other_node_type_from_edge(edge_type)) + ) + + if self.hyperparams["dynamic_edges"] == "yes": + # We get the edge masks for the current node at the current timestep + edge_masks_for_node = self.scene_graph.get_edge_scaling(self.node) + edge_masks_batched.append(torch.tensor(edge_masks_for_node, dtype=torch.float, device=self.device)) + + # Encode edges for given edge type + encoded_edges_type = self.encode_edge( + inputs, inputs_st, inputs_np, edge_type, connected_nodes_batched, edge_masks_batched + ) + node_edges_encoded.append(encoded_edges_type) # List of [bs/nbs, enc_rnn_dim] + + ##################### + # Encode Node Edges # + ##################### + total_edge_influence = self.encode_total_edge_influence( + mode, node_edges_encoded, node_history_encoded, batch_size + ) + + self.TD = {"node_history_encoded": node_history_encoded, "total_edge_influence": total_edge_influence} + + ################ + # Map Encoding # + ################ + if self.hyperparams["use_map_encoding"] and self.node_type in self.hyperparams["map_encoder"]: + if self.node not in maps: + # This means the node was removed (it is only being kept around because of the edge removal filter). 
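+                # A zero map encoding of the expected output size is used instead of running the map encoder.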
+ me_params = self.hyperparams["map_encoder"][self.node_type] + self.TD["encoded_map"] = torch.zeros((1, me_params["output_size"])) + else: + encoded_map = self.node_modules[self.node_type + "/map_encoder"]( + maps[self.node] * 2.0 - 1.0, (mode == ModeKeys.TRAIN) + ) + do = self.hyperparams["map_encoder"][self.node_type]["dropout"] + encoded_map = F.dropout(encoded_map, do, training=(mode == ModeKeys.TRAIN)) + self.TD["encoded_map"] = encoded_map + + ###################################### + # Concatenate Encoder Outputs into x # + ###################################### + return self.create_encoder_rep(mode, self.TD, x_r_t, y_r) + + def create_encoder_rep(self, mode, TD, robot_present_st, robot_future_st): + # Unpacking TD + node_history_encoded = TD["node_history_encoded"] + if self.hyperparams["edge_encoding"]: + total_edge_influence = TD["total_edge_influence"] + if self.hyperparams["use_map_encoding"] and self.node_type in self.hyperparams["map_encoder"]: + encoded_map = TD["encoded_map"] + + if ( + self.hyperparams["incl_robot_node"] + and self.robot is not None + and robot_future_st is not None + and robot_present_st is not None + ): + robot_future_encoder = self.encode_robot_future(mode, robot_present_st, robot_future_st) + + # Tiling for multiple samples + # This tiling is done because: + # a) we must consider the prediction case where there are many candidate robot future actions, + # b) the edge and history encoders are all the same regardless of which candidate future robot action + # we're evaluating. + node_history_encoded = TD["node_history_encoded"].repeat(robot_future_st.size()[0], 1) + if self.hyperparams["edge_encoding"]: + total_edge_influence = TD["total_edge_influence"].repeat(robot_future_st.size()[0], 1) + if self.hyperparams["use_map_encoding"] and self.node_type in self.hyperparams["map_encoder"]: + encoded_map = TD["encoded_map"].repeat(robot_future_st.size()[0], 1) + + elif self.hyperparams["incl_robot_node"] and self.robot is not None: + # Four times because we're trying to mimic a bi-directional RNN's output (which is c and h from both ends). + robot_future_encoder = torch.zeros([1, 4 * self.hyperparams["enc_rnn_dim_future"]], device=self.device) + + x_concat_list = list() + + # Every node has an edge-influence encoder (which could just be zero). + if self.hyperparams["edge_encoding"]: + x_concat_list.append(total_edge_influence) # [bs/nbs, 4*enc_rnn_dim] + + # Every node has a history encoder. 
+ x_concat_list.append(node_history_encoded) # [bs/nbs, enc_rnn_dim_history] + + if self.hyperparams["incl_robot_node"] and self.robot is not None: + x_concat_list.append(robot_future_encoder) # [bs/nbs, 4*enc_rnn_dim_history] + + if self.hyperparams["use_map_encoding"] and self.node_type in self.hyperparams["map_encoder"]: + x_concat_list.append(encoded_map) # [bs/nbs, CNN output size] + + return torch.cat(x_concat_list, dim=1) + + def encode_node_history(self, inputs_st): + new_state = torch.unsqueeze(inputs_st, dim=1) # [bs, 1, state_dim] + if self.node.type + "/node_history_encoder" not in self.curr_hidden_states: + outputs, self.curr_hidden_states[self.node.type + "/node_history_encoder"] = self.node_modules[ + self.node.type + "/node_history_encoder" + ](new_state) + else: + outputs, self.curr_hidden_states[self.node.type + "/node_history_encoder"] = self.node_modules[ + self.node.type + "/node_history_encoder" + ](new_state, self.curr_hidden_states[self.node.type + "/node_history_encoder"]) + + return outputs[:, 0, :] + + def encode_edge(self, inputs, inputs_st, inputs_np, edge_type, connected_nodes, edge_masks): + edge_type_tuple = self._get_edge_type_from_str(edge_type) + edge_states_list = list() # list of [#of neighbors, max_ht, state_dim] + neighbor_states = list() + + orig_rel_state = inputs[self.node].cpu().numpy() + for node in connected_nodes[0]: + neighbor_state_np = inputs_np[node] + + # Make State relative to node + _, std = self.env.get_standardize_params(self.state[node.type], node_type=node.type) + std[0:2] = self.env.attention_radius[edge_type_tuple] + + # TODO: This all makes the unsafe assumption that the first n dims + # refer to the same quantities even for different agent types! + equal_dims = np.min((neighbor_state_np.shape[-1], orig_rel_state.shape[-1])) + rel_state = np.zeros_like(neighbor_state_np) + rel_state[..., :equal_dims] = orig_rel_state[..., :equal_dims] + neighbor_state_np_st = self.env.standardize( + neighbor_state_np, self.state[node.type], node_type=node.type, mean=rel_state, std=std + ) + + neighbor_state = torch.tensor(neighbor_state_np_st).float().to(self.device) + neighbor_states.append(neighbor_state) + + if len(neighbor_states) == 0: # There are no neighbors for edge type # TODO necessary? + neighbor_state_length = int(np.sum([len(entity_dims) for entity_dims in self.state[edge_type[1]].values()])) + edge_states_list.append(torch.zeros((1, 1, neighbor_state_length), device=self.device)) + else: + edge_states_list.append(torch.stack(neighbor_states, dim=0)) + + if self.hyperparams["edge_state_combine_method"] == "sum": + # Used in Structural-RNN to combine edges as well. + op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.sum(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams["dynamic_edges"] == "yes": + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_mask in edge_masks: + op_applied_edge_mask_list.append(torch.clamp(torch.sum(edge_mask, dim=0, keepdim=True), max=1.0)) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + elif self.hyperparams["edge_state_combine_method"] == "max": + # Used in NLP, e.g. max over word embeddings in a sentence. 
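+            # Note: torch.max with a dim argument returns a (values, indices) named tuple; only the values
+            # tensor is meant to be stacked below.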
+ op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.max(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams["dynamic_edges"] == "yes": + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_mask in edge_masks: + op_applied_edge_mask_list.append(torch.clamp(torch.max(edge_mask, dim=0, keepdim=True), max=1.0)) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + elif self.hyperparams["edge_state_combine_method"] == "mean": + # Used in NLP, e.g. mean over word embeddings in a sentence. + op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.mean(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams["dynamic_edges"] == "yes": + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_mask in edge_masks: + op_applied_edge_mask_list.append(torch.clamp(torch.mean(edge_mask, dim=0, keepdim=True), max=1.0)) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + joint_history = torch.cat([combined_neighbors, torch.unsqueeze(inputs_st[self.node], dim=0)], dim=-1) + + if edge_type + "/edge_encoder" not in self.curr_hidden_states: + outputs, self.curr_hidden_states[edge_type + "/edge_encoder"] = self.node_modules[ + edge_type + "/edge_encoder" + ](joint_history) + else: + outputs, self.curr_hidden_states[edge_type + "/edge_encoder"] = self.node_modules[ + edge_type + "/edge_encoder" + ](joint_history, self.curr_hidden_states[edge_type + "/edge_encoder"]) + + if self.hyperparams["dynamic_edges"] == "yes": + return outputs[:, 0, :] * combined_edge_masks + else: + return outputs[:, 0, :] # [bs, enc_rnn_dim] + + def encoder_forward(self, inputs, inputs_st, inputs_np, robot_present_and_future=None, maps=None): + # Always predicting with the online model. + mode = ModeKeys.PREDICT + + self.x = self.obtain_encoded_tensors(mode, inputs, inputs_st, inputs_np, robot_present_and_future, maps) + self.n_s_t0 = inputs_st[self.node] + + self.latent.p_dist = self.p_z_x(mode, self.x) + + # robot_future_st is optional here since you can use the same one from encoder_forward, + # but if it's given then we'll re-run that part of the model (if the node is adjacent to the robot). + def decoder_forward( + self, + prediction_horizon, + num_samples, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False, + ): + # Always predicting with the online model. 
+ mode = ModeKeys.PREDICT + + x_nr_t, y_r = None, None + if self.hyperparams["incl_robot_node"] and self.robot is not None and robot_present_and_future is not None: + our_inputs = torch.tensor( + self.node.get(np.array([self.node.last_timestep]), self.state[self.node.type], padding=0.0), + dtype=torch.float, + device=self.device, + ) + robot_present_and_future_st = get_relative_robot_traj( + self.env, self.state, our_inputs, robot_present_and_future, self.node.type, self.robot.type + ) + x_nr_t = robot_present_and_future_st[..., 0, :] + y_r = robot_present_and_future_st[..., 1:, :] + self.x = self.create_encoder_rep(mode, self.TD, x_nr_t, y_r) + self.latent.p_dist = self.p_z_x(mode, self.x) + + # Making sure n_s_t0 has the same batch size as x_nr_t + self.n_s_t0 = self.n_s_t0[[0]].repeat(x_nr_t.size()[0], 1) + + z, num_samples, num_components = self.latent.sample_p( + num_samples, mode, most_likely_z=z_mode, full_dist=full_dist, all_z_sep=all_z_sep + ) + + y_dist, our_sampled_future = self.p_y_xz( + mode, self.x, x_nr_t, y_r, self.n_s_t0, z, prediction_horizon, num_samples, num_components, gmm_mode + ) + + return y_dist, our_sampled_future diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_trajectron.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_trajectron.py new file mode 100644 index 000000000..73de7ed89 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_trajectron.py @@ -0,0 +1,343 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import numpy as np +from collections import Counter +from model.trajectron import Trajectron +from model.online.online_mgcvae import OnlineMultimodalGenerativeCVAE +from model.model_utils import ModeKeys +from environment import RingBuffer, TemporalSceneGraph, SceneGraph, derivative_of + + +class OnlineTrajectron(Trajectron): + def __init__(self, model_registrar, hyperparams, device): + super(OnlineTrajectron, self).__init__( + model_registrar=model_registrar, hyperparams=hyperparams, log_writer=False, device=device + ) + self.node_data = dict() + self.scene_graph = None + self.RING_CAPACITY = ( + max( + len(self.hyperparams["edge_removal_filter"]), + len(self.hyperparams["edge_addition_filter"]), + self.hyperparams["maximum_history_length"], + ) + + 1 + ) + self.rel_states = dict() + self.removed_nodes = Counter() + + def __repr__(self): + return f"OnlineTrajectron(# nodes: {len(self.nodes)}, device: {self.device}, hyperparameters: {str(self.hyperparams)}) " + + def _add_node_model(self, node): + if node in self.nodes: + raise ValueError("%s was already added to this graph!" % str(node)) + + self.nodes.add(node) + self.node_models_dict[node] = OnlineMultimodalGenerativeCVAE( + self.env, node, self.model_registrar, self.hyperparams, self.device + ) + + def update_removed_nodes(self): + for node in list(self.removed_nodes.keys()): + if self.removed_nodes[node] >= len(self.hyperparams["edge_removal_filter"]): + del self.node_data[node] + del self.removed_nodes[node] + + def _remove_node_model(self, node): + if node not in self.nodes: + raise ValueError("%s is not in this graph!" 
% str(node)) + + self.nodes.remove(node) + del self.node_models_dict[node] + + def set_environment(self, env, init_timestep=0): + self.env = env + self.scene_graph = SceneGraph(edge_radius=self.env.attention_radius) + self.nodes.clear() + self.node_data.clear() + self.node_models_dict.clear() + + # Fast-forwarding ourselves to the initial timestep, without running any of the underlying models. + for timestep in range(init_timestep + 1): + self.incremental_forward( + self.env.scenes[0].get_clipped_input_dict(timestep, self.hyperparams["state"]), + maps=None, + run_models=False, + ) + + def incremental_forward( + self, + new_inputs_dict, + maps, + prediction_horizon=0, + num_samples=0, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False, + run_models=True, + ): + # The way this function works is by appending the new datapoints to the + # ends of each of the LSTMs in the graph. Then, we recalculate the + # encoder's output vector h_x and feed that into the decoder to sample new outputs. + mode = ModeKeys.PREDICT + + # No grad since we're predicting always, as evidenced by the line above. + with torch.no_grad(): + for node, new_input in new_inputs_dict.items(): + if node not in self.node_data: + self.node_data[node] = RingBuffer( + capacity=self.RING_CAPACITY, + dtype=(float, sum(len(self.state[node.type][k]) for k in self.state[node.type])), + ) + self.node_data[node].append(new_input) + + if node in self.removed_nodes: + del self.removed_nodes[node] + + # Nodes in self.node_data that aren't in new_inputs_dict were just removed. + newly_removed_nodes = (set(self.node_data.keys()) - set(self.removed_nodes.keys())) - set( + new_inputs_dict.keys() + ) + + # We update self.removed_nodes with the newly removed nodes as well as all existing removed nodes to get + # the time since their last removal increased by one. + self.removed_nodes.update(newly_removed_nodes | set(self.removed_nodes.keys())) + + # For any nodes that are older than the length of the edge_removal_filter, we can safely clear their data. + self.update_removed_nodes() + + # Any remaining removed nodes that aren't yet old enough for data clearing simply have NaNs appended so + # that when it's passed through the LSTMs, the hidden state keeps propagating but the input plays no role + # (the NaNs get converted to zeros later on). + for node in self.removed_nodes: + self.node_data[node].append(np.full((1, self.node_data[node].shape[1]), np.nan)) + + for node in self.node_data: + node.overwrite_data( + self.node_data[node], + None, + forward_in_time_on_next_overwrite=(self.node_data[node].shape[0] == self.RING_CAPACITY), + ) + + temp_scene_dict = {k: v[:, 0:2] for k, v in self.node_data.items()} + if not temp_scene_dict: + new_scene_graph = SceneGraph(edge_radius=self.env.attention_radius) + else: + new_scene_graph = TemporalSceneGraph.create_from_temp_scene_dict( + temp_scene_dict, + self.env.attention_radius, + duration=self.RING_CAPACITY, + edge_addition_filter=self.hyperparams["edge_addition_filter"], + edge_removal_filter=self.hyperparams["edge_removal_filter"], + online=True, + ).to_scene_graph(t=self.RING_CAPACITY - 1) + + if self.hyperparams["dynamic_edges"] == "yes": + new_nodes, removed_nodes, new_neighbors, removed_neighbors = new_scene_graph - self.scene_graph + + # Aside from updating the scene graph, this for loop updates the graph model + # structure of all affected nodes. 
+ not_removed_nodes = [node for node in self.nodes if node not in removed_nodes] + self.scene_graph = new_scene_graph + for node in not_removed_nodes: + self.node_models_dict[node].update_graph(new_scene_graph, new_neighbors, removed_neighbors) + + # These next 2 for loops add or remove entire node models. + for node in new_nodes: + if ( + node.is_robot and self.hyperparams["incl_robot_node"] + ) or node.type not in self.pred_state.keys(): + # Only deal with Models for NodeTypes we want to predict + continue + + self._add_node_model(node) + self.node_models_dict[node].update_graph(new_scene_graph, new_neighbors, removed_neighbors) + + for node in removed_nodes: + if ( + node.is_robot and self.hyperparams["incl_robot_node"] + ) or node.type not in self.pred_state.keys(): + continue + + self._remove_node_model(node) + + # This actually updates the node models with the newly observed data. + if run_models: + inputs = dict() + inputs_st = dict() + inputs_np = dict() + + iter_list = list(self.node_models_dict.keys()) + [ + node for node in new_inputs_dict if node.type not in self.pred_state.keys() + ] + if self.env.scenes[0].robot is not None: + iter_list.append(self.env.scenes[0].robot) + + for node in iter_list: + input_np = node.get(np.array([node.last_timestep, node.last_timestep]), self.state[node.type]) + + _, std = self.env.get_standardize_params(self.state[node.type.name], node.type) + std[0:2] = self.env.attention_radius[(node.type, node.type)] + rel_state = np.zeros_like(input_np) + rel_state[:, 0:2] = input_np[:, 0:2] + input_st = self.env.standardize(input_np, self.state[node.type.name], node.type, mean=rel_state) + self.rel_states[node] = rel_state + + # Converting NaNs to zeros. + input_np[np.isnan(input_np)] = 0 + input_st[np.isnan(input_st)] = 0 + + # Convert to torch tensors + inputs[node] = torch.tensor(input_np, dtype=torch.float, device=self.device) + inputs_st[node] = torch.tensor(input_st, dtype=torch.float, device=self.device) + inputs_np[node] = input_np + + # We want tensors of shape (1, ph + 1, state_dim) where the first 1 is the batch size. + if ( + self.hyperparams["incl_robot_node"] + and self.env.scenes[0].robot is not None + and robot_present_and_future is not None + ): + if len(robot_present_and_future.shape) == 2: + robot_present_and_future = robot_present_and_future[np.newaxis, :] + + assert robot_present_and_future.shape[1] == prediction_horizon + 1 + robot_present_and_future = torch.tensor( + robot_present_and_future, dtype=torch.float, device=self.device + ) + + for node in self.node_models_dict: + self.node_models_dict[node].encoder_forward( + inputs, inputs_st, inputs_np, robot_present_and_future, maps + ) + + # If num_predicted_timesteps or num_samples == 0 then do not run the decoder at all, + # just update the encoder LSTMs. 
+ if prediction_horizon == 0 or num_samples == 0: + return + + return self.sample_model( + prediction_horizon, + num_samples, + robot_present_and_future=robot_present_and_future, + z_mode=z_mode, + gmm_mode=gmm_mode, + full_dist=full_dist, + all_z_sep=all_z_sep, + ) + + def _run_decoder( + self, + node, + num_predicted_timesteps, + num_samples, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False, + ): + model = self.node_models_dict[node] + prediction_dist, predictions_uns = model.decoder_forward( + num_predicted_timesteps, + num_samples, + robot_present_and_future=robot_present_and_future, + z_mode=z_mode, + gmm_mode=gmm_mode, + full_dist=full_dist, + all_z_sep=all_z_sep, + ) + + predictions_np = predictions_uns.cpu().detach().numpy() + + # Return will be of shape (batch_size, num_samples, num_predicted_timesteps, 2) + return prediction_dist, np.transpose(predictions_np, (1, 0, 2, 3)) + + def sample_model( + self, + num_predicted_timesteps, + num_samples, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False, + ): + # Just start from the encoder output (minus the + # robot future) and get num_samples of + # num_predicted_timesteps-length trajectories. + if num_predicted_timesteps == 0 or num_samples == 0: + return + + mode = ModeKeys.PREDICT + + # We want tensors of shape (1, ph + 1, state_dim) where the first 1 is the batch size. + if ( + self.hyperparams["incl_robot_node"] + and self.env.scenes[0].robot is not None + and robot_present_and_future is not None + ): + if len(robot_present_and_future.shape) == 2: + robot_present_and_future = robot_present_and_future[np.newaxis, :] + + assert robot_present_and_future.shape[1] == num_predicted_timesteps + 1 + + # No grad since we're predicting always, as evidenced by the line above. + with torch.no_grad(): + predictions_dict = dict() + prediction_dists = dict() + for node in set(self.nodes) - set(self.removed_nodes.keys()): + if node.is_robot: + continue + + prediction_dists[node], predictions_dict[node] = self._run_decoder( + node, + num_predicted_timesteps, + num_samples, + robot_present_and_future, + z_mode, + gmm_mode, + full_dist, + all_z_sep, + ) + + return prediction_dists, predictions_dict + + def forward( + self, + init_env, + init_timestep, + input_dicts, # After the initial environment + num_predicted_timesteps, + num_samples, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False, + ): + # This is the standard forward prediction function, + # if you have some historical data and just want to + # predict forward some number of timesteps. + + # Setting us back to the initial scene graph we had. + self.set_environment(init_env, init_timestep) + + # Looping through and applying updates to the model. 
+ for i in range(len(input_dicts)): + self.incremental_forward(input_dicts[i]) + + return self.sample_model( + num_predicted_timesteps, + num_samples, + robot_present_and_future=robot_present_and_future, + z_mode=z_mode, + gmm_mode=gmm_mode, + full_dist=full_dist, + all_z_sep=all_z_sep, + ) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/trajectron.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/trajectron.py new file mode 100644 index 000000000..333a6b671 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/trajectron.py @@ -0,0 +1,241 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +import numpy as np +from model.mgcvae import MultimodalGenerativeCVAE +from model.dataset import get_timesteps_data, restore + + +class Trajectron(torch.nn.Module): + def __init__(self, model_registrar, hyperparams, log_writer, device): + super(Trajectron, self).__init__() + self.hyperparams = hyperparams + self.log_writer = log_writer + self.device = device + self.curr_iter = 0 + + self.model_registrar = model_registrar + # self.node_models_dict = dict() + self.node_models_dict = torch.nn.ModuleDict() + self.nodes = set() + + self.env = None + + self.min_ht = self.hyperparams["minimum_history_length"] + self.max_ht = self.hyperparams["maximum_history_length"] + self.ph = self.hyperparams["prediction_horizon"] + self.state = self.hyperparams["state"] + self.state_length = dict() + for state_type in self.state.keys(): + self.state_length[state_type] = int( + np.sum([len(entity_dims) for entity_dims in self.state[state_type].values()]) + ) + self.pred_state = self.hyperparams["pred_state"] + + def eval(self): + super().eval() + for key in self.node_models_dict.keys(): + self.node_models_dict[key].eval() + + def set_environment(self, env): + self.env = env + + self.node_models_dict.clear() + edge_types = env.get_edge_types() + + for node_type in env.NodeType: + # Only add a Model for NodeTypes we want to predict + if node_type in self.pred_state.keys(): + self.node_models_dict[str(node_type)] = MultimodalGenerativeCVAE( + env, + node_type, + self.model_registrar, + self.hyperparams, + self.device, + edge_types, + log_writer=self.log_writer, + ) + + def set_curr_iter(self, curr_iter): + self.curr_iter = curr_iter + for node_str, model in self.node_models_dict.items(): + model.set_curr_iter(curr_iter) + + def set_annealing_params(self): + for node_str, model in self.node_models_dict.items(): + model.set_annealing_params() + + def step_annealers(self, node_type=None): + if node_type is None: + for node_type in self.node_models_dict: + self.node_models_dict[node_type].step_annealers() + else: + self.node_models_dict[node_type].step_annealers() + + def train_loss(self, batch, node_type): + ( + first_history_index, + x_t, + y_t, + x_st_t, + y_st_t, + neighbors_data_st, + neighbors_edge_value, + robot_traj_st_t, + map, + ) = batch + + x = x_t.to(self.device) + y = y_t.to(self.device) + x_st_t = x_st_t.to(self.device) + y_st_t = y_st_t.to(self.device) + if robot_traj_st_t is not None: + robot_traj_st_t = robot_traj_st_t.to(self.device) + if type(map) == torch.Tensor: + map = map.to(self.device) + + # Run forward pass + model = self.node_models_dict[node_type] + loss = model.train_loss( + inputs=x, + inputs_st=x_st_t, + first_history_indices=first_history_index, + labels=y, + labels_st=y_st_t, + neighbors=restore(neighbors_data_st), + 
neighbors_edge_value=restore(neighbors_edge_value), + robot=robot_traj_st_t, + map=map, + prediction_horizon=self.ph, + ) + + return loss + + def eval_loss(self, batch, node_type): + ( + first_history_index, + x_t, + y_t, + x_st_t, + y_st_t, + neighbors_data_st, + neighbors_edge_value, + robot_traj_st_t, + map, + ) = batch + + x = x_t.to(self.device) + y = y_t.to(self.device) + x_st_t = x_st_t.to(self.device) + y_st_t = y_st_t.to(self.device) + if robot_traj_st_t is not None: + robot_traj_st_t = robot_traj_st_t.to(self.device) + if type(map) == torch.Tensor: + map = map.to(self.device) + + # Run forward pass + model = self.node_models_dict[node_type] + nll = model.eval_loss( + inputs=x, + inputs_st=x_st_t, + first_history_indices=first_history_index, + labels=y, + labels_st=y_st_t, + neighbors=restore(neighbors_data_st), + neighbors_edge_value=restore(neighbors_edge_value), + robot=robot_traj_st_t, + map=map, + prediction_horizon=self.ph, + ) + + return nll.cpu().detach().numpy() + + def predict( + self, + scene, + timesteps, + ph, + num_samples=1, + min_future_timesteps=0, + min_history_timesteps=1, + z_mode=False, + gmm_mode=False, + full_dist=True, + all_z_sep=False, + ): + + predictions_dict = {} + for node_type in self.env.NodeType: + if node_type not in self.pred_state: + continue + + model = self.node_models_dict[node_type] + + # Get Input data for node type and given timesteps + batch = get_timesteps_data( + env=self.env, + scene=scene, + t=timesteps, + node_type=node_type, + state=self.state, + pred_state=self.pred_state, + edge_types=model.edge_types, + min_ht=min_history_timesteps, + max_ht=self.max_ht, + min_ft=min_future_timesteps, + max_ft=min_future_timesteps, + hyperparams=self.hyperparams, + ) + # There are no nodes of type present for timestep + if batch is None: + continue + ( + ( + first_history_index, + x_t, + y_t, + x_st_t, + y_st_t, + neighbors_data_st, + neighbors_edge_value, + robot_traj_st_t, + map, + ), + nodes, + timesteps_o, + ) = batch + + x = x_t.to(self.device) + x_st_t = x_st_t.to(self.device) + if robot_traj_st_t is not None: + robot_traj_st_t = robot_traj_st_t.to(self.device) + if type(map) == torch.Tensor: + map = map.to(self.device) + + # Run forward pass + predictions = model.predict( + inputs=x, + inputs_st=x_st_t, + first_history_indices=first_history_index, + neighbors=neighbors_data_st, + neighbors_edge_value=neighbors_edge_value, + robot=robot_traj_st_t, + map=map, + prediction_horizon=ph, + num_samples=num_samples, + z_mode=z_mode, + gmm_mode=gmm_mode, + full_dist=full_dist, + all_z_sep=all_z_sep, + ) + + predictions_np = predictions.cpu().detach().numpy() + + # Assign predictions to node + for i, ts in enumerate(timesteps_o): + if ts not in predictions_dict.keys(): + predictions_dict[ts] = dict() + predictions_dict[ts][nodes[i]] = np.transpose(predictions_np[:, [i]], (1, 0, 2, 3)) + + return predictions_dict diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model_dir/config.json b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model_dir/config.json new file mode 100644 index 000000000..bf417f081 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model_dir/config.json @@ -0,0 +1 @@ +{"batch_size": 256, "grad_clip": 1.0, "learning_rate_style": "exp", "learning_rate": 0.001, "min_learning_rate": 1e-05, "learning_decay_rate": 0.9999, "prediction_horizon": 12, "minimum_history_length": 1, "maximum_history_length": 8, "map_encoder": {"PEDESTRIAN": {"heading_state_index": 5, "patch_size": 
[50, 10, 50, 90], "map_channels": 3, "hidden_channels": [10, 20, 10, 1], "output_size": 32, "masks": [5, 5, 5, 5], "strides": [1, 1, 1, 1], "dropout": 0.5}}, "k": 1, "k_eval": 1, "kl_min": 0.07, "kl_weight": 100.0, "kl_weight_start": 0, "kl_decay_rate": 0.99995, "kl_crossover": 400, "kl_sigmoid_divisor": 4, "rnn_kwargs": {"dropout_keep_prob": 0.75}, "MLP_dropout_keep_prob": 0.9, "enc_rnn_dim_edge": 32, "enc_rnn_dim_edge_influence": 32, "enc_rnn_dim_history": 32, "enc_rnn_dim_future": 32, "dec_rnn_dim": 128, "q_z_xy_MLP_dims": null, "p_z_x_MLP_dims": 32, "GMM_components": 1, "log_p_yt_xz_max": 6, "N": 1, "K": 25, "tau_init": 2.0, "tau_final": 0.05, "tau_decay_rate": 0.997, "use_z_logit_clipping": true, "z_logit_clip_start": 0.05, "z_logit_clip_final": 5.0, "z_logit_clip_crossover": 300, "z_logit_clip_divisor": 5, "dynamic": {"PEDESTRIAN": {"name": "SingleIntegrator", "distribution": true, "limits": {}}}, "state": {"PEDESTRIAN": {"position": ["x", "y"], "velocity": ["x", "y"], "acceleration": ["x", "y"]}}, "pred_state": {"PEDESTRIAN": {"position": ["x", "y"]}}, "log_histograms": false, "scene_freq_mult_eval": false, "node_freq_mult_eval": false, "edge_encoding": false, "incl_robot_node": false, "use_map_encoding": false} diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/test_data_structures.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/test_data_structures.py new file mode 100644 index 000000000..e840fda99 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/test_data_structures.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +import pandas as pd +from data import SingleHeaderNumpyArray, DoubleHeaderNumpyArray + + +def test_single_header_numpy_array(): + x = np.random.rand(10) + y = np.random.rand(10) + + array = SingleHeaderNumpyArray(np.stack((x, y), axis=-1), ["x", "y"]) + + assert (array[:, "x"] == x).all() + assert (array[:, "y"] == y).all() + assert (array[3:7, "y"] == y[3:7]).all() + assert (array.x == x).all() + assert (array.y == y).all() + + +def test_double_header_numpy_array(): + x = np.random.rand(10) + y = np.random.rand(10) + vx = np.random.rand(10) + vy = np.random.rand(10) + + data_dict = {("position", "x"): x, ("position", "y"): y, ("velocity", "x"): vx, ("velocity", "y"): vy} + + data_columns = pd.MultiIndex.from_product([["position", "velocity"], ["x", "y"]]) + + node_data = pd.DataFrame(data_dict, columns=data_columns) + + array = DoubleHeaderNumpyArray(node_data.values, list(node_data.columns)) + + test_header_dict = {"position": ["x", "y"], "velocity": ["y"]} + + assert (array[:, ("position", "x")] == x).all() + assert (array[:, ("velocity", "y")] == vy).all() + assert (array[4:7, ("velocity", "y")] == vy[4:7]).all() + assert (array[:, [("position", "x"), ("velocity", "y")]] == np.stack((x, vy), axis=-1)).all() + assert (array[:, [("position", "y"), ("velocity", "x")]] == np.stack((y, vx), axis=-1)).all() + assert (array[2:6, [("position", "y"), ("velocity", "x")]] == np.stack((y, vx), axis=-1)[2:6]).all() + assert (array[:, test_header_dict] == np.stack((x, y, vy), axis=-1)).all() + assert (array[1:8, test_header_dict] == np.stack((x, y, vy), axis=-1)[1:8]).all() + assert (array.position.x 
== x).all()
diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/test_online.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test_online.py
new file mode 100644
index 000000000..123b8e87e
--- /dev/null
+++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test_online.py
@@ -0,0 +1,252 @@
+# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+import os
+import time
+import json
+import torch
+import dill
+import random
+import pathlib
+import evaluation
+import numpy as np
+import visualization as vis
+from argument_parser import args
+from model.online.online_trajectron import OnlineTrajectron
+from model.model_registrar import ModelRegistrar
+from environment import Environment, Scene
+import matplotlib.pyplot as plt
+
+if not torch.cuda.is_available() or args.device == "cpu":
+    args.device = torch.device("cpu")
+else:
+    if torch.cuda.device_count() == 1:
+        # If you have CUDA_VISIBLE_DEVICES set, which you should,
+        # then this will prevent leftover flag arguments from
+        # messing with the device allocation.
+        args.device = "cuda:0"
+
+    args.device = torch.device(args.device)
+
+if args.eval_device is None:
+    args.eval_device = "cpu"
+
+if args.seed is not None:
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def create_online_env(env, hyperparams, scene_idx, init_timestep):
+    test_scene = env.scenes[scene_idx]
+
+    online_scene = Scene(timesteps=init_timestep + 1, map=test_scene.map, dt=test_scene.dt)
+    online_scene.nodes = test_scene.get_nodes_clipped_at_time(
+        timesteps=np.arange(init_timestep - hyperparams["maximum_history_length"], init_timestep + 1),
+        state=hyperparams["state"],
+    )
+    online_scene.robot = test_scene.robot
+    online_scene.calculate_scene_graph(
+        attention_radius=env.attention_radius,
+        edge_addition_filter=hyperparams["edge_addition_filter"],
+        edge_removal_filter=hyperparams["edge_removal_filter"],
+    )
+
+    return Environment(
+        node_type_list=env.node_type_list,
+        standardization=env.standardization,
+        scenes=[online_scene],
+        attention_radius=env.attention_radius,
+        robot_type=env.robot_type,
+    )
+
+
+def get_maps_for_input(input_dict, scene, hyperparams):
+    scene_maps = list()
+    scene_pts = list()
+    heading_angles = list()
+    patch_sizes = list()
+    nodes_with_maps = list()
+    for node in input_dict:
+        if node.type in hyperparams["map_encoder"]:
+            x = input_dict[node]
+            me_hyp = hyperparams["map_encoder"][node.type]
+            if "heading_state_index" in me_hyp:
+                heading_state_index = me_hyp["heading_state_index"]
+                # We have to rotate the map in the opposite direction of the agent to match them
+                if type(heading_state_index) is list:  # infer from velocity or heading vector
+                    heading_angle = (
+                        -np.arctan2(x[-1, heading_state_index[1]], x[-1, heading_state_index[0]]) * 180 / np.pi
+                    )
+                else:
+                    heading_angle = -x[-1, heading_state_index] * 180 / np.pi
+            else:
+                heading_angle = None
+
+            scene_map = scene.map[node.type]
+            map_point = x[-1, :2]
+
+            patch_size = hyperparams["map_encoder"][node.type]["patch_size"]
+
+            scene_maps.append(scene_map)
+            scene_pts.append(map_point)
+            heading_angles.append(heading_angle)
+            patch_sizes.append(patch_size)
+            nodes_with_maps.append(node)
+
+    if heading_angles[0] is None:
+        heading_angles = None
+    else:
+        heading_angles = torch.Tensor(heading_angles)
+
+    maps = scene_maps[0].get_cropped_maps_from_scene_map_batch(
+        scene_maps,
scene_pts=torch.Tensor(scene_pts), patch_size=patch_sizes[0], rotation=heading_angles + ) + + maps_dict = {node: maps[[i]] for i, node in enumerate(nodes_with_maps)} + return maps_dict + + +def main(): + # Choose one of the model directory names under the experiment/*/models folders. + # Possibilities are 'vel_ee', 'int_ee', 'int_ee_me', or 'robot' + model_dir = os.path.join(args.log_dir, "int_ee") + + # Load hyperparameters from json + config_file = os.path.join(model_dir, args.conf) + if not os.path.exists(config_file): + raise ValueError("Config json not found!") + with open(config_file, "r") as conf_json: + hyperparams = json.load(conf_json) + + # Add hyperparams from arguments + hyperparams["dynamic_edges"] = args.dynamic_edges + hyperparams["edge_state_combine_method"] = args.edge_state_combine_method + hyperparams["edge_influence_combine_method"] = args.edge_influence_combine_method + hyperparams["edge_addition_filter"] = args.edge_addition_filter + hyperparams["edge_removal_filter"] = args.edge_removal_filter + hyperparams["batch_size"] = args.batch_size + hyperparams["k_eval"] = args.k_eval + hyperparams["offline_scene_graph"] = args.offline_scene_graph + hyperparams["incl_robot_node"] = args.incl_robot_node + hyperparams["edge_encoding"] = not args.no_edge_encoding + hyperparams["use_map_encoding"] = args.map_encoding + + output_save_dir = os.path.join(model_dir, "pred_figs") + pathlib.Path(output_save_dir).mkdir(parents=True, exist_ok=True) + + eval_data_path = os.path.join(args.data_dir, args.eval_data_dict) + with open(eval_data_path, "rb") as f: + eval_env = dill.load(f, encoding="latin1") + + if eval_env.robot_type is None and hyperparams["incl_robot_node"]: + eval_env.robot_type = eval_env.NodeType[0] # TODO: Make more general, allow the user to specify? + for scene in eval_env.scenes: + scene.add_robot_from_nodes(eval_env.robot_type) + + print("Loaded data from %s" % (eval_data_path,)) + + # Creating a dummy environment with a single scene that contains information about the world. + # When using this code, feel free to use whichever scene index or initial timestep you wish. + scene_idx = 0 + + # You need to have at least acceleration, so you want 2 timesteps of prior data, e.g. [0, 1], + # so that you can immediately start incremental inference from the 3rd timestep onwards. + init_timestep = 1 + + eval_scene = eval_env.scenes[scene_idx] + online_env = create_online_env(eval_env, hyperparams, scene_idx, init_timestep) + + model_registrar = ModelRegistrar(model_dir, args.eval_device) + model_registrar.load_models(iter_num=12) + + trajectron = OnlineTrajectron(model_registrar, hyperparams, args.eval_device) + + # If you want to see what different robot futures do to the predictions, uncomment this line as well as + # related "... += adjustment" lines below. + # adjustment = np.stack([np.arange(13)/float(i*2.0) for i in range(6, 12)], axis=1) + + # Here's how you'd incrementally run the model, e.g. with streaming data. 
+ trajectron.set_environment(online_env, init_timestep) + + for timestep in range(init_timestep + 1, eval_scene.timesteps): + input_dict = eval_scene.get_clipped_input_dict(timestep, hyperparams["state"]) + + maps = None + if hyperparams["use_map_encoding"]: + maps = get_maps_for_input(input_dict, eval_scene, hyperparams) + + robot_present_and_future = None + if eval_scene.robot is not None and hyperparams["incl_robot_node"]: + robot_present_and_future = eval_scene.robot.get( + np.array([timestep, timestep + hyperparams["prediction_horizon"]]), + hyperparams["state"][eval_scene.robot.type], + padding=0.0, + ) + robot_present_and_future = np.stack([robot_present_and_future, robot_present_and_future], axis=0) + # robot_present_and_future += adjustment + + start = time.time() + dists, preds = trajectron.incremental_forward( + input_dict, + maps, + prediction_horizon=6, + num_samples=1, + robot_present_and_future=robot_present_and_future, + full_dist=True, + ) + end = time.time() + print( + "t=%d: took %.2f s (= %.2f Hz) w/ %d nodes and %d edges" + % ( + timestep, + end - start, + 1.0 / (end - start), + len(trajectron.nodes), + trajectron.scene_graph.get_num_edges(), + ) + ) + + detailed_preds_dict = dict() + for node in eval_scene.nodes: + if node in preds: + detailed_preds_dict[node] = preds[node] + + fig, ax = plt.subplots() + vis.visualize_distribution(ax, dists) + vis.visualize_prediction( + ax, + {timestep: preds}, + eval_scene.dt, + hyperparams["maximum_history_length"], + hyperparams["prediction_horizon"], + ) + + if eval_scene.robot is not None and hyperparams["incl_robot_node"]: + robot_for_plotting = eval_scene.robot.get( + np.array([timestep, timestep + hyperparams["prediction_horizon"]]), + hyperparams["state"][eval_scene.robot.type], + ) + # robot_for_plotting += adjustment + + ax.plot(robot_for_plotting[1:, 1], robot_for_plotting[1:, 0], color="r", linewidth=1.0, alpha=1.0) + + # Current Node Position + circle = plt.Circle( + (robot_for_plotting[0, 1], robot_for_plotting[0, 0]), + 0.3, + facecolor="r", + edgecolor="k", + lw=0.5, + zorder=3, + ) + ax.add_artist(circle) + + fig.savefig(os.path.join(output_save_dir, f"pred_{timestep}.pdf"), dpi=300) + plt.close(fig) + + +if __name__ == "__main__": + main() diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/train.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/train.py new file mode 100644 index 000000000..274e3e0de --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/train.py @@ -0,0 +1,452 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import torch +from torch import nn, optim, utils +import numpy as np +import os +import time +import dill +import json +import random +import pathlib +import warnings +from tqdm import tqdm +import visualization +import evaluation +import matplotlib.pyplot as plt +from argument_parser import args +from model.trajectron import Trajectron +from model.model_registrar import ModelRegistrar +from model.model_utils import cyclical_lr +from model.dataset import EnvironmentDataset, collate +from tensorboardX import SummaryWriter + +# torch.autograd.set_detect_anomaly(True) + +if not torch.cuda.is_available() or args.device == "cpu": + args.device = torch.device("cpu") +else: + if torch.cuda.device_count() == 1: + # If you have CUDA_VISIBLE_DEVICES set, which you should, + # then this will prevent leftover flag arguments from + # messing with the device allocation. 
+        args.device = "cuda:0"
+
+    args.device = torch.device(args.device)
+
+if args.eval_device is None:
+    args.eval_device = torch.device("cpu")
+
+# This is needed for memory pinning using a DataLoader (otherwise memory is pinned to cuda:0 by default)
+torch.cuda.set_device(args.device)
+
+if args.seed is not None:
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def main():
+    # Load hyperparameters from json
+    if not os.path.exists(args.conf):
+        raise ValueError("Config json not found!")
+    with open(args.conf, "r", encoding="utf-8") as conf_json:
+        hyperparams = json.load(conf_json)
+
+    # Add hyperparams from arguments
+    hyperparams["dynamic_edges"] = args.dynamic_edges
+    hyperparams["edge_state_combine_method"] = args.edge_state_combine_method
+    hyperparams["edge_influence_combine_method"] = args.edge_influence_combine_method
+    hyperparams["edge_addition_filter"] = args.edge_addition_filter
+    hyperparams["edge_removal_filter"] = args.edge_removal_filter
+    hyperparams["batch_size"] = args.batch_size
+    hyperparams["k_eval"] = args.k_eval
+    hyperparams["offline_scene_graph"] = args.offline_scene_graph
+    hyperparams["incl_robot_node"] = args.incl_robot_node
+    hyperparams["node_freq_mult_train"] = args.node_freq_mult_train
+    hyperparams["node_freq_mult_eval"] = args.node_freq_mult_eval
+    hyperparams["scene_freq_mult_train"] = args.scene_freq_mult_train
+    hyperparams["scene_freq_mult_eval"] = args.scene_freq_mult_eval
+    hyperparams["scene_freq_mult_viz"] = args.scene_freq_mult_viz
+    hyperparams["edge_encoding"] = not args.no_edge_encoding
+    hyperparams["use_map_encoding"] = args.map_encoding
+    hyperparams["augment"] = args.augment
+    hyperparams["override_attention_radius"] = args.override_attention_radius
+
+    print("-----------------------")
+    print("| TRAINING PARAMETERS |")
+    print("-----------------------")
+    print("| batch_size: %d" % args.batch_size)
+    print("| device: %s" % args.device)
+    print("| eval_device: %s" % args.eval_device)
+    print("| Offline Scene Graph Calculation: %s" % args.offline_scene_graph)
+    print("| EE state_combine_method: %s" % args.edge_state_combine_method)
+    print("| EIE scheme: %s" % args.edge_influence_combine_method)
+    print("| dynamic_edges: %s" % args.dynamic_edges)
+    print("| robot node: %s" % args.incl_robot_node)
+    print("| edge_addition_filter: %s" % args.edge_addition_filter)
+    print("| edge_removal_filter: %s" % args.edge_removal_filter)
+    print("| MHL: %s" % hyperparams["minimum_history_length"])
+    print("| PH: %s" % hyperparams["prediction_horizon"])
+    print("-----------------------")
+
+    log_writer = None
+    model_dir = None
+    if not args.debug:
+        # Create the log and model directory if they're not present.
+        model_dir = os.path.join(
+            args.log_dir, "models_" + time.strftime("%d_%b_%Y_%H_%M_%S", time.localtime()) + args.log_tag
+        )
+        pathlib.Path(model_dir).mkdir(parents=True, exist_ok=True)
+
+        # Save config to model directory
+        with open(os.path.join(model_dir, "config.json"), "w") as conf_json:
+            json.dump(hyperparams, conf_json)
+
+        log_writer = SummaryWriter(log_dir=model_dir)
+
+    # Load training and evaluation environments and scenes
+    train_scenes = []
+    train_data_path = os.path.join(args.data_dir, args.train_data_dict)
+    with open(train_data_path, "rb") as f:
+        train_env = dill.load(f, encoding="latin1")
+
+    for attention_radius_override in args.override_attention_radius:
+        node_type1, node_type2, attention_radius = attention_radius_override.split(" ")
+        train_env.attention_radius[(node_type1, node_type2)] = float(attention_radius)
+
+    if train_env.robot_type is None and hyperparams["incl_robot_node"]:
+        train_env.robot_type = train_env.NodeType[0]  # TODO: Make more general, allow the user to specify?
+        for scene in train_env.scenes:
+            scene.add_robot_from_nodes(train_env.robot_type)
+
+    train_scenes = train_env.scenes
+    train_scenes_sample_probs = train_env.scenes_freq_mult_prop if args.scene_freq_mult_train else None
+
+    train_dataset = EnvironmentDataset(
+        train_env,
+        hyperparams["state"],
+        hyperparams["pred_state"],
+        scene_freq_mult=hyperparams["scene_freq_mult_train"],
+        node_freq_mult=hyperparams["node_freq_mult_train"],
+        hyperparams=hyperparams,
+        min_history_timesteps=hyperparams["minimum_history_length"],
+        min_future_timesteps=hyperparams["prediction_horizon"],
+        return_robot=not args.incl_robot_node,
+    )
+    train_data_loader = dict()
+    for node_type_data_set in train_dataset:
+        if len(node_type_data_set) == 0:
+            continue
+
+        node_type_dataloader = utils.data.DataLoader(
+            node_type_data_set,
+            collate_fn=collate,
+            pin_memory=False if str(args.device) == "cpu" else True,
+            batch_size=args.batch_size,
+            shuffle=True,
+            num_workers=args.preprocess_workers,
+        )
+        train_data_loader[node_type_data_set.node_type] = node_type_dataloader
+
+    print(f"Loaded training data from {train_data_path}")
+
+    eval_scenes = []
+    eval_scenes_sample_probs = None
+    if args.eval_every is not None:
+        eval_data_path = os.path.join(args.data_dir, args.eval_data_dict)
+        with open(eval_data_path, "rb") as f:
+            eval_env = dill.load(f, encoding="latin1")
+
+        for attention_radius_override in args.override_attention_radius:
+            node_type1, node_type2, attention_radius = attention_radius_override.split(" ")
+            eval_env.attention_radius[(node_type1, node_type2)] = float(attention_radius)
+
+        if eval_env.robot_type is None and hyperparams["incl_robot_node"]:
+            eval_env.robot_type = eval_env.NodeType[0]  # TODO: Make more general, allow the user to specify?
+            for scene in eval_env.scenes:
+                scene.add_robot_from_nodes(eval_env.robot_type)
+
+        eval_scenes = eval_env.scenes
+        eval_scenes_sample_probs = eval_env.scenes_freq_mult_prop if args.scene_freq_mult_eval else None
+
+        eval_dataset = EnvironmentDataset(
+            eval_env,
+            hyperparams["state"],
+            hyperparams["pred_state"],
+            scene_freq_mult=hyperparams["scene_freq_mult_eval"],
+            node_freq_mult=hyperparams["node_freq_mult_eval"],
+            hyperparams=hyperparams,
+            min_history_timesteps=hyperparams["minimum_history_length"],
+            min_future_timesteps=hyperparams["prediction_horizon"],
+            return_robot=not args.incl_robot_node,
+        )
+        eval_data_loader = dict()
+        for node_type_data_set in eval_dataset:
+            if len(node_type_data_set) == 0:
+                continue
+
+            node_type_dataloader = utils.data.DataLoader(
+                node_type_data_set,
+                collate_fn=collate,
+                pin_memory=False if str(args.eval_device) == "cpu" else True,
+                batch_size=args.eval_batch_size,
+                shuffle=True,
+                num_workers=args.preprocess_workers,
+            )
+            eval_data_loader[node_type_data_set.node_type] = node_type_dataloader
+
+        print(f"Loaded evaluation data from {eval_data_path}")
+
+    # Offline Calculate Scene Graph
+    if hyperparams["offline_scene_graph"] == "yes":
+        print(f"Offline calculating scene graphs")
+        for i, scene in enumerate(train_scenes):
+            scene.calculate_scene_graph(
+                train_env.attention_radius, hyperparams["edge_addition_filter"], hyperparams["edge_removal_filter"]
+            )
+            print(f"Created Scene Graph for Training Scene {i}")
+
+        for i, scene in enumerate(eval_scenes):
+            scene.calculate_scene_graph(
+                eval_env.attention_radius, hyperparams["edge_addition_filter"], hyperparams["edge_removal_filter"]
+            )
+            print(f"Created Scene Graph for Evaluation Scene {i}")
+
+    model_registrar = ModelRegistrar(model_dir, args.device)
+
+    trajectron = Trajectron(model_registrar, hyperparams, log_writer, args.device)
+
+    trajectron.set_environment(train_env)
+    trajectron.set_annealing_params()
+    print("Created Training Model.")
+
+    eval_trajectron = None
+    if args.eval_every is not None or args.vis_every is not None:
+        eval_trajectron = Trajectron(model_registrar, hyperparams, log_writer, args.eval_device)
+        eval_trajectron.set_environment(eval_env)
+        eval_trajectron.set_annealing_params()
+    print("Created Evaluation Model.")
+
+    optimizer = dict()
+    lr_scheduler = dict()
+    for node_type in train_env.NodeType:
+        if node_type not in hyperparams["pred_state"]:
+            continue
+        optimizer[node_type] = optim.Adam(
+            [
+                {"params": model_registrar.get_all_but_name_match("map_encoder").parameters()},
+                {"params": model_registrar.get_name_match("map_encoder").parameters(), "lr": 0.0008},
+            ],
+            lr=hyperparams["learning_rate"],
+        )
+        # Set Learning Rate
+        if hyperparams["learning_rate_style"] == "const":
+            lr_scheduler[node_type] = optim.lr_scheduler.ExponentialLR(optimizer[node_type], gamma=1.0)
+        elif hyperparams["learning_rate_style"] == "exp":
+            lr_scheduler[node_type] = optim.lr_scheduler.ExponentialLR(
+                optimizer[node_type], gamma=hyperparams["learning_decay_rate"]
+            )
+
+    #################################
+    #           TRAINING            #
+    #################################
+    curr_iter_node_type = {node_type: 0 for node_type in train_data_loader.keys()}
+    for epoch in range(1, args.train_epochs + 1):
+        model_registrar.to(args.device)
+        train_dataset.augment = args.augment
+        for node_type, data_loader in train_data_loader.items():
+            curr_iter = curr_iter_node_type[node_type]
+            pbar = tqdm(data_loader, ncols=80)
+            for batch in pbar:
+                trajectron.set_curr_iter(curr_iter)
trajectron.step_annealers(node_type) + optimizer[node_type].zero_grad() + train_loss = trajectron.train_loss(batch, node_type) + pbar.set_description(f"Epoch {epoch}, {node_type} L: {train_loss.item():.2f}") + train_loss.backward() + # Clipping gradients. + if hyperparams["grad_clip"] is not None: + nn.utils.clip_grad_value_(model_registrar.parameters(), hyperparams["grad_clip"]) + optimizer[node_type].step() + + # Stepping forward the learning rate scheduler and annealers. + lr_scheduler[node_type].step() + + if not args.debug: + log_writer.add_scalar( + f"{node_type}/train/learning_rate", lr_scheduler[node_type].get_lr()[0], curr_iter + ) + log_writer.add_scalar(f"{node_type}/train/loss", train_loss, curr_iter) + + curr_iter += 1 + curr_iter_node_type[node_type] = curr_iter + train_dataset.augment = False + if args.eval_every is not None or args.vis_every is not None: + eval_trajectron.set_curr_iter(epoch) + + ################################# + # VISUALIZATION # + ################################# + if args.vis_every is not None and not args.debug and epoch % args.vis_every == 0 and epoch > 0: + max_hl = hyperparams["maximum_history_length"] + ph = hyperparams["prediction_horizon"] + with torch.no_grad(): + # Predict random timestep to plot for train data set + if args.scene_freq_mult_viz: + scene = np.random.choice(train_scenes, p=train_scenes_sample_probs) + else: + scene = np.random.choice(train_scenes) + timestep = scene.sample_timesteps(1, min_future_timesteps=ph) + predictions = trajectron.predict( + scene, + timestep, + ph, + min_future_timesteps=ph, + z_mode=True, + gmm_mode=True, + all_z_sep=False, + full_dist=False, + ) + + # Plot predicted timestep for random scene + fig, ax = plt.subplots(figsize=(10, 10)) + visualization.visualize_prediction( + ax, + predictions, + scene.dt, + max_hl=max_hl, + ph=ph, + map=scene.map["VISUALIZATION"] if scene.map is not None else None, + ) + ax.set_title(f"{scene.name}-t: {timestep}") + log_writer.add_figure("train/prediction", fig, epoch) + + model_registrar.to(args.eval_device) + # Predict random timestep to plot for eval data set + if args.scene_freq_mult_viz: + scene = np.random.choice(eval_scenes, p=eval_scenes_sample_probs) + else: + scene = np.random.choice(eval_scenes) + timestep = scene.sample_timesteps(1, min_future_timesteps=ph) + predictions = eval_trajectron.predict( + scene, timestep, ph, num_samples=20, min_future_timesteps=ph, z_mode=False, full_dist=False + ) + + # Plot predicted timestep for random scene + fig, ax = plt.subplots(figsize=(10, 10)) + visualization.visualize_prediction( + ax, + predictions, + scene.dt, + max_hl=max_hl, + ph=ph, + map=scene.map["VISUALIZATION"] if scene.map is not None else None, + ) + ax.set_title(f"{scene.name}-t: {timestep}") + log_writer.add_figure("eval/prediction", fig, epoch) + + # Predict random timestep to plot for eval data set + predictions = eval_trajectron.predict( + scene, + timestep, + ph, + min_future_timesteps=ph, + z_mode=True, + gmm_mode=True, + all_z_sep=True, + full_dist=False, + ) + + # Plot predicted timestep for random scene + fig, ax = plt.subplots(figsize=(10, 10)) + visualization.visualize_prediction( + ax, + predictions, + scene.dt, + max_hl=max_hl, + ph=ph, + map=scene.map["VISUALIZATION"] if scene.map is not None else None, + ) + ax.set_title(f"{scene.name}-t: {timestep}") + log_writer.add_figure("eval/prediction_all_z", fig, epoch) + + ################################# + # EVALUATION # + ################################# + if args.eval_every is not None and not 
args.debug and epoch % args.eval_every == 0 and epoch > 0: + max_hl = hyperparams["maximum_history_length"] + ph = hyperparams["prediction_horizon"] + model_registrar.to(args.eval_device) + with torch.no_grad(): + # Calculate evaluation loss + for node_type, data_loader in eval_data_loader.items(): + eval_loss = [] + print(f"Starting Evaluation @ epoch {epoch} for node type: {node_type}") + pbar = tqdm(data_loader, ncols=80) + for batch in pbar: + eval_loss_node_type = eval_trajectron.eval_loss(batch, node_type) + pbar.set_description(f"Epoch {epoch}, {node_type} L: {eval_loss_node_type.item():.2f}") + eval_loss.append({node_type: {"nll": [eval_loss_node_type]}}) + del batch + + evaluation.log_batch_errors(eval_loss, log_writer, f"{node_type}/eval_loss", epoch) + + # Predict batch timesteps for evaluation dataset evaluation + eval_batch_errors = [] + for scene in tqdm(eval_scenes, desc="Sample Evaluation", ncols=80): + timesteps = scene.sample_timesteps(args.eval_batch_size) + + predictions = eval_trajectron.predict( + scene, timesteps, ph, num_samples=50, min_future_timesteps=ph, full_dist=False + ) + + eval_batch_errors.append( + evaluation.compute_batch_statistics( + predictions, scene.dt, max_hl=max_hl, ph=ph, node_type_enum=eval_env.NodeType, map=scene.map + ) + ) + + evaluation.log_batch_errors( + eval_batch_errors, log_writer, "eval", epoch, bar_plot=["kde"], box_plot=["ade", "fde"] + ) + + # Predict maximum likelihood batch timesteps for evaluation dataset evaluation + eval_batch_errors_ml = [] + for scene in tqdm(eval_scenes, desc="MM Evaluation", ncols=80): + timesteps = scene.sample_timesteps(scene.timesteps) + + predictions = eval_trajectron.predict( + scene, + timesteps, + ph, + num_samples=1, + min_future_timesteps=ph, + z_mode=True, + gmm_mode=True, + full_dist=False, + ) + + eval_batch_errors_ml.append( + evaluation.compute_batch_statistics( + predictions, + scene.dt, + max_hl=max_hl, + ph=ph, + map=scene.map, + node_type_enum=eval_env.NodeType, + kde=False, + ) + ) + + evaluation.log_batch_errors(eval_batch_errors_ml, log_writer, "eval/ml", epoch) + + if args.save_every is not None and args.debug is False and epoch % args.save_every == 0: + model_registrar.save_models(epoch) + + +if __name__ == "__main__": + main() diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/__init__.py new file mode 100644 index 000000000..9200d8dee --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from .trajectory_utils import prediction_output_to_trajectories +from .matrix_utils import block_diag, tile +from .os_utils import maybe_makedirs diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/matrix_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/matrix_utils.py new file mode 100644 index 000000000..cb32abc44 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/matrix_utils.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +import torch + + +def attach_dim(v, n_dim_to_prepend=0, n_dim_to_append=0): + return v.reshape(torch.Size([1] * n_dim_to_prepend) + v.shape + torch.Size([1] * n_dim_to_append)) + + +def block_diag(m): + """ + Make a block diagonal matrix along 
dim=-3 + EXAMPLE: + block_diag(torch.ones(4,3,2)) + should give a 12 x 8 matrix with blocks of 3 x 2 ones. + Prepend batch dimensions if needed. + You can also give a list of matrices. + :type m: torch.Tensor, list + :rtype: torch.Tensor + """ + if type(m) is list: + m = torch.cat([m1.unsqueeze(-3) for m1 in m], -3) + + d = m.dim() + n = m.shape[-3] + siz0 = m.shape[:-3] + siz1 = m.shape[-2:] + m2 = m.unsqueeze(-2) + eye = attach_dim(torch.eye(n, device=m.device).unsqueeze(-2), d - 3, 1) + return (m2 * eye).reshape(siz0 + torch.Size(torch.tensor(siz1) * n)) + + +def tile(a, dim, n_tile, device="cpu"): + init_dim = a.size(dim) + repeat_idx = [1] * a.dim() + repeat_idx[dim] = n_tile + a = a.repeat(*(repeat_idx)) + order_index = torch.LongTensor(np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)])).to( + device + ) + return torch.index_select(a, dim, order_index) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/os_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/os_utils.py new file mode 100644 index 000000000..5acd68d77 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/os_utils.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import os + + +def maybe_makedirs(path_to_create): + """This function will create a directory, unless it exists already, + at which point the function will return. + The exception handling is necessary as it prevents a race condition + from occurring. + Inputs: + path_to_create - A string path to a directory you'd like created. + """ + try: + os.makedirs(path_to_create) + except OSError: + if not os.path.isdir(path_to_create): + raise diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/trajectory_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/trajectory_utils.py new file mode 100644 index 000000000..588151c6e --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/trajectory_utils.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np + + +def prediction_output_to_trajectories(prediction_output_dict, dt, max_h, ph, map=None, prune_ph_to_future=False): + + prediction_timesteps = prediction_output_dict.keys() + + output_dict = dict() + histories_dict = dict() + futures_dict = dict() + + for t in prediction_timesteps: + histories_dict[t] = dict() + output_dict[t] = dict() + futures_dict[t] = dict() + prediction_nodes = prediction_output_dict[t].keys() + for node in prediction_nodes: + predictions_output = prediction_output_dict[t][node] + position_state = {"position": ["x", "y"]} + + history = node.get(np.array([t - max_h, t]), position_state) # History includes current pos + history = history[~np.isnan(history.sum(axis=1))] + + future = node.get(np.array([t + 1, t + ph]), position_state) + future = future[~np.isnan(future.sum(axis=1))] + + if prune_ph_to_future: + predictions_output = predictions_output[:, :, : future.shape[0]] + if predictions_output.shape[2] == 0: + continue + + trajectory = predictions_output + + if map is None: + histories_dict[t][node] = history + output_dict[t][node] = trajectory + futures_dict[t][node] = future + else: + histories_dict[t][node] = map.to_map_points(history) + output_dict[t][node] = map.to_map_points(trajectory) + futures_dict[t][node] = map.to_map_points(future) + + return output_dict, 
histories_dict, futures_dict
diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/__init__.py
new file mode 100644
index 000000000..d8b5b2027
--- /dev/null
+++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/__init__.py
@@ -0,0 +1,5 @@
+# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+from .visualization import visualize_prediction, visualize_distribution
+from .visualization_utils import plot_boxplots
diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization.py
new file mode 100644
index 000000000..5c1547ef1
--- /dev/null
+++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization.py
@@ -0,0 +1,137 @@
+# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+from utils import prediction_output_to_trajectories
+from scipy import linalg
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+import matplotlib.patheffects as pe
+import numpy as np
+import seaborn as sns
+
+
+def plot_trajectories(
+    ax,
+    prediction_dict,
+    histories_dict,
+    futures_dict,
+    line_alpha=0.7,
+    line_width=0.2,
+    edge_width=2,
+    circle_edge_width=0.5,
+    node_circle_size=0.3,
+    batch_num=0,
+    kde=False,
+):
+
+    cmap = ["k", "b", "y", "g", "r"]
+
+    for node in histories_dict:
+        history = histories_dict[node]
+        future = futures_dict[node]
+        predictions = prediction_dict[node]
+
+        if np.isnan(history[-1]).any():
+            continue
+
+        ax.plot(history[:, 0], history[:, 1], "k--")
+
+        for sample_num in range(prediction_dict[node].shape[1]):
+
+            if kde and predictions.shape[1] >= 50:
+                line_alpha = 0.2
+                for t in range(predictions.shape[2]):
+                    sns.kdeplot(
+                        predictions[batch_num, :, t, 0],
+                        predictions[batch_num, :, t, 1],
+                        ax=ax,
+                        shade=True,
+                        shade_lowest=False,
+                        color=np.random.choice(cmap),
+                        alpha=0.8,
+                    )
+
+            ax.plot(
+                predictions[batch_num, sample_num, :, 0],
+                predictions[batch_num, sample_num, :, 1],
+                color=cmap[node.type.value],
+                linewidth=line_width,
+                alpha=line_alpha,
+            )
+
+            ax.plot(
+                future[:, 0],
+                future[:, 1],
+                "w--",
+                path_effects=[pe.Stroke(linewidth=edge_width, foreground="k"), pe.Normal()],
+            )
+
+            # Current Node Position
+            circle = plt.Circle(
+                (history[-1, 0], history[-1, 1]),
+                node_circle_size,
+                facecolor="g",
+                edgecolor="k",
+                lw=circle_edge_width,
+                zorder=3,
+            )
+            ax.add_artist(circle)
+
+    ax.axis("equal")
+
+
+def visualize_prediction(ax, prediction_output_dict, dt, max_hl, ph, robot_node=None, map=None, **kwargs):
+
+    prediction_dict, histories_dict, futures_dict = prediction_output_to_trajectories(
+        prediction_output_dict, dt, max_hl, ph, map=map
+    )
+
+    assert len(prediction_dict.keys()) <= 1
+    if len(prediction_dict.keys()) == 0:
+        return
+    ts_key = list(prediction_dict.keys())[0]
+
+    prediction_dict = prediction_dict[ts_key]
+    histories_dict = histories_dict[ts_key]
+    futures_dict = futures_dict[ts_key]
+
+    if map is not None:
+        ax.imshow(map.as_image(), origin="lower", alpha=0.5)
+    plot_trajectories(ax, prediction_dict, histories_dict, futures_dict, **kwargs)
+
+
+def visualize_distribution(ax, prediction_distribution_dict, map=None, pi_threshold=0.05, **kwargs):
+    if map is not None:
+        ax.imshow(map.as_image(), origin="lower", alpha=0.5)
+
+    for node, pred_dist in
prediction_distribution_dict.items(): + if pred_dist.mus.shape[:2] != (1, 1): + return + + means = pred_dist.mus.squeeze().cpu().numpy() + covs = pred_dist.get_covariance_matrix().squeeze().cpu().numpy() + pis = pred_dist.pis_cat_dist.probs.squeeze().cpu().numpy() + + for timestep in range(means.shape[0]): + for z_val in range(means.shape[1]): + mean = means[timestep, z_val] + covar = covs[timestep, z_val] + pi = pis[timestep, z_val] + + if pi < pi_threshold: + continue + + v, w = linalg.eigh(covar) + v = 2.0 * np.sqrt(2.0) * np.sqrt(v) + u = w[0] / linalg.norm(w[0]) + + # Plot an ellipse to show the Gaussian component + angle = np.arctan(u[1] / u[0]) + angle = 180.0 * angle / np.pi # convert to degrees + ell = patches.Ellipse( + mean, v[0], v[1], 180.0 + angle, color="blue" if node.type.name == "VEHICLE" else "orange" + ) + ell.set_edgecolor(None) + ell.set_clip_box(ax.bbox) + ell.set_alpha(pi / 10) + ax.add_artist(ell) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization_utils.py new file mode 100644 index 000000000..a12b8a2eb --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization_utils.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +import pandas as pd +import seaborn as sns + + +def plot_boxplots(ax, perf_dict_for_pd, x_label, y_label): + perf_df = pd.DataFrame.from_dict(perf_dict_for_pd) + our_mean_color = sns.color_palette("muted")[9] + marker_size = 7 + mean_markers = "X" + with sns.color_palette("muted"): + sns.boxplot(x=x_label, y=y_label, data=perf_df, ax=ax, showfliers=False) + ax.plot( + [0], + [np.mean(perf_df[y_label])], + color=our_mean_color, + marker=mean_markers, + markeredgecolor="#545454", + markersize=marker_size, + zorder=10, + ) + + +def plot_barplots(ax, perf_dict_for_pd, x_label, y_label): + perf_df = pd.DataFrame.from_dict(perf_dict_for_pd) + with sns.color_palette("muted"): + sns.barplot(x=x_label, y=y_label, ax=ax, data=perf_df)
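As a quick sanity check of the plotting helpers added above, the following standalone sketch (not part of the diff) shows one way plot_boxplots could be driven with synthetic error statistics. It assumes the vendored trajectron directory is on sys.path, as the top-level test file arranges, and that pandas, seaborn, and matplotlib are installed; the metric name "ade" and the values are illustrative only.

import sys

sys.path.append("forge/test/models/pytorch/multimodal/trajectron/trajectron/")

import numpy as np
import matplotlib.pyplot as plt

from visualization import plot_boxplots  # re-exported from visualization_utils via visualization/__init__.py

# Synthetic per-sample displacement errors, shaped the way plot_boxplots expects:
# a dict of equal-length columns that pandas can turn into a DataFrame.
perf_dict = {"dataset": ["eth_val"] * 20, "ade": np.random.rand(20).tolist()}

fig, ax = plt.subplots(figsize=(4, 4))
plot_boxplots(ax, perf_dict, x_label="dataset", y_label="ade")
fig.savefig("ade_boxplot.png", dpi=150)
plt.close(fig)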