diff --git a/forge/test/models/pytorch/multimodal/trajectron/test_trajectron.py b/forge/test/models/pytorch/multimodal/trajectron/test_trajectron.py new file mode 100644 index 000000000..a13ffb3d3 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/test_trajectron.py @@ -0,0 +1,225 @@ +# # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# # SPDX-License-Identifier: Apache-2.0 + +import sys +sys.path.append("forge/test/models/pytorch/multimodal/trajectron/trajectron/") +import pytest +import forge +from test.models.pytorch.multimodal.trajectron.trajectron.model import Trajectron +from test.models.pytorch.multimodal.trajectron.trajectron.model.model_registrar import ModelRegistrar +from test.models.pytorch.multimodal.trajectron.trajectron.model.dataset import EnvironmentDataset, collate, get_timesteps_data +from forge.verify.compare import compare_with_golden +import os +import json +import dill +import torch +import torch.nn as nn +import numpy as np +from typing import Any +import torch.nn.utils.rnn as rnn +import pytest + + +def load_hyperparams(): + conf_path = "forge/test/models/pytorch/multimodal/trajectron/trajectron/config/config.json" + with open(conf_path, 'r', encoding='utf-8') as conf_json: + hyperparams = json.load(conf_json) + + # Set Default values + hyperparams['scene_freq_mult_eval'] = False + hyperparams['node_freq_mult_eval'] = False + hyperparams['edge_encoding'] = False + hyperparams['incl_robot_node'] = False + hyperparams['use_map_encoding'] = False + + hyperparams['edge_addition_filter'] = [1, 1] + hyperparams['edge_removal_filter'] = [1, 1] + + return hyperparams + +def load_env(): + eval_data_path = "forge/test/models/pytorch/multimodal/trajectron/trajectron/dataset_envs/eth_val.pkl" + with open(eval_data_path, 'rb') as f: + eval_env = dill.load(f, encoding='latin1') + return eval_env + + +class TrajectronWrapper(nn.Module): + def __init__(self, model_dir: str, hyperparams: dict[str, Any], env: Any, scene_index: int, num_samples: int = 1, z_mode: bool = True, gmm_mode: bool = True, all_z_sep: bool = False, full_dist: bool = False): + super().__init__() + + # Build Model registrar + if not os.path.exists(model_dir): + os.makedirs(model_dir, exist_ok=False) + model_config_path = model_dir + "/config.json" + if not os.path.exists(model_config_path): + with open(model_config_path, 'w') as conf_json: + json.dump(hyperparams, conf_json) + model_registrar = ModelRegistrar(model_dir, "cpu") + + # Build Trajectron Model + self.model = Trajectron(model_registrar=model_registrar, hyperparams=hyperparams, log_writer=None, device="cpu") + self.model.set_environment(env=env) + + self.model_dir = model_dir + self.hyperparams = hyperparams + self.env = env + + assert len(self.env.NodeType) == 1 + self.node_type = self.env.NodeType[0] + + self.scene_index = scene_index + self.num_samples = num_samples + self.z_mode = z_mode + self.gmm_mode = gmm_mode + self.all_z_sep = all_z_sep + self.full_dist = full_dist + + def _build_packed_sequence(self, packed_sequence_data, packed_sequence_batch_sizes, packed_sequence_sorted_indices, packed_sequence_unsorted_indices): + packed_sequence = torch.nn.utils.rnn.PackedSequence( + data=packed_sequence_data.squeeze(), + batch_sizes=packed_sequence_batch_sizes.squeeze(), + sorted_indices=packed_sequence_sorted_indices.squeeze(), + unsorted_indices=packed_sequence_unsorted_indices.squeeze(), + ) + return packed_sequence + + def forward(self, x, x_st_t, packed_sequence_data, packed_sequence_batch_sizes, 
packed_sequence_sorted_indices, packed_sequence_unsorted_indices, first_history_index): + neighbors_data_st = None + neighbors_edge_value = None + robot_traj_st_t = None + map = None + + ph = self.hyperparams['prediction_horizon'] + + packed_x_st_t = self._build_packed_sequence(packed_sequence_data, packed_sequence_batch_sizes, packed_sequence_sorted_indices, packed_sequence_unsorted_indices) + + model = self.model.node_models_dict[self.node_type] + predictions = model.predict( + inputs=x, + inputs_st=x_st_t, # Pack and send this + packed_inputs_st=packed_x_st_t, + first_history_indices=first_history_index, + neighbors=neighbors_data_st, + neighbors_edge_value=neighbors_edge_value, + robot=robot_traj_st_t, + map=map, + prediction_horizon=ph, + num_samples=self.num_samples, + z_mode=self.z_mode, + gmm_mode=self.gmm_mode, + full_dist=self.full_dist, + all_z_sep=self.all_z_sep + ) + + return predictions + + def eval(self): + super().eval() + self.model.eval() + + def get_input_batch(self, scene): + ph = self.hyperparams['prediction_horizon'] + timesteps = scene.sample_timesteps(1, min_future_timesteps=ph) + + min_future_timesteps = ph + min_history_timesteps = 1 + + node_type = self.node_type + assert node_type in self.model.pred_state + model = self.model.node_models_dict[node_type] + + # Get Input data for node type and given timesteps + batch = get_timesteps_data(env=self.env, scene=scene, t=timesteps, node_type=node_type, state=self.model.state, + pred_state=self.model.pred_state, edge_types=model.edge_types, + min_ht=min_history_timesteps, max_ht=self.model.max_ht, min_ft=min_future_timesteps, + max_ft=min_future_timesteps, hyperparams=self.hyperparams) + + assert batch is not None + + (first_history_index, + x_t, y_t, x_st_t, y_st_t, + neighbors_data_st, + neighbors_edge_value, + robot_traj_st_t, + map), nodes, timesteps_o = batch + + device = self.model.device + x = x_t.to(device) + x_st_t = x_st_t.to(device) + if robot_traj_st_t is not None: + robot_traj_st_t = robot_traj_st_t.to(device) + + if type(map) == torch.Tensor: + map = map.to(device) + + return (x, x_st_t, first_history_index, neighbors_data_st, neighbors_edge_value, robot_traj_st_t, map), (nodes, timesteps_o) + + +def pack_input_sequences(sequences, lower_indices = None, upper_indices = None, total_length=None): + bs, tf = sequences.shape[:2] + if lower_indices is None: + lower_indices = torch.zeros(bs, dtype=torch.int) + if upper_indices is None: + upper_indices = torch.ones(bs, dtype=torch.int) * (tf - 1) + if total_length is None: + total_length = max(upper_indices) + 1 + # This is done so that we can just pass in self.prediction_timesteps + # (which we want to INCLUDE, so this will exclude the next timestep). 
+ inclusive_break_indices = upper_indices + 1 + + pad_list = list() + for i, seq_len in enumerate(inclusive_break_indices): + pad_list.append(sequences[i, lower_indices[i]:seq_len]) + + packed_seqs = rnn.pack_sequence(pad_list, enforce_sorted=False) + + return packed_seqs + + +def get_packed_sequence_values(packed_sequence): + values = ( + packed_sequence.data.unsqueeze(0).unsqueeze(0), + packed_sequence.batch_sizes.unsqueeze(0), + packed_sequence.sorted_indices.unsqueeze(0), + packed_sequence.unsorted_indices.unsqueeze(0), + ) + return values +@pytest.mark.nightly +@pytest.mark.model_analysis +def test_trajectronpp_pytorch(): + env = load_env() + hyperparams = load_hyperparams() + model_dir = "forge/test/models/pytorch/multimodal/trajectron/trajectron/model_dir" + + + # Build Pytorch Model + pt_model = TrajectronWrapper(model_dir=model_dir, hyperparams=hyperparams, env=env, scene_index=0) + pt_model.eval() + + scene = env.scenes[0] + inputs_batch = pt_model.get_input_batch(scene=scene) + + (x, x_st_t, first_history_index, neighbors_data_st, neighbors_edge_value, robot_traj_st_t, map), (nodes, timesteps_o) = inputs_batch + + packed_x_st_t = pack_input_sequences(x_st_t, lower_indices=first_history_index) + (packed_sequence_data, packed_sequence_batch_sizes, packed_sequence_sorted_indices, packed_sequence_unsorted_indices) = get_packed_sequence_values(packed_x_st_t) + + + + assert neighbors_data_st is None + assert neighbors_edge_value is None + assert robot_traj_st_t is None + assert map is None + # Run CPU Inference + output = pt_model(x, x_st_t, packed_sequence_data, packed_sequence_batch_sizes, packed_sequence_sorted_indices, packed_sequence_unsorted_indices, first_history_index) + inputs = [x, x_st_t, packed_sequence_data, packed_sequence_batch_sizes, packed_sequence_sorted_indices, packed_sequence_unsorted_indices, first_history_index] + compiled_model = forge.compile(pt_model,inputs) + co_out = compiled_model(*inputs) + fw_out = pt_model(*inputs) + + co_out = [co.to("cpu") for co in co_out] + fw_out = [fw_out] if isinstance(fw_out, torch.Tensor) else fw_out + + assert all([compare_with_golden(golden=fo, calculated=co, pcc=0.99) for fo, co in zip(fw_out, co_out)]) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/__init__.py new file mode 100644 index 000000000..638673905 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/__init__.py @@ -0,0 +1 @@ +from model import Trajectron \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/argument_parser.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/argument_parser.py new file mode 100644 index 000000000..3eb5f047b --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/argument_parser.py @@ -0,0 +1,172 @@ +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--conf", + help="path to json config file for hyperparameters", + type=str, + default='../config/config.json') + +parser.add_argument("--debug", + help="disable all disk writing processes.", + action='store_true') + +parser.add_argument("--preprocess_workers", + help="number of processes to spawn for preprocessing", + type=int, + default=0) + + +# Model Parameters +parser.add_argument("--offline_scene_graph", + help="whether to precompute the scene graphs offline, options are 'no' and 'yes'", + type=str, + default='yes') + 
+parser.add_argument("--dynamic_edges", + help="whether to use dynamic edges or not, options are 'no' and 'yes'", + type=str, + default='yes') + +parser.add_argument("--edge_state_combine_method", + help="the method to use for combining edges of the same type", + type=str, + default='sum') + +parser.add_argument("--edge_influence_combine_method", + help="the method to use for combining edge influences", + type=str, + default='attention') + +parser.add_argument('--edge_addition_filter', + nargs='+', + help="what scaling to use for edges as they're created", + type=float, + default=[0.25, 0.5, 0.75, 1.0]) # We don't automatically pad left with 0.0, if you want a sharp + # and short edge addition, then you need to have a 0.0 at the + # beginning, e.g. [0.0, 1.0]. + +parser.add_argument('--edge_removal_filter', + nargs='+', + help="what scaling to use for edges as they're removed", + type=float, + default=[1.0, 0.0]) # We don't automatically pad right with 0.0, if you want a sharp drop off like + # the default, then you need to have a 0.0 at the end. + +parser.add_argument('--override_attention_radius', + action='append', + help='Specify one attention radius to override. E.g. "PEDESTRIAN VEHICLE 10.0"', + default=[]) + +parser.add_argument('--incl_robot_node', + help="whether to include a robot node in the graph or simply model all agents", + action='store_true') + +parser.add_argument('--map_encoding', + help="Whether to use map encoding or not", + action='store_true') + +parser.add_argument('--augment', + help="Whether to augment the scene during training", + action='store_true') + +parser.add_argument('--node_freq_mult_train', + help="Whether to use frequency multiplying of nodes during training", + action='store_true') + +parser.add_argument('--node_freq_mult_eval', + help="Whether to use frequency multiplying of nodes during evaluation", + action='store_true') + +parser.add_argument('--scene_freq_mult_train', + help="Whether to use frequency multiplying of nodes during training", + action='store_true') + +parser.add_argument('--scene_freq_mult_eval', + help="Whether to use frequency multiplying of nodes during evaluation", + action='store_true') + +parser.add_argument('--scene_freq_mult_viz', + help="Whether to use frequency multiplying of nodes during evaluation", + action='store_true') + +parser.add_argument('--no_edge_encoding', + help="Whether to use neighbors edge encoding", + action='store_true') + +# Data Parameters +parser.add_argument("--data_dir", + help="what dir to look in for data", + type=str, + default='../experiments/processed') + +parser.add_argument("--train_data_dict", + help="what file to load for training data", + type=str, + default='train.pkl') + +parser.add_argument("--eval_data_dict", + help="what file to load for evaluation data", + type=str, + default='val.pkl') + +parser.add_argument("--log_dir", + help="what dir to save training information (i.e., saved models, logs, etc)", + type=str, + default='../experiments/logs') + +parser.add_argument("--log_tag", + help="tag for the log folder", + type=str, + default='') + +parser.add_argument('--device', + help='what device to perform training on', + type=str, + default='cuda:0') + +parser.add_argument("--eval_device", + help="what device to use during evaluation", + type=str, + default=None) + +# Training Parameters +parser.add_argument("--train_epochs", + help="number of iterations to train for", + type=int, + default=1) + +parser.add_argument('--batch_size', + help='training batch size', + type=int, + default=256) + 
+parser.add_argument('--eval_batch_size', + help='evaluation batch size', + type=int, + default=256) + +parser.add_argument('--k_eval', + help='how many samples to take during evaluation', + type=int, + default=25) + +parser.add_argument('--seed', + help='manual seed to use, default is 123', + type=int, + default=123) + +parser.add_argument('--eval_every', + help='how often to evaluate during training, never if None', + type=int, + default=1) + +parser.add_argument('--vis_every', + help='how often to visualize during training, never if None', + type=int, + default=1) + +parser.add_argument('--save_every', + help='how often to save during training, never if None', + type=int, + default=1) +args = parser.parse_args() diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/config.json b/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/config.json new file mode 100644 index 000000000..3dc5d35d1 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/config.json @@ -0,0 +1,90 @@ +{ + + "batch_size": 256, + "grad_clip": 1.0, + + "learning_rate_style": "exp", + "learning_rate": 0.001, + "min_learning_rate": 0.00001, + "learning_decay_rate": 0.9999, + + "prediction_horizon": 12, + "minimum_history_length": 1, + "maximum_history_length": 8, + + "map_encoder": { + "PEDESTRIAN": { + "heading_state_index": 5, + "patch_size": [50, 10, 50, 90], + "map_channels": 3, + "hidden_channels": [10, 20, 10, 1], + "output_size": 32, + "masks": [5, 5, 5, 5], + "strides": [1, 1, 1, 1], + "dropout": 0.5 + } + }, + + "k": 1, + "k_eval": 1, + + "kl_min": 0.07, + "kl_weight": 100.0, + "kl_weight_start": 0, + "kl_decay_rate": 0.99995, + "kl_crossover": 400, + "kl_sigmoid_divisor": 4, + + "rnn_kwargs": { + "dropout_keep_prob": 0.75 + }, + "MLP_dropout_keep_prob": 0.9, + "enc_rnn_dim_edge": 32, + "enc_rnn_dim_edge_influence": 32, + "enc_rnn_dim_history": 32, + "enc_rnn_dim_future": 32, + "dec_rnn_dim": 128, + + "q_z_xy_MLP_dims": null, + "p_z_x_MLP_dims": 32, + "GMM_components": 1, + + "log_p_yt_xz_max": 6, + + "N": 1, + "K": 25, + + "tau_init": 2.0, + "tau_final": 0.05, + "tau_decay_rate": 0.997, + + "use_z_logit_clipping": true, + "z_logit_clip_start": 0.05, + "z_logit_clip_final": 5.0, + "z_logit_clip_crossover": 300, + "z_logit_clip_divisor": 5, + + "dynamic": { + "PEDESTRIAN": { + "name": "SingleIntegrator", + "distribution": true, + "limits": {} + } + }, + + "state": { + "PEDESTRIAN": { + "position": ["x", "y"], + "velocity": ["x", "y"], + "acceleration": ["x", "y"] + } + }, + + "pred_state": { + "PEDESTRIAN": { + "position": ["x", "y"] + } + }, + + "log_histograms": false +} \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/nuScenes.json b/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/nuScenes.json new file mode 100644 index 000000000..919ea6be1 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/config/nuScenes.json @@ -0,0 +1,109 @@ +{ + + "batch_size": 256, + "grad_clip": 1.0, + + "learning_rate_style": "exp", + "learning_rate": 0.003, + "min_learning_rate": 0.00001, + "learning_decay_rate": 0.9999, + + "prediction_horizon": 6, + "minimum_history_length": 1, + "maximum_history_length": 8, + + "map_encoder": { + "VEHICLE": { + "heading_state_index": 6, + "patch_size": [50, 10, 50, 90], + "map_channels": 3, + "hidden_channels": [10, 20, 10, 1], + "output_size": 32, + "masks": [5, 5, 5, 3], + "strides": [2, 2, 1, 1], + "dropout": 0.5 + } + }, + 
+ "k": 1, + "k_eval": 1, + + "kl_min": 0.07, + "kl_weight": 100.0, + "kl_weight_start": 0, + "kl_decay_rate": 0.99995, + "kl_crossover": 400, + "kl_sigmoid_divisor": 4, + + "rnn_kwargs": { + "dropout_keep_prob": 0.75 + }, + "MLP_dropout_keep_prob": 0.9, + "enc_rnn_dim_edge": 32, + "enc_rnn_dim_edge_influence": 32, + "enc_rnn_dim_history": 32, + "enc_rnn_dim_future": 32, + "dec_rnn_dim": 128, + + "q_z_xy_MLP_dims": null, + "p_z_x_MLP_dims": 32, + "GMM_components": 1, + + "log_p_yt_xz_max": 6, + + "N": 1, + "K": 25, + + "tau_init": 2.0, + "tau_final": 0.05, + "tau_decay_rate": 0.997, + + "use_z_logit_clipping": true, + "z_logit_clip_start": 0.05, + "z_logit_clip_final": 5.0, + "z_logit_clip_crossover": 300, + "z_logit_clip_divisor": 5, + + "dynamic": { + "PEDESTRIAN": { + "name": "SingleIntegrator", + "distribution": true, + "limits": {} + }, + "VEHICLE": { + "name": "Unicycle", + "distribution": true, + "limits": { + "max_a": 4, + "min_a": -5, + "max_heading_change": 0.7, + "min_heading_change": -0.7 + } + } + }, + + "state": { + "PEDESTRIAN": { + "position": ["x", "y"], + "velocity": ["x", "y"], + "acceleration": ["x", "y"] + }, + "VEHICLE": { + "position": ["x", "y"], + "velocity": ["x", "y"], + "acceleration": ["x", "y"], + "heading": ["°", "d°"] + } + }, + + "pred_state": { + "VEHICLE": { + "position": ["x", "y"] + }, + "PEDESTRIAN": { + "position": ["x", "y"] + } + }, + + "log_histograms": false +} \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/dataset_envs/eth_val.pkl b/forge/test/models/pytorch/multimodal/trajectron/trajectron/dataset_envs/eth_val.pkl new file mode 100644 index 000000000..8afc0d7c0 Binary files /dev/null and b/forge/test/models/pytorch/multimodal/trajectron/trajectron/dataset_envs/eth_val.pkl differ diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/__init__.py new file mode 100644 index 000000000..9ad06818f --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/__init__.py @@ -0,0 +1,8 @@ +from .data_structures import RingBuffer, SingleHeaderNumpyArray, DoubleHeaderNumpyArray +from .scene import Scene +from .node import Node +from .scene_graph import TemporalSceneGraph, SceneGraph +from .environment import Environment +from .node_type import NodeTypeEnum +from .data_utils import derivative_of +from .map import GeometricMap diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_structures.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_structures.py new file mode 100644 index 000000000..d16a9ea19 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_structures.py @@ -0,0 +1,277 @@ +import numpy as np +import pandas as pd +from collections.abc import Sequence +from collections import OrderedDict + + +class RingBuffer(Sequence): + def __init__(self, capacity, dtype=float, allow_overwrite=True): + """ + Create a new ring buffer with the given capacity and element type. + Code copy-pasted from: https://github.com/eric-wieser/numpy_ringbuffer + + Parameters + ---------- + capacity: int + The maximum capacity of the ring buffer + dtype: data-type, optional + Desired type of buffer elements. 
Use a type like (float, 2) to + produce a buffer with shape (N, 2) + allow_overwrite: bool + If false, throw an IndexError when trying to append to an already + full buffer + """ + self._arr = np.full(capacity, np.nan, dtype) + self._left_index = 0 + self._right_index = 0 + self._capacity = capacity + self._allow_overwrite = allow_overwrite + + def _unwrap(self): + """ Copy the data from this buffer into unwrapped form """ + return np.concatenate(( + self._arr[self._left_index:min(self._right_index, self._capacity)], + self._arr[:max(self._right_index - self._capacity, 0)] + )) + + def _fix_indices(self): + """ + Enforce our invariant that 0 <= self._left_index < self._capacity + """ + if self._left_index >= self._capacity: + self._left_index -= self._capacity + self._right_index -= self._capacity + elif self._left_index < 0: + self._left_index += self._capacity + self._right_index += self._capacity + + @property + def is_full(self): + """ True if there is no more space in the buffer """ + return len(self) == self._capacity + + # numpy compatibility + def __array__(self): + return self._unwrap() + + @property + def dtype(self): + return self._arr.dtype + + @property + def shape(self): + return (len(self),) + self._arr.shape[1:] + + # these mirror methods from deque + @property + def maxlen(self): + return self._capacity + + def append(self, value): + if self.is_full: + if not self._allow_overwrite: + raise IndexError('append to a full RingBuffer with overwrite disabled') + elif not len(self): + return + else: + self._left_index += 1 + + self._arr[self._right_index % self._capacity] = value + self._right_index += 1 + self._fix_indices() + + def appendleft(self, value): + if self.is_full: + if not self._allow_overwrite: + raise IndexError('append to a full RingBuffer with overwrite disabled') + elif not len(self): + return + else: + self._right_index -= 1 + + self._left_index -= 1 + self._fix_indices() + self._arr[self._left_index] = value + + def pop(self): + if len(self) == 0: + raise IndexError("pop from an empty RingBuffer") + self._right_index -= 1 + self._fix_indices() + res = self._arr[self._right_index % self._capacity] + return res + + def popleft(self): + if len(self) == 0: + raise IndexError("pop from an empty RingBuffer") + res = self._arr[self._left_index] + self._left_index += 1 + self._fix_indices() + return res + + def extend(self, values): + lv = len(values) + if len(self) + lv > self._capacity: + if not self._allow_overwrite: + raise IndexError('extend a RingBuffer such that it would overflow, with overwrite disabled') + elif not len(self): + return + if lv >= self._capacity: + # wipe the entire array! - this may not be threadsafe + self._arr[...] = values[-self._capacity:] + self._right_index = self._capacity + self._left_index = 0 + return + + ri = self._right_index % self._capacity + sl1 = np.s_[ri:min(ri + lv, self._capacity)] + sl2 = np.s_[:max(ri + lv - self._capacity, 0)] + self._arr[sl1] = values[:sl1.stop - sl1.start] + self._arr[sl2] = values[sl1.stop - sl1.start:] + self._right_index += lv + + self._left_index = max(self._left_index, self._right_index - self._capacity) + self._fix_indices() + + def extendleft(self, values): + lv = len(values) + if len(self) + lv > self._capacity: + if not self._allow_overwrite: + raise IndexError('extend a RingBuffer such that it would overflow, with overwrite disabled') + elif not len(self): + return + if lv >= self._capacity: + # wipe the entire array! - this may not be threadsafe + self._arr[...] 
= values[:self._capacity] + self._right_index = self._capacity + self._left_index = 0 + return + + self._left_index -= lv + self._fix_indices() + li = self._left_index + sl1 = np.s_[li:min(li + lv, self._capacity)] + sl2 = np.s_[:max(li + lv - self._capacity, 0)] + self._arr[sl1] = values[:sl1.stop - sl1.start] + self._arr[sl2] = values[sl1.stop - sl1.start:] + + self._right_index = min(self._right_index, self._left_index + self._capacity) + + # implement Sequence methods + def __len__(self): + return self._right_index - self._left_index + + def __getitem__(self, item): + # handle simple (b[1]) and basic (b[np.array([1, 2, 3])]) fancy indexing specially + if not isinstance(item, tuple): + item_arr = np.asarray(item) + if issubclass(item_arr.dtype.type, np.integer): + item_arr = (item_arr + self._left_index) % self._capacity + return self._arr[item_arr] + + # for everything else, get it right at the expense of efficiency + return self._unwrap()[item] + + def __iter__(self): + # alarmingly, this is comparable in speed to using itertools.chain + return iter(self._unwrap()) + + # Everything else + def __repr__(self): + return ''.format(np.asarray(self)) + + +class DoubleHeaderNumpyArray(object): + def __init__(self, data: np.ndarray, header: list): + """ + Data Structure mirroring some functionality of double indexed pandas DataFrames. + Indexing options are: + [:, (header1, header2)] + [:, [(header1, header2), (header1, header2)]] + [:, {header1: [header21, header22]}] + + A SingleHeaderNumpyArray can is returned if an element of the first header is querried as attribut: + doubleHeaderNumpyArray.position -> SingleHeaderNumpyArray + + :param data: The numpy array. + :param header: The double header structure as list of tuples [(header11, header21), (header11, header22) ...] + """ + self.data = data + self.header = header + self.double_header_lookup = OrderedDict() + self.tree_header_lookup = OrderedDict() + for i, header_item in enumerate(header): + self.double_header_lookup[header_item] = i + if header_item[0] not in self.tree_header_lookup: + self.tree_header_lookup[header_item[0]] = dict() + self.tree_header_lookup[header_item[0]][header_item[1]] = i + + def __mul__(self, other): + return DoubleHeaderNumpyArray(self.data * other, self.header) + + def get_single_header_array(self, h1: str, rows=slice(None, None, None)): + data_integer_indices = list() + h2_list = list() + for h2 in self.tree_header_lookup[h1]: + data_integer_indices.append(self.tree_header_lookup[h1][h2]) + h2_list.append(h2) + return SingleHeaderNumpyArray(self.data[rows, data_integer_indices], h2_list) + + def __getitem__(self, item): + rows, columns = item + data_integer_indices = list() + if type(columns) is dict: + for h1, h2s in columns.items(): + for h2 in h2s: + data_integer_indices.append(self.double_header_lookup[(h1, h2)]) + return self.data[rows, data_integer_indices] + elif type(columns) is list: + for column in columns: + assert type(column) is tuple, "If Index is list it hast to be list of double header tuples." + data_integer_indices.append(self.double_header_lookup[column]) + return self.data[rows, data_integer_indices] + elif type(columns) is tuple: + return self.data[rows, self.double_header_lookup[columns]] + else: + assert type(item) is str, "Index must be str, list of tuples or dict of tree structure." 
+ return self.get_single_header_array(item, rows=rows) + + def __getattr__(self, item): + if not item.startswith('_'): + if item in self.tree_header_lookup.keys(): + return self.get_single_header_array(item) + else: + try: + return self.data.__getattribute__(item) + except AttributeError: + return super().__getattribute__(item) + else: + return super().__getattribute__(item) + + +class SingleHeaderNumpyArray(object): + def __init__(self, data: np.ndarray, header: list): + self.data = data + self.header_lookup = OrderedDict({h: i for i, h in enumerate(header)}) + + def __getitem__(self, item): + rows, columns = item + data_integer_indices = list() + if type(columns) is list or type(columns) is tuple: + for column in columns: + data_integer_indices.append(self.header_lookup[column]) + else: + data_integer_indices = self.header_lookup[columns] + return self.data[rows, data_integer_indices] + + def __getattr__(self, item): + if not item.startswith('_'): + if item in self.header_lookup.keys(): + return self[:, item] + else: + try: + return self.data.__getattribute__(item) + except AttributeError: + return super().__getattribute__(item) + else: + return super().__getattribute__(item) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_utils.py new file mode 100644 index 000000000..72c7ec86b --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/data_utils.py @@ -0,0 +1,33 @@ +import numpy as np + + +def make_continuous_copy(alpha): + alpha = (alpha + np.pi) % (2.0 * np.pi) - np.pi + continuous_x = np.zeros_like(alpha) + continuous_x[0] = alpha[0] + for i in range(1, len(alpha)): + if not (np.sign(alpha[i]) == np.sign(alpha[i - 1])) and np.abs(alpha[i]) > np.pi / 2: + continuous_x[i] = continuous_x[i - 1] + ( + alpha[i] - alpha[i - 1]) - np.sign( + (alpha[i] - alpha[i - 1])) * 2 * np.pi + else: + continuous_x[i] = continuous_x[i - 1] + (alpha[i] - alpha[i - 1]) + + return continuous_x + + +def derivative_of(x, dt=1, radian=False): + if radian: + x = make_continuous_copy(x) + + not_nan_mask = ~np.isnan(x) + masked_x = x[not_nan_mask] + + if masked_x.shape[-1] < 2: + return np.zeros_like(x) + + dx = np.full_like(x, np.nan) + dx[not_nan_mask] = np.ediff1d(masked_x, to_begin=(masked_x[1] - masked_x[0])) / dt + + return dx + diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/environment.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/environment.py new file mode 100644 index 000000000..24ebb679f --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/environment.py @@ -0,0 +1,64 @@ +import json +import numpy as np +from itertools import product +from .node_type import NodeTypeEnum + + +class Environment(object): + def __init__(self, node_type_list, standardization, scenes=None, attention_radius=None, robot_type=None): + self.scenes = scenes + self.node_type_list = node_type_list + self.attention_radius = attention_radius + self.NodeType = NodeTypeEnum(node_type_list) + self.robot_type = robot_type + + self.standardization = standardization + self.standardize_param_memo = dict() + + self._scenes_resample_prop = None + + def get_edge_types(self): + return list(product(self.NodeType, repeat=2)) + + def get_standardize_params(self, state, node_type): + memo_key = (json.dumps(state), node_type) + if memo_key in self.standardize_param_memo: + return 
self.standardize_param_memo[memo_key] + + standardize_mean_list = list() + standardize_std_list = list() + for entity, dims in state.items(): + for dim in dims: + standardize_mean_list.append(self.standardization[node_type][entity][dim]['mean']) + standardize_std_list.append(self.standardization[node_type][entity][dim]['std']) + standardize_mean = np.stack(standardize_mean_list) + standardize_std = np.stack(standardize_std_list) + + self.standardize_param_memo[memo_key] = (standardize_mean, standardize_std) + return standardize_mean, standardize_std + + def standardize(self, array, state, node_type, mean=None, std=None): + if mean is None and std is None: + mean, std = self.get_standardize_params(state, node_type) + elif mean is None and std is not None: + mean, _ = self.get_standardize_params(state, node_type) + elif mean is not None and std is None: + _, std = self.get_standardize_params(state, node_type) + return np.where(np.isnan(array), np.array(np.nan), (array - mean) / std) + + def unstandardize(self, array, state, node_type, mean=None, std=None): + if mean is None and std is None: + mean, std = self.get_standardize_params(state, node_type) + elif mean is None and std is not None: + mean, _ = self.get_standardize_params(state, node_type) + elif mean is not None and std is None: + _, std = self.get_standardize_params(state, node_type) + return array * std + mean + + @property + def scenes_resample_prop(self): + if self._scenes_resample_prop is None: + self._scenes_resample_prop = np.array([scene.resample_prob for scene in self.scenes]) + self._scenes_resample_prop = self._scenes_resample_prop / np.sum(self._scenes_resample_prop) + return self._scenes_resample_prop + diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/map.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/map.py new file mode 100644 index 000000000..d1b527d51 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/map.py @@ -0,0 +1,185 @@ +import torch +import numpy as np +from model.dataset.homography_warper import get_rotation_matrix2d, warp_affine_crop + + +class Map(object): + def __init__(self, data, homography, description=None): + self.data = data + self.homography = homography + self.description = description + + def as_image(self): + raise NotImplementedError + + def get_cropped_maps(self, world_pts, patch_size, rotation=None, device='cpu'): + raise NotImplementedError + + def to_map_points(self, scene_pts): + raise NotImplementedError + + +class GeometricMap(Map): + """ + A Geometric Map is a int tensor of shape [layers, x, y]. The homography must transform a point in scene + coordinates to the respective point in map coordinates. + + :param data: Numpy array of shape [layers, x, y] + :param homography: Numpy array of shape [3, 3] + """ + def __init__(self, data, homography, description=None): + #assert isinstance(data.dtype, np.floating), "Geometric Maps must be float values." + super(GeometricMap, self).__init__(data, homography, description=description) + + self._last_padding = None + self._last_padded_map = None + self._torch_map = None + + def torch_map(self, device): + if self._torch_map is not None: + return self._torch_map + self._torch_map = torch.tensor(self.data, dtype=torch.uint8, device=device) + return self._torch_map + + def as_image(self): + # We have to transpose x and y to rows and columns. 
Assumes origin is lower left for image + # Also we move the channels to the last dimension + return (np.transpose(self.data, (2, 1, 0))).astype(np.uint) + + def get_padded_map(self, padding_x, padding_y, device): + if self._last_padding == (padding_x, padding_y): + return self._last_padded_map + else: + self._last_padding = (padding_x, padding_y) + self._last_padded_map = torch.full((self.data.shape[0], + self.data.shape[1] + 2 * padding_x, + self.data.shape[2] + 2 * padding_y), + False, dtype=torch.uint8) + self._last_padded_map[..., padding_x:-padding_x, padding_y:-padding_y] = self.torch_map(device) + return self._last_padded_map + + @staticmethod + def batch_rotate(map_batched, centers, angles, out_height, out_width): + """ + As the input is a map and the warp_affine works on an image coordinate system we would have to + flip the y axis updown, negate the angles, and flip it back after transformation. + This, however, is the same as not flipping at and not negating the radian. + + :param map_batched: + :param centers: + :param angles: + :param out_height: + :param out_width: + :return: + """ + M = get_rotation_matrix2d(centers, angles, torch.ones_like(angles)) + rotated_map_batched = warp_affine_crop(map_batched, centers, M, + dsize=(out_height, out_width), padding_mode='zeros') + + return rotated_map_batched + + @classmethod + def get_cropped_maps_from_scene_map_batch(cls, maps, scene_pts, patch_size, rotation=None, device='cpu'): + """ + Returns rotated patches of each map around the transformed scene points. + ___________________ + | | | + | |ps[3] | + | | | + | | | + | o|__________| + | | ps[2] | + | | | + |_______|__________| + ps = patch_size + + :param maps: List of GeometricMap objects [bs] + :param scene_pts: Scene points: [bs, 2] + :param patch_size: Extracted Patch size after rotation: [-x, -y, +x, +y] + :param rotation: Rotations in degrees: [bs] + :param device: Device on which the rotated tensors should be returned. + :return: Rotated and cropped tensor patches. 
+ """ + batch_size = scene_pts.shape[0] + lat_size = 2 * np.max((patch_size[0], patch_size[2])) + long_size = 2 * np.max((patch_size[1], patch_size[3])) + assert lat_size % 2 == 0, "Patch width must be divisible by 2" + assert long_size % 2 == 0, "Patch length must be divisible by 2" + lat_size_half = lat_size // 2 + long_size_half = long_size // 2 + + context_padding_x = int(np.ceil(np.sqrt(2) * lat_size)) + context_padding_y = int(np.ceil(np.sqrt(2) * long_size)) + + centers = torch.tensor([s_map.to_map_points(scene_pts[np.newaxis, i]) for i, s_map in enumerate(maps)], + dtype=torch.long, device=device).squeeze(dim=1) \ + + torch.tensor([context_padding_x, context_padding_y], device=device, dtype=torch.long) + + padded_map = [s_map.get_padded_map(context_padding_x, context_padding_y, device=device) for s_map in maps] + + padded_map_batched = torch.stack([padded_map[i][..., + centers[i, 0] - context_padding_x: centers[i, 0] + context_padding_x, + centers[i, 1] - context_padding_y: centers[i, 1] + context_padding_y] + for i in range(centers.shape[0])], dim=0) + + center_patches = torch.tensor([[context_padding_y, context_padding_x]], + dtype=torch.int, + device=device).repeat(batch_size, 1) + + if rotation is not None: + angles = torch.Tensor(rotation) + else: + angles = torch.zeros(batch_size) + + rotated_map_batched = cls.batch_rotate(padded_map_batched/255., + center_patches.float(), + angles, + long_size, + lat_size) + + del padded_map_batched + + return rotated_map_batched[..., + long_size_half - patch_size[1]:(long_size_half + patch_size[3]), + lat_size_half - patch_size[0]:(lat_size_half + patch_size[2])] + + def get_cropped_maps(self, scene_pts, patch_size, rotation=None, device='cpu'): + """ + Returns rotated patches of the map around the transformed scene points. + ___________________ + | | | + | |ps[3] | + | | | + | | | + | o|__________| + | | ps[2] | + | | | + |_______|__________| + ps = patch_size + + :param scene_pts: Scene points: [bs, 2] + :param patch_size: Extracted Patch size after rotation: [-lat, -long, +lat, +long] + :param rotation: Rotations in degrees: [bs] + :param device: Device on which the rotated tensors should be returned. + :return: Rotated and cropped tensor patches. 
+ """ + return self.get_cropped_maps_from_scene_map_batch([self]*scene_pts.shape[0], scene_pts, + patch_size, rotation=rotation, device=device) + + def to_map_points(self, scene_pts): + org_shape = None + if len(scene_pts.shape) > 2: + org_shape = scene_pts.shape + scene_pts = scene_pts.reshape((-1, 2)) + N, dims = scene_pts.shape + points_with_one = np.ones((dims + 1, N)) + points_with_one[:dims] = scene_pts.T + map_points = (self.homography @ points_with_one).T[..., :dims] + if org_shape is not None: + map_points = map_points.reshape(org_shape) + return map_points + + +class ImageMap(Map): # TODO Implement for image maps -> watch flipped coordinate system + def __init__(self): + raise NotImplementedError \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node.py new file mode 100644 index 000000000..a160412f1 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node.py @@ -0,0 +1,240 @@ +import random +import numpy as np +import pandas as pd +from .data_structures import DoubleHeaderNumpyArray +# from ncls import NCLS + + +class Node(object): + def __init__(self, node_type, node_id, data, length=None, width=None, height=None, first_timestep=0, + is_robot=False, description="", frequency_multiplier=1, non_aug_node=None): + self.type = node_type + self.id = node_id + self.length = length + self.width = width + self.height = height + self.first_timestep = first_timestep + self.non_aug_node = non_aug_node + + if data is not None: + if isinstance(data, pd.DataFrame): + self.data = DoubleHeaderNumpyArray(data.values, list(data.columns)) + elif isinstance(data, DoubleHeaderNumpyArray): + self.data = data + else: + self.data = None + + self.is_robot = is_robot + self._last_timestep = None + self.description = description + self.frequency_multiplier = frequency_multiplier + + self.forward_in_time_on_next_override = False + + def __eq__(self, other): + return ((isinstance(other, self.__class__) + or isinstance(self, other.__class__)) + and self.id == other.id + and self.type == other.type) + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash((self.type, self.id)) + + def __repr__(self): + return '/'.join([self.type.name, self.id]) + + def overwrite_data(self, data, header, forward_in_time_on_next_overwrite=False): + """ + This function hard overwrites the data matrix. When using it you have to make sure that the columns + in the new data matrix correspond to the old structure. As well as setting first_timestep. + + :param data: New data matrix + :param forward_in_time_on_next_overwrite: On the !!NEXT!! call of overwrite_data first_timestep will be increased. + :return: None + """ + if header is None: + self.data.data = data + else: + self.data = DoubleHeaderNumpyArray(data, header) + + self._last_timestep = None + if self.forward_in_time_on_next_override: + self.first_timestep += 1 + self.forward_in_time_on_next_override = forward_in_time_on_next_overwrite + + def scene_ts_to_node_ts(self, scene_ts) -> (np.ndarray, int, int): + """ + Transforms timestamp from scene into timeframe of node data. + + :param scene_ts: Scene timesteps + :return: ts: Transformed timesteps, paddingl: Number of timesteps in scene range which are not available in + node data before data is available. 
paddingu: Number of timesteps in scene range which are not + available in node data after data is available. + """ + paddingl = (self.first_timestep - scene_ts[0]).clip(0) + paddingu = (scene_ts[1] - self.last_timestep).clip(0) + ts = np.array(scene_ts).clip(min=self.first_timestep, max=self.last_timestep) - self.first_timestep + return ts, paddingl, paddingu + + def history_points_at(self, ts) -> int: + """ + Number of history points in trajectory. Timestep is exclusive. + + :param ts: Scene timestep where the number of history points are queried. + :return: Number of history timesteps. + """ + return ts - self.first_timestep + + def get(self, tr_scene, state, padding=np.nan) -> np.ndarray: + """ + Returns a time range of multiple properties of the node. + + :param tr_scene: The timestep range (inklusive). + :param state: The state description for which the properties are returned. + :param padding: The value which should be used for padding if not enough information is available. + :return: Array of node property values. + """ + if tr_scene.size == 1: + tr_scene = np.array([tr_scene[0], tr_scene[0]]) + length = tr_scene[1] - tr_scene[0] + 1 # tr is inclusive + tr, paddingl, paddingu = self.scene_ts_to_node_ts(tr_scene) + data_array = self.data[tr[0]:tr[1] + 1, state] + padded_data_array = np.full((length, data_array.shape[1]), fill_value=padding) + padded_data_array[paddingl:length - paddingu] = data_array + return padded_data_array + + @property + def timesteps(self) -> int: + """ + Number of available timesteps for node. + + :return: Number of available timesteps. + """ + return self.data.shape[0] + + @property + def last_timestep(self) -> int: + """ + Nodes last timestep in the Scene. + + :return: Nodes last timestep. + """ + if self._last_timestep is None: + self._last_timestep = self.first_timestep + self.timesteps - 1 + return self._last_timestep + + +class MultiNode(Node): + def __init__(self, node_type, node_id, nodes_list, is_robot=False): + super(MultiNode, self).__init__(node_type, node_id, data=None, is_robot=is_robot) + self.nodes_list = nodes_list + for node in self.nodes_list: + node.is_robot = is_robot + + self.first_timestep = min(node.first_timestep for node in self.nodes_list) + self._last_timestep = max(node.last_timestep for node in self.nodes_list) + + starts = np.array([node.first_timestep for node in self.nodes_list], dtype=np.int64) + ends = np.array([node.last_timestep for node in self.nodes_list], dtype=np.int64) + ids = np.arange(len(self.nodes_list), dtype=np.int64) + self.interval_tree = NCLS(starts, ends, ids) + + @staticmethod + def find_non_overlapping_nodes(nodes_list, min_timesteps=1) -> list: + """ + Greedily finds a set of non-overlapping nodes in the provided scene. + + :return: A list of non-overlapping nodes. 
+ """ + non_overlapping_nodes = list() + nodes = sorted(nodes_list, key=lambda n: n.last_timestep) + current_time = 0 + for node in nodes: + if node.first_timestep >= current_time and node.timesteps >= min_timesteps: + # Include the node + non_overlapping_nodes.append(node) + current_time = node.last_timestep + + return non_overlapping_nodes + + def get_node_at_timesteps(self, scene_ts) -> Node: + possible_node_ranges = list(self.interval_tree.find_overlap(scene_ts[0], scene_ts[1] + 1)) + if not possible_node_ranges: + return Node(node_type=self.type, + node_id='EMPTY', + data=self.nodes_list[0].data * np.nan, + is_robot=self.is_robot) + + node_idx = random.choice(possible_node_ranges)[2] + return self.nodes_list[node_idx] + + def scene_ts_to_node_ts(self, scene_ts) -> (Node, np.ndarray, int, int): + """ + Transforms timestamp from scene into timeframe of node data. + + :param scene_ts: Scene timesteps + :return: ts: Transformed timesteps, paddingl: Number of timesteps in scene range which are not available in + node data before data is available. paddingu: Number of timesteps in scene range which are not + available in node data after data is available. + """ + possible_node_ranges = list(self.interval_tree.find_overlap(scene_ts[0], scene_ts[1] + 1)) + if not possible_node_ranges: + return None, None, None, None + + node_idx = random.choice(possible_node_ranges)[2] + node = self.nodes_list[node_idx] + + paddingl = (node.first_timestep - scene_ts[0]).clip(0) + paddingu = (scene_ts[1] - node.last_timestep).clip(0) + ts = np.array(scene_ts).clip(min=node.first_timestep, max=node.last_timestep) - node.first_timestep + return node, ts, paddingl, paddingu + + def get(self, tr_scene, state, padding=np.nan) -> np.ndarray: + if tr_scene.size == 1: + tr_scene = np.array([tr_scene, tr_scene]) + length = tr_scene[1] - tr_scene[0] + 1 # tr is inclusive + + node, tr, paddingl, paddingu = self.scene_ts_to_node_ts(tr_scene) + if node is None: + state_length = sum([len(entity_dims) for entity_dims in state.values()]) + return np.full((length, state_length), fill_value=padding) + + data_array = node.data[tr[0]:tr[1] + 1, state] + padded_data_array = np.full((length, data_array.shape[1]), fill_value=padding) + padded_data_array[paddingl:length - paddingu] = data_array + return padded_data_array + + def get_all(self, tr_scene, state, padding=np.nan) -> np.ndarray: + # Assumption here is that the user is asking for all of the data in this MultiNode and to return it within a + # full scene-sized output array. + assert tr_scene.size == 2 and tr_scene[0] == 0 and self.last_timestep <= tr_scene[1] + length = tr_scene[1] - tr_scene[0] + 1 # tr is inclusive + state_length = sum([len(entity_dims) for entity_dims in state.values()]) + padded_data_array = np.full((length, state_length), fill_value=padding) + for node in self.nodes_list: + padded_data_array[node.first_timestep:node.last_timestep + 1] = node.data[:, state] + + return padded_data_array + + def history_points_at(self, ts) -> int: + """ + Number of history points in trajectory. Timestep is exclusive. + + :param ts: Scene timestep where the number of history points are queried. + :return: Number of history timesteps. + """ + node_idx = next(self.interval_tree.find_overlap(ts, ts + 1))[2] + node = self.nodes_list[node_idx] + return ts - node.first_timestep + + @property + def timesteps(self) -> int: + """ + Number of available timesteps for node. + + :return: Number of available timesteps. 
+ """ + return self._last_timestep - self.first_timestep + 1 diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node_type.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node_type.py new file mode 100644 index 000000000..a44917a22 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/node_type.py @@ -0,0 +1,35 @@ +class NodeType(object): + def __init__(self, name, value): + self.name = name + self.value = value + + def __repr__(self): + return self.name + + def __eq__(self, other): + if type(other) == str and self.name == other: + return True + else: + return isinstance(other, self.__class__) and self.name == other.name + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash(self.name) + + def __add__(self, other): + return self.name + other + + +class NodeTypeEnum(list): + def __init__(self, node_type_list): + self.node_type_list = node_type_list + node_types = [NodeType(name, node_type_list.index(name) + 1) for name in node_type_list] + super().__init__(node_types) + + def __getattr__(self, name): + if not name.startswith('_') and name in object.__getattribute__(self, "node_type_list"): + return self[object.__getattribute__(self, "node_type_list").index(name)] + else: + return object.__getattribute__(self, name) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene.py new file mode 100644 index 000000000..299148278 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene.py @@ -0,0 +1,219 @@ +import copy +import numpy as np +from .scene_graph import TemporalSceneGraph, SceneGraph +from .node import MultiNode + + +class Scene(object): + def __init__(self, timesteps, map=None, dt=1, name="", frequency_multiplier=1, aug_func=None, non_aug_scene=None): + self.map = map + self.timesteps = timesteps + self.dt = dt + self.name = name + + self.nodes = [] + + self.robot = None + + self.temporal_scene_graph = None + + self.frequency_multiplier = frequency_multiplier + + self.description = "" + + self.aug_func = aug_func + self.non_aug_scene = non_aug_scene + + def add_robot_from_nodes(self, robot_type): + scenes = [self] + if hasattr(self, 'augmented'): + scenes += self.augmented + + for scn in scenes: + nodes_list = [node for node in scn.nodes if node.type == robot_type] + non_overlapping_nodes = MultiNode.find_non_overlapping_nodes(nodes_list, min_timesteps=3) + scn.robot = MultiNode(robot_type, 'ROBOT', non_overlapping_nodes, is_robot=True) + + for node in non_overlapping_nodes: + scn.nodes.remove(node) + scn.nodes.append(scn.robot) + + def get_clipped_input_dict(self, timestep, state): + input_dict = dict() + existing_nodes = self.get_nodes_clipped_at_time(timesteps=np.array([timestep]), + state=state) + tr_scene = np.array([timestep, timestep]) + for node in existing_nodes: + input_dict[node] = node.get(tr_scene, state[node.type]) + + return input_dict + + def get_scene_graph(self, + timestep, + attention_radius=None, + edge_addition_filter=None, + edge_removal_filter=None) -> SceneGraph: + """ + Returns the Scene Graph for a given timestep. If the Temporal Scene Graph was pre calculated, + the temporal scene graph is sliced. Otherwise the scene graph is calculated on the spot. + + :param timestep: Timestep for which the scene graph is returned. 
+ :param attention_radius: Attention radius for each node type permutation. (Only online) + :param edge_addition_filter: Filter for adding edges (Only online) + :param edge_removal_filter: Filter for removing edges (Only online) + :return: Scene Graph for given timestep. + """ + if self.temporal_scene_graph is None: + timestep_range = np.array([timestep - len(edge_removal_filter), timestep]) + node_pos_dict = dict() + present_nodes = self.present_nodes(np.array([timestep])) + + for node in present_nodes[timestep]: + node_pos_dict[node] = np.squeeze(node.get(timestep_range, {'position': ['x', 'y']})) + tsg = TemporalSceneGraph.create_from_temp_scene_dict(node_pos_dict, + attention_radius, + duration=(len(edge_removal_filter) + 1), + edge_addition_filter=edge_addition_filter, + edge_removal_filter=edge_removal_filter + ) + + return tsg.to_scene_graph(t=len(edge_removal_filter), + t_hist=len(edge_removal_filter), + t_fut=len(edge_addition_filter)) + else: + return self.temporal_scene_graph.to_scene_graph(timestep, + len(edge_removal_filter), + len(edge_addition_filter)) + + def calculate_scene_graph(self, + attention_radius, + edge_addition_filter=None, + edge_removal_filter=None) -> None: + """ + Calculate the Temporal Scene Graph for the entire Scene. + + :param attention_radius: Attention radius for each node type permutation. + :param edge_addition_filter: Filter for adding edges. + :param edge_removal_filter: Filter for removing edges. + :return: None + """ + timestep_range = np.array([0, self.timesteps-1]) + node_pos_dict = dict() + + for node in self.nodes: + if type(node) is MultiNode: + node_pos_dict[node] = np.squeeze(node.get_all(timestep_range, {'position': ['x', 'y']})) + else: + node_pos_dict[node] = np.squeeze(node.get(timestep_range, {'position': ['x', 'y']})) + + self.temporal_scene_graph = TemporalSceneGraph.create_from_temp_scene_dict(node_pos_dict, + attention_radius, + duration=self.timesteps, + edge_addition_filter=edge_addition_filter, + edge_removal_filter=edge_removal_filter) + + def duration(self): + """ + Calculates the duration of the scene. + + :return: Duration of the scene in s. + """ + return self.timesteps * self.dt + + def present_nodes(self, + timesteps, + type=None, + min_history_timesteps=0, + min_future_timesteps=0, + return_robot=True) -> dict: + """ + Finds all present nodes in the scene at a given timestemp + + :param timesteps: Timestep(s) for which all present nodes should be returned + :param type: Node type which should be returned. If None all node types are returned. + :param min_history_timesteps: Minimum history timesteps of a node to be returned. + :param min_future_timesteps: Minimum future timesteps of a node to be returned. + :param return_robot: Return a node if it is the robot. + :return: Dictionary with timesteps as keys and list of nodes as value. 
+ """ + + present_nodes = {} + + for node in self.nodes: + if node.is_robot and not return_robot: + continue + if type is None or node.type == type: + lower_bound = timesteps - min_history_timesteps + upper_bound = timesteps + min_future_timesteps + mask = (node.first_timestep <= lower_bound) & (upper_bound <= node.last_timestep) + if mask.any(): + timestep_indices_present = np.nonzero(mask)[0] + for timestep_index_present in timestep_indices_present: + if timesteps[timestep_index_present] in present_nodes.keys(): + present_nodes[timesteps[timestep_index_present]].append(node) + else: + present_nodes[timesteps[timestep_index_present]] = [node] + + return present_nodes + + def get_nodes_clipped_at_time(self, timesteps, state): + clipped_nodes = list() + + existing_nodes = self.present_nodes(timesteps) + all_nodes = set().union(*existing_nodes.values()) + if not all_nodes: + return clipped_nodes + + tr_scene = np.array([timesteps.min(), timesteps.max()]) + data_header_memo = dict() + for node in all_nodes: + if isinstance(node, MultiNode): + copied_node = copy.deepcopy(node.get_node_at_timesteps(tr_scene)) + copied_node.id = self.robot.id + else: + copied_node = copy.deepcopy(node) + + clipped_value = node.get(tr_scene, state[node.type]) + + if node.type not in data_header_memo: + data_header = list() + for quantity, values in state[node.type].items(): + for value in values: + data_header.append((quantity, value)) + + data_header_memo[node.type] = data_header + + copied_node.overwrite_data(clipped_value, data_header_memo[node.type]) + copied_node.first_timestep = tr_scene[0] + + clipped_nodes.append(copied_node) + + return clipped_nodes + + def sample_timesteps(self, batch_size, min_future_timesteps=0) -> np.ndarray: + """ + Sample a batch size of possible timesteps for the scene. + + :param batch_size: Number of timesteps to sample. + :param min_future_timesteps: Minimum future timesteps in the scene for a timestep to be returned. + :return: Numpy Array of sampled timesteps. + """ + if batch_size > self.timesteps: + batch_size = self.timesteps + return np.random.choice(np.arange(0, self.timesteps-min_future_timesteps), size=batch_size, replace=False) + + def augment(self): + if self.aug_func is not None: + return self.aug_func(self) + else: + return self + + def get_node_by_id(self, id): + for node in self.nodes: + if node.id == id: + return node + + def __repr__(self): + return f"Scene: Duration: {self.duration()}s," \ + f" Nodes: {len(self.nodes)}," \ + f" Map: {'Yes' if self.map is not None else 'No'}." 
diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene_graph.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene_graph.py new file mode 100644 index 000000000..1113bd4d1 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/environment/scene_graph.py @@ -0,0 +1,493 @@ +import numpy as np +from scipy.spatial.distance import pdist, squareform +import scipy.signal as ss +from collections import defaultdict +import warnings +from .node import Node + + +class Edge(object): + def __init__(self, curr_node, other_node): + self.id = self.get_edge_id(curr_node, other_node) + self.type = self.get_edge_type(curr_node, other_node) + self.curr_node = curr_node + self.other_node = other_node + + @staticmethod + def get_edge_id(n1, n2): + raise NotImplementedError("Use one of the Edge subclasses!") + + @staticmethod + def get_str_from_types(nt1, nt2): + raise NotImplementedError("Use one of the Edge subclasses!") + + @staticmethod + def get_edge_type(n1, n2): + raise NotImplementedError("Use one of the Edge subclasses!") + + def __eq__(self, other): + return (isinstance(other, self.__class__) + and self.id == other.id) + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash(self.id) + + def __repr__(self): + return self.id + + +class UndirectedEdge(Edge): + def __init__(self, curr_node, other_node): + super(UndirectedEdge, self).__init__(curr_node, other_node) + + @staticmethod + def get_edge_id(n1, n2): + return '-'.join(sorted([str(n1), str(n2)])) + + @staticmethod + def get_str_from_types(nt1, nt2): + return '-'.join(sorted([nt1.name, nt2.name])) + + @staticmethod + def get_edge_type(n1, n2): + return '-'.join(sorted([n1.type.name, n2.type.name])) + + +class DirectedEdge(Edge): + def __init__(self, curr_node, other_node): + super(DirectedEdge, self).__init__(curr_node, other_node) + + @staticmethod + def get_edge_id(n1, n2): + return '->'.join([str(n1), str(n2)]) + + @staticmethod + def get_str_from_types(nt1, nt2): + return '->'.join([nt1.name, nt2.name]) + + @staticmethod + def get_edge_type(n1, n2): + return '->'.join([n1.type.name, n2.type.name]) + + +class TemporalSceneGraph(object): + def __init__(self, + edge_radius, + nodes=None, + adj_cube=np.zeros((1, 0, 0)), + weight_cube=np.zeros((1, 0, 0)), + node_type_mat=np.zeros((0, 0)), + edge_scaling=None): + self.edge_radius = edge_radius + self.nodes = nodes + if nodes is None: + self.nodes = np.array([]) + self.adj_cube = adj_cube + self.weight_cube = weight_cube + self.node_type_mat = node_type_mat + self.adj_mat = np.max(self.adj_cube, axis=0).clip(max=1.0) + self.edge_scaling = edge_scaling + self.node_index_lookup = None + self.calculate_node_index_lookup() + + def calculate_node_index_lookup(self): + node_index_lookup = dict() + for i, node in enumerate(self.nodes): + node_index_lookup[node] = i + + self.node_index_lookup = node_index_lookup + + def get_num_edges(self, t=0): + return np.sum(self.adj_cube[t]) // 2 + + def get_index(self, node): + return self.node_index_lookup[node] + + @classmethod + def create_from_temp_scene_dict(cls, + scene_temp_dict, + attention_radius, + duration=1, + edge_addition_filter=None, + edge_removal_filter=None, + online=False): + """ + Construct a spatiotemporal graph from node positions in a dataset. + + :param scene_temp_dict: Dict with all nodes in scene as keys and np.ndarray with positions as value + :param attention_radius: Attention radius dict. 
+ :param duration: Temporal duration of the graph. + :param edge_addition_filter: - + :param edge_removal_filter: - + :return: TemporalSceneGraph + """ + + nodes = scene_temp_dict.keys() + N = len(nodes) + total_timesteps = duration + + if N == 0: + return TemporalSceneGraph(attention_radius) + + position_cube = np.full((total_timesteps, N, 2), np.nan) + + adj_cube = np.zeros((total_timesteps, N, N), dtype=np.int8) + dist_cube = np.zeros((total_timesteps, N, N), dtype=np.float) + + node_type_mat = np.zeros((N, N), dtype=np.int8) + node_attention_mat = np.zeros((N, N), dtype=np.float) + + for node_idx, node in enumerate(nodes): + if online: + # RingBuffers do not have a fixed constant size. Instead, they grow up to their capacity. Thus, + # we need to fill the values preceding the RingBuffer values with NaNs to make them fill the + # position_cube. + position_cube[-scene_temp_dict[node].shape[0]:, node_idx] = scene_temp_dict[node] + else: + position_cube[:, node_idx] = scene_temp_dict[node] + + node_type_mat[:, node_idx] = node.type.value + for node_idx_from, node_from in enumerate(nodes): + node_attention_mat[node_idx_from, node_idx] = attention_radius[(node_from.type, node.type)] + + np.fill_diagonal(node_type_mat, 0) + + for timestep in range(position_cube.shape[0]): + dists = squareform(pdist(position_cube[timestep], metric='euclidean')) + + # Put a 1 for all agent pairs which are closer than the edge_radius. + # Can produce a warning as dists can be nan if no data for node is available. + # This is accepted as nan <= x evaluates to False + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + adj_matrix = (dists <= node_attention_mat).astype(np.int8) * node_type_mat + + # Remove self-loops. + np.fill_diagonal(adj_matrix, 0) + + adj_cube[timestep] = adj_matrix + dist_cube[timestep] = dists + + dist_cube[np.isnan(dist_cube)] = 0. + weight_cube = np.divide(1., + dist_cube, + out=np.zeros_like(dist_cube), + where=(dist_cube > 0.)) + edge_scaling = None + if edge_addition_filter is not None and edge_removal_filter is not None: + edge_scaling = cls.calculate_edge_scaling(adj_cube, edge_addition_filter, edge_removal_filter) + tsg = cls(attention_radius, + np.array(list(nodes)), + adj_cube, weight_cube, + node_type_mat, + edge_scaling=edge_scaling) + return tsg + + @staticmethod + def calculate_edge_scaling(adj_cube, edge_addition_filter, edge_removal_filter): + shifted_right = np.pad(adj_cube, ((len(edge_addition_filter) - 1, 0), (0, 0), (0, 0)), 'constant', constant_values=0) + + new_edges = np.minimum( + ss.convolve(shifted_right, np.reshape(edge_addition_filter, (-1, 1, 1)), 'full'), 1. + )[(len(edge_addition_filter) - 1):-(len(edge_addition_filter) - 1)] + + new_edges[adj_cube == 0] = 0 + + result = np.minimum( + ss.convolve(new_edges, np.reshape(edge_removal_filter, (-1, 1, 1)), 'full'), 1. + )[:-(len(edge_removal_filter) - 1)] + + return result + + def to_scene_graph(self, t, t_hist=0, t_fut=0): + """ + Creates a Scene Graph from a Temporal Scene Graph + + :param t: Time in Temporal Scene Graph for which Scene Graph is created. + :param t_hist: Number of history timesteps which are considered to form edges in Scene Graph. + :param t_fut: Number of future timesteps which are considered to form edges in Scene Graph. 
+ :return: SceneGraph + """ + lower_t = np.clip(t-t_hist, a_min=0, a_max=None) + higher_t = np.clip(t + t_fut + 1, a_min=None, a_max=self.adj_cube.shape[0] + 1) + adj_mat = np.max(self.adj_cube[lower_t:higher_t], axis=0) + weight_mat = np.max(self.weight_cube[lower_t:higher_t], axis=0) + return SceneGraph(self.edge_radius, + self.nodes, + adj_mat, + weight_mat, + self.node_type_mat, + self.node_index_lookup, + edge_scaling=self.edge_scaling[t] if self.edge_scaling is not None else None) + + +class SceneGraph(object): + def __init__(self, + edge_radius, + nodes=None, + adj_mat=np.zeros((0, 0)), + weight_mat=np.zeros((0, 0)), + node_type_mat=np.zeros((0, 0)), + node_index_lookup=None, + edge_scaling=None): + self.edge_radius = edge_radius + self.nodes = nodes + if nodes is None: + self.nodes = np.array([]) + self.node_type_mat = node_type_mat + self.adj_mat = adj_mat + self.weight_mat = weight_mat + self.edge_scaling = edge_scaling + self.node_index_lookup = node_index_lookup + + def get_index(self, node): + return self.node_index_lookup[node] + + def get_num_edges(self): + return np.sum(self.adj_mat) // 2 + + def get_neighbors(self, node, node_type): + """ + Get all neighbors of a node. + + :param node: Node for which all neighbors are returned. + :param node_type: Specifies node types which are returned. + :return: List of all neighbors. + """ + node_index = self.get_index(node) + connection_mask = self.get_connection_mask(node_index) + mask = ((self.node_type_mat[node_index] == node_type.value) * connection_mask) + return self.nodes[mask] + + def get_edge_scaling(self, node=None): + if node is None: + return self.edge_scaling + else: + node_index = self.get_index(node) + connection_mask = self.get_connection_mask(node_index) + return self.edge_scaling[node_index, connection_mask] + + def get_edge_weight(self, node=None): + if node is None: + return self.weight_mat + else: + node_index = self.get_index(node) + connection_mask = self.get_connection_mask(node_index) + return self.weight_mat[node_index, connection_mask] + + def get_connection_mask(self, node_index): + if self.edge_scaling is None: # We do not use edge scaling + return self.adj_mat[node_index] > 0. 
+ else: + return self.edge_scaling[node_index] > 1e-2 + + def __sub__(self, other): + new_nodes = [node for node in self.nodes if node not in other.nodes] + removed_nodes = [node for node in other.nodes if node not in self.nodes] + + our_types = set(node.type for node in self.nodes) + other_types = set(node.type for node in other.nodes) + all_node_types = our_types | other_types + + new_neighbors = defaultdict(lambda: defaultdict(set)) + for node in self.nodes: + if node in removed_nodes: + continue + + if node in other.nodes: + for node_type in all_node_types: + new_items = set(self.get_neighbors(node, node_type)) - set(other.get_neighbors(node, node_type)) + if len(new_items) > 0: + new_neighbors[node][DirectedEdge.get_edge_type(node, Node(node_type, None, None))] = new_items + else: + for node_type in our_types: + neighbors = self.get_neighbors(node, node_type) + if len(neighbors) > 0: + new_neighbors[node][DirectedEdge.get_edge_type(node, Node(node_type, None, None))] = set(neighbors) + + removed_neighbors = defaultdict(lambda: defaultdict(set)) + for node in other.nodes: + if node in removed_nodes: + continue + + if node in self.nodes: + for node_type in all_node_types: + removed_items = set(other.get_neighbors(node, node_type)) - set(self.get_neighbors(node, node_type)) + if len(removed_items) > 0: + removed_neighbors[node][DirectedEdge.get_edge_type(node, Node(node_type, None, None))] = removed_items + else: + for node_type in other_types: + neighbors = other.get_neighbors(node, node_type) + if len(neighbors) > 0: + removed_neighbors[node][DirectedEdge.get_edge_type(node, Node(node_type, None, None))] = set(neighbors) + + return new_nodes, removed_nodes, new_neighbors, removed_neighbors + + +if __name__ == '__main__': + from environment import NodeTypeEnum + import time + + # # # # # # # # # # # # # # # # # + # Testing edge mask calculation # + # # # # # # # # # # # # # # # # # + B = np.array([[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0], + [1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0]])[:, :, np.newaxis, np.newaxis] + print(B.shape) + + edge_addition_filter = [0.25, 0.5, 0.75, 1.0] + edge_removal_filter = [1.0, 0.5, 0.0] + for i in range(B.shape[0]): + A = B[i] # (time, N, N) + + print(A[:, 0, 0]) + + start = time.time() + new_edges = np.minimum(ss.convolve(A, np.reshape(edge_addition_filter, (-1, 1, 1)), 'full'), 1.)[(len(edge_addition_filter) - 1):] + old_edges = np.minimum(ss.convolve(A, np.reshape(edge_removal_filter, (-1, 1, 1)), 'full'), 1.)[:-(len(edge_removal_filter) - 1)] + res = np.minimum(new_edges + old_edges, 1.)[:, 0, 0] + end = time.time() + print(end - start) + print(res) + + start = time.time() + res = TemporalSceneGraph.calculate_edge_scaling(A, edge_addition_filter, edge_removal_filter)[:, 0, 0] + end = time.time() + print(end - start) + print(res) + + print('-'*40) + + # # # # # # # # # # # # # # # + # Testing graph subtraction # + # # # # # # # # # # # # # # # + print('\n' + '-' * 40 + '\n') + + node_type_list = ['PEDESTRIAN', + 'BICYCLE', + 'VEHICLE'] + nte = NodeTypeEnum(node_type_list) + + attention_radius = dict() + attention_radius[(nte.PEDESTRIAN, nte.PEDESTRIAN)] = 5.0 + attention_radius[(nte.PEDESTRIAN, nte.VEHICLE)] = 20.0 + attention_radius[(nte.PEDESTRIAN, nte.BICYCLE)] = 10.0 + attention_radius[(nte.VEHICLE, nte.PEDESTRIAN)] = 20.0 + attention_radius[(nte.VEHICLE, nte.VEHICLE)] = 20.0 + attention_radius[(nte.VEHICLE, nte.BICYCLE)] = 20.0 + attention_radius[(nte.BICYCLE, 
nte.PEDESTRIAN)] = 10.0 + attention_radius[(nte.BICYCLE, nte.VEHICLE)] = 20.0 + attention_radius[(nte.BICYCLE, nte.BICYCLE)] = 10.0 + + scene_dict1 = {Node(nte.PEDESTRIAN, node_id='1'): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id='2'): np.array([0, 1])} + sg1 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict1, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0]).to_scene_graph(t=0) + + scene_dict2 = {Node(nte.PEDESTRIAN, node_id='1'): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id='2'): np.array([1, 1])} + sg2 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict2, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0]).to_scene_graph(t=0) + + new_nodes, removed_nodes, new_neighbors, removed_neighbors = sg2 - sg1 + print('New Nodes:', new_nodes) + print('Removed Nodes:', removed_nodes) + print('New Neighbors:', new_neighbors) + print('Removed Neighbors:', removed_neighbors) + + # # # # # # # # # # # # # # # + print('\n' + '-' * 40 + '\n') + + scene_dict1 = {Node(nte.PEDESTRIAN, node_id='1'): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id='2'): np.array([0, 1])} + sg1 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict1, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0]).to_scene_graph(t=0) + + scene_dict2 = {Node(nte.PEDESTRIAN, node_id='1'): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id='2'): np.array([1, 1]), + Node(nte.PEDESTRIAN, node_id='3'): np.array([20, 1])} + sg2 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict2, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0]).to_scene_graph(t=0) + + new_nodes, removed_nodes, new_neighbors, removed_neighbors = sg2 - sg1 + print('New Nodes:', new_nodes) + print('Removed Nodes:', removed_nodes) + print('New Neighbors:', new_neighbors) + print('Removed Neighbors:', removed_neighbors) + + # # # # # # # # # # # # # # # + print('\n' + '-' * 40 + '\n') + + scene_dict1 = {Node(nte.PEDESTRIAN, node_id='1'): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id='2'): np.array([0, 1])} + sg1 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict1, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0]).to_scene_graph(t=0) + + scene_dict2 = {Node(nte.PEDESTRIAN, node_id='1'): np.array([1, 0]), + Node(nte.PEDESTRIAN, node_id='2'): np.array([10, 1]), + Node(nte.PEDESTRIAN, node_id='3'): np.array([20, 1])} + sg2 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict2, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0]).to_scene_graph(t=0) + + new_nodes, removed_nodes, new_neighbors, removed_neighbors = sg2 - sg1 + print('New Nodes:', new_nodes) + print('Removed Nodes:', removed_nodes) + print('New Neighbors:', new_neighbors) + print('Removed Neighbors:', removed_neighbors) + + # # # # # # # # # # # # # # # + print('\n' + '-' * 40 + '\n') + + scene_dict1 = {Node(nte.PEDESTRIAN, node_id='1'): np.array([0, 0]), + Node(nte.PEDESTRIAN, node_id='2'): np.array([0, 1])} + sg1 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict1, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 
1.0], + edge_removal_filter=[1.0, 0.0]).to_scene_graph(t=0) + + scene_dict2 = {Node(nte.PEDESTRIAN, node_id='2'): np.array([10, 1]), + Node(nte.PEDESTRIAN, node_id='3'): np.array([12, 1]), + Node(nte.PEDESTRIAN, node_id='4'): np.array([13, 1])} + sg2 = TemporalSceneGraph.create_from_temp_scene_dict( + scene_dict2, + attention_radius=attention_radius, + duration=1, + edge_addition_filter=[0.25, 0.5, 0.75, 1.0], + edge_removal_filter=[1.0, 0.0]).to_scene_graph(t=0) + + new_nodes, removed_nodes, new_neighbors, removed_neighbors = sg2 - sg1 + print('New Nodes:', new_nodes) + print('Removed Nodes:', removed_nodes) + print('New Neighbors:', new_neighbors) + print('Removed Neighbors:', removed_neighbors) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/__init__.py new file mode 100644 index 000000000..6674ebbeb --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/__init__.py @@ -0,0 +1 @@ +from .evaluation import compute_batch_statistics, log_batch_errors, print_batch_errors \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/evaluation.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/evaluation.py new file mode 100644 index 000000000..8e5a643c2 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/evaluation/evaluation.py @@ -0,0 +1,140 @@ +import numpy as np +from scipy.interpolate import RectBivariateSpline +from scipy.ndimage import binary_dilation +from scipy.stats import gaussian_kde +from utils import prediction_output_to_trajectories +import visualization +from matplotlib import pyplot as plt + + +def compute_ade(predicted_trajs, gt_traj): + error = np.linalg.norm(predicted_trajs - gt_traj, axis=-1) + ade = np.mean(error, axis=-1) + return ade.flatten() + + +def compute_fde(predicted_trajs, gt_traj): + final_error = np.linalg.norm(predicted_trajs[:, :, -1] - gt_traj[-1], axis=-1) + return final_error.flatten() + + +def compute_kde_nll(predicted_trajs, gt_traj): + kde_ll = 0. 
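+ # Descriptive note: a Gaussian KDE is fit to the predicted sample positions at every
+ # (batch, timestep) pair and the ground-truth point is scored under it; the per-timestep
+ # log-densities are clipped from below, averaged over timesteps and batches, and negated.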
+ log_pdf_lower_bound = -20 + num_timesteps = gt_traj.shape[0] + num_batches = predicted_trajs.shape[0] + + for batch_num in range(num_batches): + for timestep in range(num_timesteps): + try: + kde = gaussian_kde(predicted_trajs[batch_num, :, timestep].T) + pdf = np.clip(kde.logpdf(gt_traj[timestep].T), a_min=log_pdf_lower_bound, a_max=None)[0] + kde_ll += pdf / (num_timesteps * num_batches) + except np.linalg.LinAlgError: + kde_ll = np.nan + + return -kde_ll + + +def compute_obs_violations(predicted_trajs, map): + obs_map = map.data + + interp_obs_map = RectBivariateSpline(range(obs_map.shape[1]), + range(obs_map.shape[0]), + binary_dilation(obs_map.T, iterations=4), + kx=1, ky=1) + + old_shape = predicted_trajs.shape + pred_trajs_map = map.to_map_points(predicted_trajs.reshape((-1, 2))) + + traj_obs_values = interp_obs_map(pred_trajs_map[:, 0], pred_trajs_map[:, 1], grid=False) + traj_obs_values = traj_obs_values.reshape((old_shape[0], old_shape[1])) + num_viol_trajs = np.sum(traj_obs_values.max(axis=1) > 0, dtype=float) + + return num_viol_trajs + + +def compute_batch_statistics(prediction_output_dict, + dt, + max_hl, + ph, + node_type_enum, + kde=True, + obs=False, + map=None, + prune_ph_to_future=False, + best_of=False): + + (prediction_dict, + _, + futures_dict) = prediction_output_to_trajectories(prediction_output_dict, + dt, + max_hl, + ph, + prune_ph_to_future=prune_ph_to_future) + + batch_error_dict = dict() + for node_type in node_type_enum: + batch_error_dict[node_type] = {'ade': list(), 'fde': list(), 'kde': list(), 'obs_viols': list()} + + for t in prediction_dict.keys(): + for node in prediction_dict[t].keys(): + ade_errors = compute_ade(prediction_dict[t][node], futures_dict[t][node]) + fde_errors = compute_fde(prediction_dict[t][node], futures_dict[t][node]) + if kde: + kde_ll = compute_kde_nll(prediction_dict[t][node], futures_dict[t][node]) + else: + kde_ll = 0 + if obs: + obs_viols = compute_obs_violations(prediction_dict[t][node], map) + else: + obs_viols = 0 + if best_of: + ade_errors = np.min(ade_errors, keepdims=True) + fde_errors = np.min(fde_errors, keepdims=True) + kde_ll = np.min(kde_ll) + batch_error_dict[node.type]['ade'].extend(list(ade_errors)) + batch_error_dict[node.type]['fde'].extend(list(fde_errors)) + batch_error_dict[node.type]['kde'].extend([kde_ll]) + batch_error_dict[node.type]['obs_viols'].extend([obs_viols]) + + return batch_error_dict + + +def log_batch_errors(batch_errors_list, log_writer, namespace, curr_iter, bar_plot=[], box_plot=[]): + for node_type in batch_errors_list[0].keys(): + for metric in batch_errors_list[0][node_type].keys(): + metric_batch_error = [] + for batch_errors in batch_errors_list: + metric_batch_error.extend(batch_errors[node_type][metric]) + + if len(metric_batch_error) > 0: + log_writer.add_histogram(f"{node_type.name}/{namespace}/{metric}", metric_batch_error, curr_iter) + log_writer.add_scalar(f"{node_type.name}/{namespace}/{metric}_mean", np.mean(metric_batch_error), curr_iter) + log_writer.add_scalar(f"{node_type.name}/{namespace}/{metric}_median", np.median(metric_batch_error), curr_iter) + + if metric in bar_plot: + pd = {'dataset': [namespace] * len(metric_batch_error), + metric: metric_batch_error} + kde_barplot_fig, ax = plt.subplots(figsize=(5, 5)) + visualization.visualization_utils.plot_barplots(ax, pd, 'dataset', metric) + log_writer.add_figure(f"{node_type.name}/{namespace}/{metric}_bar_plot", kde_barplot_fig, curr_iter) + + if metric in box_plot: + mse_fde_pd = {'dataset': [namespace] * 
len(metric_batch_error), + metric: metric_batch_error} + fig, ax = plt.subplots(figsize=(5, 5)) + visualization.visualization_utils.plot_boxplots(ax, mse_fde_pd, 'dataset', metric) + log_writer.add_figure(f"{node_type.name}/{namespace}/{metric}_box_plot", fig, curr_iter) + + +def print_batch_errors(batch_errors_list, namespace, curr_iter): + for node_type in batch_errors_list[0].keys(): + for metric in batch_errors_list[0][node_type].keys(): + metric_batch_error = [] + for batch_errors in batch_errors_list: + metric_batch_error.extend(batch_errors[node_type][metric]) + + if len(metric_batch_error) > 0: + print(f"{curr_iter}: {node_type.name}/{namespace}/{metric}_mean", np.mean(metric_batch_error)) + print(f"{curr_iter}: {node_type.name}/{namespace}/{metric}_median", np.median(metric_batch_error)) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/__init__.py new file mode 100644 index 000000000..2b3ee24cd --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/__init__.py @@ -0,0 +1,2 @@ +from model.trajectron import Trajectron +from model.mgcvae import MultimodalGenerativeCVAE diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/__init__.py new file mode 100644 index 000000000..116a37caf --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/__init__.py @@ -0,0 +1,4 @@ +from .discrete_latent import DiscreteLatent +from .gmm2d import GMM2D +from .map_encoder import CNNMapEncoder +from .additive_attention import AdditiveAttention, TemporallyBatchedAdditiveAttention diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/additive_attention.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/additive_attention.py new file mode 100644 index 000000000..93623242b --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/additive_attention.py @@ -0,0 +1,67 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class AdditiveAttention(nn.Module): + # Implementing the attention module of Bahdanau et al. 2015 where + # score(h_j, s_(i-1)) = v . 
tanh(W_1 h_j + W_2 s_(i-1)) + def __init__(self, encoder_hidden_state_dim, decoder_hidden_state_dim, internal_dim=None): + super(AdditiveAttention, self).__init__() + + if internal_dim is None: + internal_dim = int((encoder_hidden_state_dim + decoder_hidden_state_dim) / 2) + + self.w1 = nn.Linear(encoder_hidden_state_dim, internal_dim, bias=False) + self.w2 = nn.Linear(decoder_hidden_state_dim, internal_dim, bias=False) + self.v = nn.Linear(internal_dim, 1, bias=False) + + def score(self, encoder_state, decoder_state): + # encoder_state is of shape (batch, enc_dim) + # decoder_state is of shape (batch, dec_dim) + # return value should be of shape (batch, 1) + return self.v(torch.tanh(self.w1(encoder_state) + self.w2(decoder_state))) + + def forward(self, encoder_states, decoder_state): + # encoder_states is of shape (batch, num_enc_states, enc_dim) + # decoder_state is of shape (batch, dec_dim) + score_vec = torch.cat([self.score(encoder_states[:, i], decoder_state) for i in range(encoder_states.shape[1])], + dim=1) + # score_vec is of shape (batch, num_enc_states) + + attention_probs = torch.unsqueeze(F.softmax(score_vec, dim=1), dim=2) + # attention_probs is of shape (batch, num_enc_states, 1) + + final_context_vec = torch.sum(attention_probs * encoder_states, dim=1) + # final_context_vec is of shape (batch, enc_dim) + + return final_context_vec, attention_probs + + +class TemporallyBatchedAdditiveAttention(AdditiveAttention): + # Implementing the attention module of Bahdanau et al. 2015 where + # score(h_j, s_(i-1)) = v . tanh(W_1 h_j + W_2 s_(i-1)) + def __init__(self, encoder_hidden_state_dim, decoder_hidden_state_dim, internal_dim=None): + super(TemporallyBatchedAdditiveAttention, self).__init__(encoder_hidden_state_dim, + decoder_hidden_state_dim, + internal_dim) + + def score(self, encoder_state, decoder_state): + # encoder_state is of shape (batch, num_enc_states, max_time, enc_dim) + # decoder_state is of shape (batch, max_time, dec_dim) + # return value should be of shape (batch, num_enc_states, max_time, 1) + return self.v(torch.tanh(self.w1(encoder_state) + torch.unsqueeze(self.w2(decoder_state), dim=1))) + + def forward(self, encoder_states, decoder_state): + # encoder_states is of shape (batch, num_enc_states, max_time, enc_dim) + # decoder_state is of shape (batch, max_time, dec_dim) + score_vec = self.score(encoder_states, decoder_state) + # score_vec is of shape (batch, num_enc_states, max_time, 1) + + attention_probs = F.softmax(score_vec, dim=1) + # attention_probs is of shape (batch, num_enc_states, max_time, 1) + + final_context_vec = torch.sum(attention_probs * encoder_states, dim=1) + # final_context_vec is of shape (batch, max_time, enc_dim) + + return final_context_vec, torch.squeeze(torch.transpose(attention_probs, 1, 2), dim=3) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/discrete_latent.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/discrete_latent.py new file mode 100644 index 000000000..b7ae68b00 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/discrete_latent.py @@ -0,0 +1,109 @@ +import torch +import torch.distributions as td +import numpy as np +from ..model_utils import ModeKeys + + +class DiscreteLatent(object): + def __init__(self, hyperparams, device): + self.hyperparams = hyperparams + self.z_dim = hyperparams['N'] * hyperparams['K'] + self.N = hyperparams['N'] + self.K = hyperparams['K'] + self.kl_min = hyperparams['kl_min'] + 
self.device = device + self.temp = None # filled in by MultimodalGenerativeCVAE.set_annealing_params + self.z_logit_clip = None # filled in by MultimodalGenerativeCVAE.set_annealing_params + self.p_dist = None # filled in by MultimodalGenerativeCVAE.encoder + self.q_dist = None # filled in by MultimodalGenerativeCVAE.encoder + + def dist_from_h(self, h, mode): + logits_separated = torch.reshape(h, (-1, self.N, self.K)) + logits_separated_mean_zero = logits_separated - torch.mean(logits_separated, dim=-1, keepdim=True) + if self.z_logit_clip is not None and mode == ModeKeys.TRAIN: + c = self.z_logit_clip + logits = torch.clamp(logits_separated_mean_zero, min=-c, max=c) + else: + logits = logits_separated_mean_zero + + return td.OneHotCategorical(logits=logits) + + def sample_q(self, num_samples, mode): + bs = self.p_dist.probs.size()[0] + num_components = self.N * self.K + z_NK = torch.from_numpy(self.all_one_hot_combinations(self.N, self.K)).float().to(self.device).repeat(num_samples, bs) + return torch.reshape(z_NK, (num_samples * num_components, -1, self.z_dim)) + + def sample_p(self, num_samples, mode, most_likely_z=False, full_dist=True, all_z_sep=False): + num_components = 1 + if full_dist: + bs = self.p_dist.probs.size()[0] + z_NK = torch.from_numpy(self.all_one_hot_combinations(self.N, self.K)).float().to(self.device).repeat(num_samples, bs) + num_components = self.K ** self.N + k = num_samples * num_components + elif all_z_sep: + bs = self.p_dist.probs.size()[0] + z_NK = torch.from_numpy(self.all_one_hot_combinations(self.N, self.K)).float().to(self.device).repeat(1, bs) + k = self.K ** self.N + num_samples = k + elif most_likely_z: + # Sampling the most likely z from p(z|x). + eye_mat = torch.eye(self.p_dist.event_shape[-1], device=self.device) + argmax_idxs = torch.argmax(self.p_dist.probs, dim=2) + z_NK = torch.unsqueeze(eye_mat[argmax_idxs], dim=0).expand(num_samples, -1, -1, -1) + k = num_samples + else: + z_NK = self.p_dist.sample((num_samples,)) + k = num_samples + + if mode == ModeKeys.PREDICT: + return torch.reshape(z_NK, (k, -1, self.N * self.K)), num_samples, num_components + else: + return torch.reshape(z_NK, (k, -1, self.N * self.K)) + + def kl_q_p(self, log_writer=None, prefix=None, curr_iter=None): + kl_separated = td.kl_divergence(self.q_dist, self.p_dist) + if len(kl_separated.size()) < 2: + kl_separated = torch.unsqueeze(kl_separated, dim=0) + + kl_minibatch = torch.mean(kl_separated, dim=0, keepdim=True) + + if log_writer is not None: + log_writer.add_scalar(prefix + '/true_kl', torch.sum(kl_minibatch), curr_iter) + + if self.kl_min > 0: + kl_lower_bounded = torch.clamp(kl_minibatch, min=self.kl_min) + kl = torch.sum(kl_lower_bounded) + else: + kl = torch.sum(kl_minibatch) + + return kl + + def q_log_prob(self, z): + k = z.size()[0] + z_NK = torch.reshape(z, [k, -1, self.N, self.K]) + return torch.sum(self.q_dist.log_prob(z_NK), dim=2) + + def p_log_prob(self, z): + k = z.size()[0] + z_NK = torch.reshape(z, [k, -1, self.N, self.K]) + return torch.sum(self.p_dist.log_prob(z_NK), dim=2) + + def get_p_dist_probs(self): + return self.p_dist.probs + + @staticmethod + def all_one_hot_combinations(N, K): + return np.eye(K).take(np.reshape(np.indices([K] * N), [N, -1]).T, axis=0).reshape(-1, N * K) # [K**N, N*K] + + def summarize_for_tensorboard(self, log_writer, prefix, curr_iter): + log_writer.add_histogram(prefix + "/latent/p_z_x", self.p_dist.probs, curr_iter) + log_writer.add_histogram(prefix + "/latent/q_z_xy", self.q_dist.probs, curr_iter) + 
+ log_writer.add_histogram(prefix + "/latent/p_z_x_logits", self.p_dist.logits, curr_iter) + log_writer.add_histogram(prefix + "/latent/q_z_xy_logits", self.q_dist.logits, curr_iter) + if self.z_dim <= 9: + for i in range(self.N): + for j in range(self.K): + log_writer.add_histogram(prefix + "/latent/q_z_xy_logit{0}{1}".format(i, j), + self.q_dist.logits[:, i, j], + curr_iter) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/gmm2d.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/gmm2d.py new file mode 100644 index 000000000..37e373398 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/gmm2d.py @@ -0,0 +1,158 @@ +import torch +import torch.distributions as td +import numpy as np +from ..model_utils import to_one_hot + + +class GMM2D(td.Distribution): + r""" + Gaussian Mixture Model using 2D Multivariate Gaussians for each of its N components: + Cholesky decomposition and affine transformation for sampling: + + .. math:: Z \sim N(0, I) + + .. math:: S = \mu + LZ + + .. math:: S \sim N(\mu, \Sigma) \rightarrow N(\mu, LL^T) + + where :math:`L = chol(\Sigma)` and + + .. math:: \Sigma = \left[ {\begin{array}{cc} \sigma^2_x & \rho \sigma_x \sigma_y \\ \rho \sigma_x \sigma_y & \sigma^2_y \\ \end{array} } \right] + + such that + + .. math:: L = chol(\Sigma) = \left[ {\begin{array}{cc} \sigma_x & 0 \\ \rho \sigma_y & \sigma_y \sqrt{1-\rho^2} \\ \end{array} } \right] + + :param log_pis: Log Mixing Proportions :math:`log(\pi)`. [..., N] + :param mus: Mixture Components mean :math:`\mu`. [..., N * 2] + :param log_sigmas: Log Standard Deviations :math:`log(\sigma_d)`. [..., N * 2] + :param corrs: Cholesky factor of correlation :math:`\rho`. [..., N] + :param clip_lo: Clips the lower end of the standard deviation. + :param clip_hi: Clips the upper end of the standard deviation. 
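+
+ Example (illustrative sketch; shapes follow the parameter docs above, here with N = 2 components):
+
+ >>> gmm = GMM2D(log_pis=torch.zeros(1, 1, 1, 2), mus=torch.zeros(1, 1, 1, 4),
+ ... log_sigmas=torch.zeros(1, 1, 1, 4), corrs=torch.zeros(1, 1, 1, 2))
+ >>> sample = gmm.rsample() # one 2D position per batch element, shape (1, 1, 1, 2)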
+ """ + def __init__(self, log_pis, mus, log_sigmas, corrs): + super(GMM2D, self).__init__(batch_shape=log_pis.shape[0], event_shape=log_pis.shape[1:]) + self.components = log_pis.shape[-1] + self.dimensions = 2 + self.device = log_pis.device + + log_pis = torch.clamp(log_pis, min=-1e5) + self.log_pis = log_pis - torch.logsumexp(log_pis, dim=-1, keepdim=True) # [..., N] + self.mus = self.reshape_to_components(mus) # [..., N, 2] + self.log_sigmas = self.reshape_to_components(log_sigmas) # [..., N, 2] + self.sigmas = torch.exp(self.log_sigmas) # [..., N, 2] + self.one_minus_rho2 = 1 - corrs**2 # [..., N] + self.one_minus_rho2 = torch.clamp(self.one_minus_rho2, min=1e-5, max=1) # otherwise log can be nan + self.corrs = corrs # [..., N] + + self.L = torch.stack([torch.stack([self.sigmas[..., 0], torch.zeros_like(self.log_pis)], dim=-1), + torch.stack([self.sigmas[..., 1] * self.corrs, + self.sigmas[..., 1] * torch.sqrt(self.one_minus_rho2)], + dim=-1)], + dim=-2) + + self.pis_cat_dist = td.Categorical(logits=log_pis) + + @classmethod + def from_log_pis_mus_cov_mats(cls, log_pis, mus, cov_mats): + corrs_sigma12 = cov_mats[..., 0, 1] + sigma_1 = torch.clamp(cov_mats[..., 0, 0], min=1e-8) + sigma_2 = torch.clamp(cov_mats[..., 1, 1], min=1e-8) + sigmas = torch.stack([torch.sqrt(sigma_1), torch.sqrt(sigma_2)], dim=-1) + log_sigmas = torch.log(sigmas) + corrs = corrs_sigma12 / (torch.prod(sigmas, dim=-1)) + return cls(log_pis, mus, log_sigmas, corrs) + + def rsample(self, sample_shape=torch.Size()): + """ + Generates a sample_shape shaped reparameterized sample or sample_shape + shaped batch of reparameterized samples if the distribution parameters + are batched. + + :param sample_shape: Shape of the samples + :return: Samples from the GMM. + """ + mvn_samples = (self.mus + + torch.squeeze( + torch.matmul(self.L, + torch.unsqueeze( + torch.randn(size=sample_shape + self.mus.shape, device=self.device), + dim=-1) + ), + dim=-1)) + component_cat_samples = self.pis_cat_dist.sample(sample_shape) + selector = torch.unsqueeze(to_one_hot(component_cat_samples, self.components), dim=-1) + return torch.sum(mvn_samples*selector, dim=-2) + + def log_prob(self, value): + r""" + Calculates the log probability of a value using the PDF for bivariate normal distributions: + + .. math:: + f(x | \mu, \sigma, \rho)={\frac {1}{2\pi \sigma _{x}\sigma _{y}{\sqrt {1-\rho ^{2}}}}}\exp + \left(-{\frac {1}{2(1-\rho ^{2})}}\left[{\frac {(x-\mu _{x})^{2}}{\sigma _{x}^{2}}}+ + {\frac {(y-\mu _{y})^{2}}{\sigma _{y}^{2}}}-{\frac {2\rho (x-\mu _{x})(y-\mu _{y})} + {\sigma _{x}\sigma _{y}}}\right]\right) + + :param value: The log probability density function is evaluated at those values. 
+ :return: Log probability + """ + # x: [..., 2] + value = torch.unsqueeze(value, dim=-2) # [..., 1, 2] + dx = value - self.mus # [..., N, 2] + + exp_nominator = ((torch.sum((dx/self.sigmas)**2, dim=-1) # first and second term of exp nominator + - 2*self.corrs*torch.prod(dx, dim=-1)/torch.prod(self.sigmas, dim=-1))) # [..., N] + + component_log_p = -(2*np.log(2*np.pi) + + torch.log(self.one_minus_rho2) + + 2*torch.sum(self.log_sigmas, dim=-1) + + exp_nominator/self.one_minus_rho2) / 2 + + return torch.logsumexp(self.log_pis + component_log_p, dim=-1) + + def get_for_node_at_time(self, n, t): + return self.__class__(self.log_pis[:, n:n+1, t:t+1], self.mus[:, n:n+1, t:t+1], + self.log_sigmas[:, n:n+1, t:t+1], self.corrs[:, n:n+1, t:t+1]) + + def mode(self): + """ + Calculates the mode of the GMM by calculating probabilities of a 2D mesh grid + + :param required_accuracy: Accuracy of the meshgrid + :return: Mode of the GMM + """ + if self.mus.shape[-2] > 1: + samp, bs, time, comp, _ = self.mus.shape + assert samp == 1, "For taking the mode only one sample makes sense." + mode_node_list = [] + for n in range(bs): + mode_t_list = [] + for t in range(time): + nt_gmm = self.get_for_node_at_time(n, t) + x_min = self.mus[:, n, t, :, 0].min() + x_max = self.mus[:, n, t, :, 0].max() + y_min = self.mus[:, n, t, :, 1].min() + y_max = self.mus[:, n, t, :, 1].max() + search_grid = torch.stack(torch.meshgrid([torch.arange(x_min, x_max, 0.01), + torch.arange(y_min, y_max, 0.01)]), dim=2 + ).view(-1, 2).float().to(self.device) + + ll_score = nt_gmm.log_prob(search_grid) + argmax = torch.argmax(ll_score.squeeze(), dim=0) + mode_t_list.append(search_grid[argmax]) + mode_node_list.append(torch.stack(mode_t_list, dim=0)) + return torch.stack(mode_node_list, dim=0).unsqueeze(dim=0) + return torch.squeeze(self.mus, dim=-2) + + def reshape_to_components(self, tensor): + if len(tensor.shape) == 5: + return tensor + return torch.reshape(tensor, list(tensor.shape[:-1]) + [self.components, self.dimensions]) + + def get_covariance_matrix(self): + cov = self.corrs * torch.prod(self.sigmas, dim=-1) + E = torch.stack([torch.stack([self.sigmas[..., 0]**2, cov], dim=-1), + torch.stack([cov, self.sigmas[..., 1]**2], dim=-1)], + dim=-2) + return E diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/graph_attention.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/graph_attention.py new file mode 100644 index 000000000..fc8d89a03 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/graph_attention.py @@ -0,0 +1,58 @@ +import warnings +import math +import numbers +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import init, Parameter + + +class GraphMultiTypeAttention(nn.Module): + def __init__(self, in_features, hidden_features, out_features, bias=True, types=1): + super(GraphMultiTypeAttention, self).__init__() + self.types = types + self.in_features = in_features + self.out_features = out_features + self.node_self_loop_weight = Parameter(torch.Tensor(hidden_features, in_features[0])) + + self.weight_per_type = nn.ParameterList() + for i in range(types): + self.weight_per_type.append(Parameter(torch.Tensor(hidden_features, in_features[i]))) + if bias: + self.bias = Parameter(torch.Tensor(hidden_features)) + else: + self.register_parameter('bias', None) + + self.linear_to_out = nn.Linear(hidden_features, out_features, bias=bias) + + self.reset_parameters() + + def 
reset_parameters(self): + for weight in self.weight_per_type: + bound = 1 / math.sqrt(weight.size(1)) + init.uniform_(weight, -bound, bound) + bound = 1 / math.sqrt(self.node_self_loop_weight.size(1)) + init.uniform_(self.node_self_loop_weight, -bound, bound) + if self.bias is not None: + init.uniform_(self.bias, -bound, bound) + + def forward(self, inputs, types, edge_weights): + weight_list = list() + for i, type in enumerate(types): + weight_list.append((edge_weights[i] / len(edge_weights)) * self.weight_per_type[type].T) + weight_list.append(self.node_self_loop_weight.T) + weight = torch.cat(weight_list, dim=0) + stacked_input = torch.cat(inputs, dim=-1) + output = stacked_input.matmul(weight) + + output = output + + if self.bias is not None: + output += self.bias + + return torch.relu(self.linear_to_out(torch.relu(output))) + + def extra_repr(self): + return 'in_features={}, hidden_features={},, out_features={}, types={}, bias={}'.format( + self.in_features, self.hidden_features, self.out_features, self.types, self.bias is not None + ) \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/map_encoder.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/map_encoder.py new file mode 100644 index 000000000..27d6e1d36 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/components/map_encoder.py @@ -0,0 +1,28 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class CNNMapEncoder(nn.Module): + def __init__(self, map_channels, hidden_channels, output_size, masks, strides, patch_size): + super(CNNMapEncoder, self).__init__() + self.convs = nn.ModuleList() + patch_size_x = patch_size[0] + patch_size[2] + patch_size_y = patch_size[1] + patch_size[3] + input_size = (map_channels, patch_size_x, patch_size_y) + x_dummy = torch.ones(input_size).unsqueeze(0) * torch.tensor(float('nan')) + + for i, hidden_size in enumerate(hidden_channels): + self.convs.append(nn.Conv2d(map_channels if i == 0 else hidden_channels[i-1], + hidden_channels[i], masks[i], + stride=strides[i])) + x_dummy = self.convs[i](x_dummy) + + self.fc = nn.Linear(x_dummy.numel(), output_size) + + def forward(self, x, training): + for conv in self.convs: + x = F.leaky_relu(conv(x), 0.2) + x = torch.flatten(x, start_dim=1) + x = self.fc(x) + return x diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/__init__.py new file mode 100644 index 000000000..a01f88e8c --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/__init__.py @@ -0,0 +1,2 @@ +from .dataset import EnvironmentDataset, NodeTypeDataset +from .preprocessing import collate, get_node_timestep_data, get_timesteps_data, restore, get_relative_robot_traj diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/dataset.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/dataset.py new file mode 100644 index 000000000..ef361826c --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/dataset.py @@ -0,0 +1,76 @@ +from torch.utils import data +import numpy as np +from .preprocessing import get_node_timestep_data + + +class EnvironmentDataset(object): + def __init__(self, env, state, pred_state, node_freq_mult, scene_freq_mult, hyperparams, **kwargs): + self.env = env + self.state = state + 
self.pred_state = pred_state + self.hyperparams = hyperparams + self.max_ht = self.hyperparams['maximum_history_length'] + self.max_ft = kwargs['min_future_timesteps'] + self.node_type_datasets = list() + self._augment = False + for node_type in env.NodeType: + if node_type not in hyperparams['pred_state']: + continue + self.node_type_datasets.append(NodeTypeDataset(env, node_type, state, pred_state, node_freq_mult, + scene_freq_mult, hyperparams, **kwargs)) + + @property + def augment(self): + return self._augment + + @augment.setter + def augment(self, value): + self._augment = value + for node_type_dataset in self.node_type_datasets: + node_type_dataset.augment = value + + def __iter__(self): + return iter(self.node_type_datasets) + + +class NodeTypeDataset(data.Dataset): + def __init__(self, env, node_type, state, pred_state, node_freq_mult, + scene_freq_mult, hyperparams, augment=False, **kwargs): + self.env = env + self.state = state + self.pred_state = pred_state + self.hyperparams = hyperparams + self.max_ht = self.hyperparams['maximum_history_length'] + self.max_ft = kwargs['min_future_timesteps'] + + self.augment = augment + + self.node_type = node_type + self.index = self.index_env(node_freq_mult, scene_freq_mult, **kwargs) + self.len = len(self.index) + self.edge_types = [edge_type for edge_type in env.get_edge_types() if edge_type[0] is node_type] + + def index_env(self, node_freq_mult, scene_freq_mult, **kwargs): + index = list() + for scene in self.env.scenes: + present_node_dict = scene.present_nodes(np.arange(0, scene.timesteps), type=self.node_type, **kwargs) + for t, nodes in present_node_dict.items(): + for node in nodes: + index += [(scene, t, node)] *\ + (scene.frequency_multiplier if scene_freq_mult else 1) *\ + (node.frequency_multiplier if node_freq_mult else 1) + + return index + + def __len__(self): + return self.len + + def __getitem__(self, i): + (scene, t, node) = self.index[i] + + if self.augment: + scene = scene.augment() + node = scene.get_node_by_id(node.id) + + return get_node_timestep_data(self.env, scene, t, node, self.state, self.pred_state, + self.edge_types, self.max_ht, self.max_ft, self.hyperparams) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/homography_warper.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/homography_warper.py new file mode 100644 index 000000000..885ab5f9a --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/homography_warper.py @@ -0,0 +1,471 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple, Optional + + +pi = torch.tensor(3.14159265358979323846) + + +def deg2rad(tensor: torch.Tensor) -> torch.Tensor: + r"""Function that converts angles from degrees to radians. + Args: + tensor (torch.Tensor): Tensor of arbitrary shape. + Returns: + torch.Tensor: tensor with same shape as input. + """ + if not isinstance(tensor, torch.Tensor): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(tensor))) + + return tensor * pi.to(tensor.device).type(tensor.dtype) / 180. + + +def angle_to_rotation_matrix(angle: torch.Tensor) -> torch.Tensor: + """ + Creates a rotation matrix out of angles in degrees + Args: + angle: (torch.Tensor): tensor of angles in degrees, any shape. + Returns: + torch.Tensor: tensor of *x2x2 rotation matrices. 
+ Shape: + - Input: :math:`(*)` + - Output: :math:`(*, 2, 2)` + Example: + >>> input = torch.rand(1, 3) # Nx3 + >>> output = kornia.angle_to_rotation_matrix(input) # Nx3x2x2 + """ + ang_rad = deg2rad(angle) + cos_a: torch.Tensor = torch.cos(ang_rad) + sin_a: torch.Tensor = torch.sin(ang_rad) + return torch.stack([cos_a, sin_a, -sin_a, cos_a], dim=-1).view(*angle.shape, 2, 2) + + +def get_rotation_matrix2d( + center: torch.Tensor, + angle: torch.Tensor, + scale: torch.Tensor) -> torch.Tensor: + r"""Calculates an affine matrix of 2D rotation. + The function calculates the following matrix: + .. math:: + \begin{bmatrix} + \alpha & \beta & (1 - \alpha) \cdot \text{x} + - \beta \cdot \text{y} \\ + -\beta & \alpha & \beta \cdot \text{x} + + (1 - \alpha) \cdot \text{y} + \end{bmatrix} + where + .. math:: + \alpha = \text{scale} \cdot cos(\text{radian}) \\ + \beta = \text{scale} \cdot sin(\text{radian}) + The transformation maps the rotation center to itself + If this is not the target, adjust the shift. + Args: + center (Tensor): center of the rotation in the source image. + angle (Tensor): rotation radian in degrees. Positive values mean + counter-clockwise rotation (the coordinate origin is assumed to + be the top-left corner). + scale (Tensor): isotropic scale factor. + Returns: + Tensor: the affine matrix of 2D rotation. + Shape: + - Input: :math:`(B, 2)`, :math:`(B)` and :math:`(B)` + - Output: :math:`(B, 2, 3)` + Example: + >>> center = torch.zeros(1, 2) + >>> scale = torch.ones(1) + >>> radian = 45. * torch.ones(1) + >>> M = kornia.get_rotation_matrix2d(center, radian, scale) + tensor([[[ 0.7071, 0.7071, 0.0000], + [-0.7071, 0.7071, 0.0000]]]) + """ + if not torch.is_tensor(center): + raise TypeError("Input center type is not a torch.Tensor. Got {}" + .format(type(center))) + if not torch.is_tensor(angle): + raise TypeError("Input radian type is not a torch.Tensor. Got {}" + .format(type(angle))) + if not torch.is_tensor(scale): + raise TypeError("Input scale type is not a torch.Tensor. Got {}" + .format(type(scale))) + if not (len(center.shape) == 2 and center.shape[1] == 2): + raise ValueError("Input center must be a Bx2 tensor. Got {}" + .format(center.shape)) + if not len(angle.shape) == 1: + raise ValueError("Input radian must be a B tensor. Got {}" + .format(angle.shape)) + if not len(scale.shape) == 1: + raise ValueError("Input scale must be a B tensor. Got {}" + .format(scale.shape)) + if not (center.shape[0] == angle.shape[0] == scale.shape[0]): + raise ValueError("Inputs must have same batch size dimension. Got {}" + .format(center.shape, angle.shape, scale.shape)) + # convert radian and apply scale + scaled_rotation: torch.Tensor = angle_to_rotation_matrix(angle) * scale.view(-1, 1, 1) + alpha: torch.Tensor = scaled_rotation[:, 0, 0] + beta: torch.Tensor = scaled_rotation[:, 0, 1] + + # unpack the center to x, y coordinates + x: torch.Tensor = center[..., 0] + y: torch.Tensor = center[..., 1] + + # create output tensor + batch_size: int = center.shape[0] + M: torch.Tensor = torch.zeros( + batch_size, 2, 3, device=center.device, dtype=center.dtype) + M[..., 0:2, 0:2] = scaled_rotation + M[..., 0, 2] = (torch.tensor(1.) - alpha) * x - beta * y + M[..., 1, 2] = beta * x + (torch.tensor(1.) - alpha) * y + return M + +def convert_points_to_homogeneous(points: torch.Tensor) -> torch.Tensor: + r"""Function that converts points from Euclidean to homogeneous space. 
+ Examples:: + >>> input = torch.rand(2, 4, 3) # BxNx3 + >>> output = kornia.convert_points_to_homogeneous(input) # BxNx4 + """ + if not isinstance(points, torch.Tensor): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(points))) + if len(points.shape) < 2: + raise ValueError("Input must be at least a 2D tensor. Got {}".format( + points.shape)) + + return torch.nn.functional.pad(points, [0, 1], "constant", 1.0) + + +def convert_points_from_homogeneous( + points: torch.Tensor, eps: float = 1e-8) -> torch.Tensor: + r"""Function that converts points from homogeneous to Euclidean space. + Examples:: + >>> input = torch.rand(2, 4, 3) # BxNx3 + >>> output = kornia.convert_points_from_homogeneous(input) # BxNx2 + """ + if not isinstance(points, torch.Tensor): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(points))) + + if len(points.shape) < 2: + raise ValueError("Input must be at least a 2D tensor. Got {}".format( + points.shape)) + + # we check for points at infinity + z_vec: torch.Tensor = points[..., -1:] + + # set the results of division by zeror/near-zero to 1.0 + # follow the convention of opencv: + # https://github.com/opencv/opencv/pull/14411/files + mask: torch.Tensor = torch.abs(z_vec) > eps + scale: torch.Tensor = torch.ones_like(z_vec).masked_scatter_( + mask, torch.tensor(1.0).to(points.device) / z_vec[mask]) + + return scale * points[..., :-1] + +def transform_points(trans_01: torch.Tensor, + points_1: torch.Tensor) -> torch.Tensor: + r"""Function that applies transformations to a set of points. + Args: + trans_01 (torch.Tensor): tensor for transformations of shape + :math:`(B, D+1, D+1)`. + points_1 (torch.Tensor): tensor of points of shape :math:`(B, N, D)`. + Returns: + torch.Tensor: tensor of N-dimensional points. + Shape: + - Output: :math:`(B, N, D)` + Examples: + >>> points_1 = torch.rand(2, 4, 3) # BxNx3 + >>> trans_01 = torch.eye(4).view(1, 4, 4) # Bx4x4 + >>> points_0 = kornia.transform_points(trans_01, points_1) # BxNx3 + """ + if not torch.is_tensor(trans_01) or not torch.is_tensor(points_1): + raise TypeError("Input type is not a torch.Tensor") + if not trans_01.device == points_1.device: + raise TypeError("Tensor must be in the same device") + if not trans_01.shape[0] == points_1.shape[0] and trans_01.shape[0] != 1: + raise ValueError("Input batch size must be the same for both tensors or 1") + if not trans_01.shape[-1] == (points_1.shape[-1] + 1): + raise ValueError("Last input dimensions must differe by one unit") + # to homogeneous + points_1_h = convert_points_to_homogeneous(points_1) # BxNxD+1 + # transform coordinates + points_0_h = torch.matmul( + trans_01.unsqueeze(1), points_1_h.unsqueeze(-1)) + points_0_h = torch.squeeze(points_0_h, dim=-1) + # to euclidean + points_0 = convert_points_from_homogeneous(points_0_h) # BxNxD + return points_0 + + +def multi_linspace(a, b, num, endpoint=True, device='cpu', dtype=torch.float): + """This function is just like np.linspace, but will create linearly + spaced vectors from a start to end vector. + Inputs: + a - Start vector. + b - End vector. + num - Number of samples to generate. Default is 50. Must be above 0. + endpoint - If True, b is the last sample. + Otherwise, it is not included. Default is True. 
+ """ + + return a[..., None] + (b-a)[..., None]/(num-endpoint) * torch.arange(num, device=device, dtype=dtype) + + +def create_batched_meshgrid( + x_min: torch.Tensor, + y_min: torch.Tensor, + x_max: torch.Tensor, + y_max: torch.Tensor, + height: int, + width: int, + device: Optional[torch.device] = torch.device('cpu')) -> torch.Tensor: + """Generates a coordinate grid for an image. + When the flag `normalized_coordinates` is set to True, the grid is + normalized to be in the range [-1,1] to be consistent with the pytorch + function grid_sample. + http://pytorch.org/docs/master/nn.html#torch.nn.functional.grid_sample + Args: + height (int): the image height (rows). + width (int): the image width (cols). + normalized_coordinates (Optional[bool]): whether to normalize + coordinates in the range [-1, 1] in order to be consistent with the + PyTorch function grid_sample. + Return: + torch.Tensor: returns a grid tensor with shape :math:`(1, H, W, 2)`. + """ + # generate coordinates + xs = multi_linspace(x_min, x_max, width, device=device, dtype=torch.float) + ys = multi_linspace(y_min, y_max, height, device=device, dtype=torch.float) + + # generate grid by stacking coordinates + bs = x_min.shape[0] + batched_grid_i_list = list() + for i in range(bs): + batched_grid_i_list.append(torch.stack(torch.meshgrid([xs[i], ys[i]])).transpose(1, 2)) # 2xHxW + batched_grid: torch.Tensor = torch.stack(batched_grid_i_list, dim=0) + return batched_grid.permute(0, 2, 3, 1) # BxHxWx2 + + +def homography_warp(patch_src: torch.Tensor, + centers: torch.Tensor, + dst_homo_src: torch.Tensor, + dsize: Tuple[int, int], + mode: str = 'bilinear', + padding_mode: str = 'zeros') -> torch.Tensor: + r"""Function that warps image patchs or tensors by homographies. + See :class:`~kornia.geometry.warp.HomographyWarper` for details. + Args: + patch_src (torch.Tensor): The image or tensor to warp. Should be from + source of shape :math:`(N, C, H, W)`. + dst_homo_src (torch.Tensor): The homography or stack of homographies + from source to destination of shape + :math:`(N, 3, 3)`. + dsize (Tuple[int, int]): The height and width of the image to warp. + mode (str): interpolation mode to calculate output values + 'bilinear' | 'nearest'. Default: 'bilinear'. + padding_mode (str): padding mode for outside grid values + 'zeros' | 'border' | 'reflection'. Default: 'zeros'. + Return: + torch.Tensor: Patch sampled at locations from source to destination. + Example: + >>> input = torch.rand(1, 3, 32, 32) + >>> homography = torch.eye(3).view(1, 3, 3) + >>> output = kornia.homography_warp(input, homography, (32, 32)) + """ + + out_height, out_width = dsize + image_height, image_width = patch_src.shape[-2:] + x_min = 2. * (centers[..., 0] - out_width/2) / image_width - 1. + y_min = 2. * (centers[..., 1] - out_height/2) / image_height - 1. + x_max = 2. * (centers[..., 0] + out_width/2) / image_width - 1. + y_max = 2. * (centers[..., 1] + out_height/2) / image_height - 1. 
+ warper = HomographyWarper(x_min, y_min, x_max, y_max, out_height, out_width, mode, padding_mode) + return warper(patch_src, dst_homo_src) + + +def normal_transform_pixel(height, width): + + tr_mat = torch.Tensor([[1.0, 0.0, -1.0], + [0.0, 1.0, -1.0], + [0.0, 0.0, 1.0]]) # 1x3x3 + + tr_mat[0, 0] = tr_mat[0, 0] * 2.0 / (width - 1.0) + tr_mat[1, 1] = tr_mat[1, 1] * 2.0 / (height - 1.0) + + tr_mat = tr_mat.unsqueeze(0) + + return tr_mat + + +def src_norm_to_dst_norm(dst_pix_trans_src_pix: torch.Tensor, + dsize_src: Tuple[int, int], dsize_dst: Tuple[int, int]) -> torch.Tensor: + # source and destination sizes + src_h, src_w = dsize_src + dst_h, dst_w = dsize_dst + # the devices and types + device: torch.device = dst_pix_trans_src_pix.device + dtype: torch.dtype = dst_pix_trans_src_pix.dtype + # compute the transformation pixel/norm for src/dst + src_norm_trans_src_pix: torch.Tensor = normal_transform_pixel( + src_h, src_w).to(device, dtype) + src_pix_trans_src_norm = torch.inverse(src_norm_trans_src_pix) + dst_norm_trans_dst_pix: torch.Tensor = normal_transform_pixel( + dst_h, dst_w).to(device, dtype) + # compute chain transformations + dst_norm_trans_src_norm: torch.Tensor = ( + dst_norm_trans_dst_pix @ (dst_pix_trans_src_pix @ src_pix_trans_src_norm) + ) + return dst_norm_trans_src_norm + + +def transform_warp_impl(src: torch.Tensor, centers: torch.Tensor, dst_pix_trans_src_pix: torch.Tensor, + dsize_src: Tuple[int, int], dsize_dst: Tuple[int, int], + grid_mode: str, padding_mode: str) -> torch.Tensor: + """Compute the transform in normalized cooridnates and perform the warping. + """ + dst_norm_trans_src_norm: torch.Tensor = src_norm_to_dst_norm( + dst_pix_trans_src_pix, dsize_src, dsize_src) + + src_norm_trans_dst_norm = torch.inverse(dst_norm_trans_src_norm) + return homography_warp(src, centers, src_norm_trans_dst_norm, dsize_dst, grid_mode, padding_mode) + + +class HomographyWarper(nn.Module): + r"""Warps image patches or tensors by homographies. + .. math:: + X_{dst} = H_{src}^{\{dst\}} * X_{src} + Args: + height (int): The height of the image to warp. + width (int): The width of the image to warp. + mode (str): interpolation mode to calculate output values + 'bilinear' | 'nearest'. Default: 'bilinear'. + padding_mode (str): padding mode for outside grid values + 'zeros' | 'border' | 'reflection'. Default: 'zeros'. + """ + + def __init__( + self, + x_min: torch.Tensor, + y_min: torch.Tensor, + x_max: torch.Tensor, + y_max: torch.Tensor, + height: int, + width: int, + mode: str = 'bilinear', + padding_mode: str = 'zeros') -> None: + super(HomographyWarper, self).__init__() + self.width: int = width + self.height: int = height + self.mode: str = mode + self.padding_mode: str = padding_mode + + # create base grid to compute the flow + self.grid: torch.Tensor = create_batched_meshgrid(x_min, y_min, x_max, y_max, height, width) + + def warp_grid(self, dst_homo_src: torch.Tensor) -> torch.Tensor: + r"""Computes the grid to warp the coordinates grid by an homography. + Args: + dst_homo_src (torch.Tensor): Homography or homographies (stacked) to + transform all points in the grid. Shape of the + homography has to be :math:`(N, 3, 3)`. + Returns: + torch.Tensor: the transformed grid of shape :math:`(N, H, W, 2)`. 
+ """ + batch_size: int = dst_homo_src.shape[0] + device: torch.device = dst_homo_src.device + dtype: torch.dtype = dst_homo_src.dtype + # expand grid to match the input batch size + grid: torch.Tensor = self.grid + if len(dst_homo_src.shape) == 3: # local homography case + dst_homo_src = dst_homo_src.view(batch_size, 1, 3, 3) # NxHxWx3x3 + # perform the actual grid transformation, + # the grid is copied to input device and casted to the same type + flow: torch.Tensor = transform_points( + dst_homo_src, grid.to(device).to(dtype)) # NxHxWx2 + return flow.view(batch_size, self.height, self.width, 2) # NxHxWx2 + + def forward( # type: ignore + self, + patch_src: torch.Tensor, + dst_homo_src: torch.Tensor) -> torch.Tensor: + r"""Warps an image or tensor from source into reference frame. + Args: + patch_src (torch.Tensor): The image or tensor to warp. + Should be from source. + dst_homo_src (torch.Tensor): The homography or stack of homographies + from source to destination. The homography assumes normalized + coordinates [-1, 1]. + Return: + torch.Tensor: Patch sampled at locations from source to destination. + Shape: + - Input: :math:`(N, C, H, W)` and :math:`(N, 3, 3)` + - Output: :math:`(N, C, H, W)` + Example: + >>> input = torch.rand(1, 3, 32, 32) + >>> homography = torch.eye(3).view(1, 3, 3) + >>> warper = kornia.HomographyWarper(32, 32) + >>> output = warper(input, homography) # NxCxHxW + """ + if not dst_homo_src.device == patch_src.device: + raise TypeError("Patch and homography must be on the same device. \ + Got patch.device: {} dst_H_src.device: {}." + .format(patch_src.device, dst_homo_src.device)) + + return F.grid_sample(patch_src, self.warp_grid(dst_homo_src), # type: ignore + mode=self.mode, padding_mode=self.padding_mode, align_corners=True) + + +def warp_affine_crop(src: torch.Tensor, centers: torch.Tensor, M: torch.Tensor, + dsize: Tuple[int, int], flags: str = 'bilinear', + padding_mode: str = 'zeros') -> torch.Tensor: + r"""Applies an affine transformation to a tensor. + + The function warp_affine transforms the source tensor using + the specified matrix: + + .. math:: + \text{dst}(x, y) = \text{src} \left( M_{11} x + M_{12} y + M_{13} , + M_{21} x + M_{22} y + M_{23} \right ) + + Args: + src (torch.Tensor): input tensor of shape :math:`(B, C, H, W)`. + M (torch.Tensor): affine transformation of shape :math:`(B, 2, 3)`. + dsize (Tuple[int, int]): size of the output image (height, width). + mode (str): interpolation mode to calculate output values + 'bilinear' | 'nearest'. Default: 'bilinear'. + padding_mode (str): padding mode for outside grid values + 'zeros' | 'border' | 'reflection'. Default: 'zeros'. + + Returns: + torch.Tensor: the warped tensor. + + Shape: + - Output: :math:`(B, C, H, W)` + + .. note:: + See a working example `here `__. + """ + if not torch.is_tensor(src): + raise TypeError("Input src type is not a torch.Tensor. Got {}" + .format(type(src))) + + if not torch.is_tensor(M): + raise TypeError("Input M type is not a torch.Tensor. Got {}" + .format(type(M))) + + if not len(src.shape) == 4: + raise ValueError("Input src must be a BxCxHxW tensor. Got {}" + .format(src.shape)) + + if not (len(M.shape) == 3 or M.shape[-2:] == (2, 3)): + raise ValueError("Input M must be a Bx2x3 tensor. 
Got {}" + .format(src.shape)) + + # we generate a 3x3 transformation matrix from 2x3 affine + M_3x3: torch.Tensor = F.pad(M, [0, 0, 0, 1, 0, 0], + mode="constant", value=0) + M_3x3[:, 2, 2] += 1.0 + + # launches the warper + h, w = src.shape[-2:] + return transform_warp_impl(src, centers, M_3x3, (h, w), dsize, flags, padding_mode) \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/preprocessing.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/preprocessing.py new file mode 100644 index 000000000..844d3068c --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dataset/preprocessing.py @@ -0,0 +1,234 @@ +import torch +import numpy as np +import collections.abc +from torch.utils.data._utils.collate import default_collate +import dill +container_abcs = collections.abc + + +def restore(data): + """ + In case we dilled some structures to share between multiple process this function will restore them. + If the data input are not bytes we assume it was not dilled in the first place + + :param data: Possibly dilled data structure + :return: Un-dilled data structure + """ + if type(data) is bytes: + return dill.loads(data) + return data + + +def collate(batch): + if len(batch) == 0: + return batch + elem = batch[0] + if elem is None: + return None + elif isinstance(elem, container_abcs.Sequence): + if len(elem) == 4: # We assume those are the maps, map points, headings and patch_size + scene_map, scene_pts, heading_angle, patch_size = zip(*batch) + if heading_angle[0] is None: + heading_angle = None + else: + heading_angle = torch.Tensor(heading_angle) + map = scene_map[0].get_cropped_maps_from_scene_map_batch(scene_map, + scene_pts=torch.Tensor(scene_pts), + patch_size=patch_size[0], + rotation=heading_angle) + return map + transposed = zip(*batch) + return [collate(samples) for samples in transposed] + elif isinstance(elem, container_abcs.Mapping): + # We have to dill the neighbors structures. Otherwise each tensor is put into + # shared memory separately -> slow, file pointer overhead + # we only do this in multiprocessing + neighbor_dict = {key: [d[key] for d in batch] for key in elem} + return dill.dumps(neighbor_dict) if torch.utils.data.get_worker_info() else neighbor_dict + return default_collate(batch) + + +def get_relative_robot_traj(env, state, node_traj, robot_traj, node_type, robot_type): + # TODO: We will have to make this more generic if robot_type != node_type + # Make Robot State relative to node + _, std = env.get_standardize_params(state[robot_type], node_type=robot_type) + std[0:2] = env.attention_radius[(node_type, robot_type)] + robot_traj_st = env.standardize(robot_traj, + state[robot_type], + node_type=robot_type, + mean=node_traj, + std=std) + robot_traj_st_t = torch.tensor(robot_traj_st, dtype=torch.float) + + return robot_traj_st_t + + +def get_node_timestep_data(env, scene, t, node, state, pred_state, + edge_types, max_ht, max_ft, hyperparams, + scene_graph=None): + """ + Pre-processes the data for a single batch element: node state over time for a specific time in a specific scene + as well as the neighbour data for it. 
+ + :param env: Environment + :param scene: Scene + :param t: Timestep in scene + :param node: Node + :param state: Specification of the node state + :param pred_state: Specification of the prediction state + :param edge_types: List of all Edge Types for which neighbours are pre-processed + :param max_ht: Maximum history timesteps + :param max_ft: Maximum future timesteps (prediction horizon) + :param hyperparams: Model hyperparameters + :param scene_graph: If scene graph was already computed for this scene and time you can pass it here + :return: Batch Element + """ + + # Node + timestep_range_x = np.array([t - max_ht, t]) + timestep_range_y = np.array([t + 1, t + max_ft]) + + x = node.get(timestep_range_x, state[node.type]) + y = node.get(timestep_range_y, pred_state[node.type]) + first_history_index = (max_ht - node.history_points_at(t)).clip(0) + + _, std = env.get_standardize_params(state[node.type], node.type) + std[0:2] = env.attention_radius[(node.type, node.type)] + rel_state = np.zeros_like(x[0]) + rel_state[0:2] = np.array(x)[-1, 0:2] + x_st = env.standardize(x, state[node.type], node.type, mean=rel_state, std=std) + if list(pred_state[node.type].keys())[0] == 'position': # If we predict position we do it relative to current pos + y_st = env.standardize(y, pred_state[node.type], node.type, mean=rel_state[0:2]) + else: + y_st = env.standardize(y, pred_state[node.type], node.type) + + x_t = torch.tensor(x, dtype=torch.float) + y_t = torch.tensor(y, dtype=torch.float) + x_st_t = torch.tensor(x_st, dtype=torch.float) + y_st_t = torch.tensor(y_st, dtype=torch.float) + + # Neighbors + neighbors_data_st = None + neighbors_edge_value = None + if hyperparams['edge_encoding']: + # Scene Graph + scene_graph = scene.get_scene_graph(t, + env.attention_radius, + hyperparams['edge_addition_filter'], + hyperparams['edge_removal_filter']) if scene_graph is None else scene_graph + + neighbors_data_st = dict() + neighbors_edge_value = dict() + for edge_type in edge_types: + neighbors_data_st[edge_type] = list() + # We get all nodes which are connected to the current node for the current timestep + connected_nodes = scene_graph.get_neighbors(node, edge_type[1]) + + if hyperparams['dynamic_edges'] == 'yes': + # We get the edge masks for the current node at the current timestep + edge_masks = torch.tensor(scene_graph.get_edge_scaling(node), dtype=torch.float) + neighbors_edge_value[edge_type] = edge_masks + + for connected_node in connected_nodes: + neighbor_state_np = connected_node.get(np.array([t - max_ht, t]), + state[connected_node.type], + padding=0.0) + + # Make State relative to node where neighbor and node have same state + _, std = env.get_standardize_params(state[connected_node.type], node_type=connected_node.type) + std[0:2] = env.attention_radius[edge_type] + equal_dims = np.min((neighbor_state_np.shape[-1], x.shape[-1])) + rel_state = np.zeros_like(neighbor_state_np) + rel_state[:, ..., :equal_dims] = x[-1, ..., :equal_dims] + neighbor_state_np_st = env.standardize(neighbor_state_np, + state[connected_node.type], + node_type=connected_node.type, + mean=rel_state, + std=std) + + neighbor_state = torch.tensor(neighbor_state_np_st, dtype=torch.float) + neighbors_data_st[edge_type].append(neighbor_state) + + # Robot + robot_traj_st_t = None + if hyperparams['incl_robot_node']: + timestep_range_r = np.array([t, t + max_ft]) + if scene.non_aug_scene is not None: + robot = scene.get_node_by_id(scene.non_aug_scene.robot.id) + else: + robot = scene.robot + robot_type = robot.type + robot_traj = 
robot.get(timestep_range_r, state[robot_type], padding=0.0) + node_state = np.zeros_like(robot_traj[0]) + node_state[:x.shape[1]] = x[-1] + robot_traj_st_t = get_relative_robot_traj(env, state, node_state, robot_traj, node.type, robot_type) + + # Map + map_tuple = None + if hyperparams['use_map_encoding']: + if node.type in hyperparams['map_encoder']: + if node.non_aug_node is not None: + x = node.non_aug_node.get(np.array([t]), state[node.type]) + me_hyp = hyperparams['map_encoder'][node.type] + if 'heading_state_index' in me_hyp: + heading_state_index = me_hyp['heading_state_index'] + # We have to rotate the map in the opposit direction of the agent to match them + if type(heading_state_index) is list: # infer from velocity or heading vector + heading_angle = -np.arctan2(x[-1, heading_state_index[1]], + x[-1, heading_state_index[0]]) * 180 / np.pi + else: + heading_angle = -x[-1, heading_state_index] * 180 / np.pi + else: + heading_angle = None + + scene_map = scene.map[node.type] + map_point = x[-1, :2] + + + patch_size = hyperparams['map_encoder'][node.type]['patch_size'] + map_tuple = (scene_map, map_point, heading_angle, patch_size) + + return (first_history_index, x_t, y_t, x_st_t, y_st_t, neighbors_data_st, + neighbors_edge_value, robot_traj_st_t, map_tuple) + + +def get_timesteps_data(env, scene, t, node_type, state, pred_state, + edge_types, min_ht, max_ht, min_ft, max_ft, hyperparams): + """ + Puts together the inputs for ALL nodes in a given scene and timestep in it. + + :param env: Environment + :param scene: Scene + :param t: Timestep in scene + :param node_type: Node Type of nodes for which the data shall be pre-processed + :param state: Specification of the node state + :param pred_state: Specification of the prediction state + :param edge_types: List of all Edge Types for which neighbors are pre-processed + :param max_ht: Maximum history timesteps + :param max_ft: Maximum future timesteps (prediction horizon) + :param hyperparams: Model hyperparameters + :return: + """ + nodes_per_ts = scene.present_nodes(t, + type=node_type, + min_history_timesteps=min_ht, + min_future_timesteps=max_ft, + return_robot=not hyperparams['incl_robot_node']) + batch = list() + nodes = list() + out_timesteps = list() + for timestep in nodes_per_ts.keys(): + scene_graph = scene.get_scene_graph(timestep, + env.attention_radius, + hyperparams['edge_addition_filter'], + hyperparams['edge_removal_filter']) + present_nodes = nodes_per_ts[timestep] + for node in present_nodes: + nodes.append(node) + out_timesteps.append(timestep) + batch.append(get_node_timestep_data(env, scene, timestep, node, state, pred_state, + edge_types, max_ht, max_ft, hyperparams, + scene_graph=scene_graph)) + if len(out_timesteps) == 0: + return None + return collate(batch), nodes, out_timesteps diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/__init__.py new file mode 100644 index 000000000..5853fbb5f --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/__init__.py @@ -0,0 +1,4 @@ +from model.dynamics.dynamic import Dynamic +from model.dynamics.single_integrator import SingleIntegrator +from model.dynamics.unicycle import Unicycle +from model.dynamics.linear import Linear diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/dynamic.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/dynamic.py new 
file mode 100644 index 000000000..6b03e13fd --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/dynamic.py @@ -0,0 +1,30 @@ + + +class Dynamic(object): + def __init__(self, dt, dyn_limits, device, model_registrar, xz_size, node_type): + self.dt = dt + self.device = device + self.dyn_limits = dyn_limits + self.initial_conditions = None + self.model_registrar = model_registrar + self.node_type = node_type + self.init_constants() + self.create_graph(xz_size) + + def set_initial_condition(self, init_con): + self.initial_conditions = init_con + + def init_constants(self): + pass + + def create_graph(self, xz_size): + pass + + def integrate_samples(self, s, x): + raise NotImplementedError + + def integrate_distribution(self, dist, x): + raise NotImplementedError + + def create_graph(self, xz_size): + pass diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/linear.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/linear.py new file mode 100644 index 000000000..bce54c1d6 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/linear.py @@ -0,0 +1,12 @@ +from ..dynamics import Dynamic + + +class Linear(Dynamic): + def init_constants(self): + pass + + def integrate_samples(self, v, x): + return v + + def integrate_distribution(self, v_dist, x): + return v_dist \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/single_integrator.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/single_integrator.py new file mode 100644 index 000000000..984f5fa68 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/single_integrator.py @@ -0,0 +1,64 @@ +import torch +from model.dynamics import Dynamic +from utils import block_diag +from model.components import GMM2D + + +class SingleIntegrator(Dynamic): + def init_constants(self): + self.F = torch.eye(4, device=self.device, dtype=torch.float32) + self.F[0:2, 2:] = torch.eye(2, device=self.device, dtype=torch.float32) * self.dt + self.F_t = self.F.transpose(-2, -1) + + def integrate_samples(self, v, x=None): + """ + Integrates deterministic samples of velocity. + + :param v: Velocity samples + :param x: Not used for SI. + :return: Position samples + """ + p_0 = self.initial_conditions['pos'].unsqueeze(1) + return torch.cumsum(v, dim=2) * self.dt + p_0 + + def integrate_distribution(self, v_dist, x=None): + r""" + Integrates the GMM velocity distribution to a distribution over position. + The Kalman Equations are used. + + .. math:: \mu_{t+1} =\textbf{F} \mu_{t} + + .. math:: \mathbf{\Sigma}_{t+1}={\textbf {F}} \mathbf{\Sigma}_{t} {\textbf {F}}^{T} + + .. math:: + \textbf{F} = \left[ + \begin{array}{cccc} + \sigma_x^2 & \rho_p \sigma_x \sigma_y & 0 & 0 \\ + \rho_p \sigma_x \sigma_y & \sigma_y^2 & 0 & 0 \\ + 0 & 0 & \sigma_{v_x}^2 & \rho_v \sigma_{v_x} \sigma_{v_y} \\ + 0 & 0 & \rho_v \sigma_{v_x} \sigma_{v_y} & \sigma_{v_y}^2 \\ + \end{array} + \right]_{t} + + :param v_dist: Joint GMM Distribution over velocity in x and y direction. + :param x: Not used for SI. + :return: Joint GMM Distribution over position in x and y direction. 
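+ Example (sketch; assumes set_initial_condition() was called beforehand with the
+ current position under the 'pos' key, as the encoder does):
+ >>> pos_dist = dynamic.integrate_distribution(v_dist) # GMM2D over positions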
+ """ + p_0 = self.initial_conditions['pos'].unsqueeze(1) + ph = v_dist.mus.shape[-3] + sample_batch_dim = list(v_dist.mus.shape[0:2]) + pos_dist_sigma_matrix_list = [] + + pos_mus = p_0[:, None] + torch.cumsum(v_dist.mus, dim=2) * self.dt + + vel_dist_sigma_matrix = v_dist.get_covariance_matrix() + pos_dist_sigma_matrix_t = torch.zeros(sample_batch_dim + [v_dist.components, 2, 2], device=self.device) + + for t in range(ph): + vel_sigma_matrix_t = vel_dist_sigma_matrix[:, :, t] + full_sigma_matrix_t = block_diag([pos_dist_sigma_matrix_t, vel_sigma_matrix_t]) + pos_dist_sigma_matrix_t = self.F[..., :2, :].matmul(full_sigma_matrix_t.matmul(self.F_t)[..., :2]) + pos_dist_sigma_matrix_list.append(pos_dist_sigma_matrix_t) + + pos_dist_sigma_matrix = torch.stack(pos_dist_sigma_matrix_list, dim=2) + return GMM2D.from_log_pis_mus_cov_mats(v_dist.log_pis, pos_mus, pos_dist_sigma_matrix) \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/unicycle.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/unicycle.py new file mode 100644 index 000000000..1a83c4274 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/dynamics/unicycle.py @@ -0,0 +1,234 @@ +import torch +import torch.nn as nn +from model.dynamics import Dynamic +from utils import block_diag +from model.components import GMM2D + + +class Unicycle(Dynamic): + def init_constants(self): + self.F_s = torch.eye(4, device=self.device, dtype=torch.float32) + self.F_s[0:2, 2:] = torch.eye(2, device=self.device, dtype=torch.float32) * self.dt + self.F_s_t = self.F_s.transpose(-2, -1) + + def create_graph(self, xz_size): + model_if_absent = nn.Linear(xz_size + 1, 1) + self.p0_model = self.model_registrar.get_model(f"{self.node_type}/unicycle_initializer", model_if_absent) + + def dynamic(self, x, u): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + x_p = x[0] + y_p = x[1] + phi = x[2] + v = x[3] + dphi = u[0] + a = u[1] + + mask = torch.abs(dphi) <= 1e-2 + dphi = ~mask * dphi + (mask) * 1 + + phi_p_omega_dt = phi + dphi * self.dt + dsin_domega = (torch.sin(phi_p_omega_dt) - torch.sin(phi)) / dphi + dcos_domega = (torch.cos(phi_p_omega_dt) - torch.cos(phi)) / dphi + + d1 = torch.stack([(x_p + + (a / dphi) * dcos_domega + + v * dsin_domega + + (a / dphi) * torch.sin(phi_p_omega_dt) * self.dt), + (y_p + - v * dcos_domega + + (a / dphi) * dsin_domega + - (a / dphi) * torch.cos(phi_p_omega_dt) * self.dt), + phi + dphi * self.dt, + v + a * self.dt], dim=0) + d2 = torch.stack([x_p + v * torch.cos(phi) * self.dt + (a / 2) * torch.cos(phi) * self.dt ** 2, + y_p + v * torch.sin(phi) * self.dt + (a / 2) * torch.sin(phi) * self.dt ** 2, + phi * torch.ones_like(a), + v + a * self.dt], dim=0) + return torch.where(~mask, d1, d2) + + def integrate_samples(self, control_samples, x=None): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + ph = control_samples.shape[-2] + p_0 = self.initial_conditions['pos'].unsqueeze(1) + v_0 = self.initial_conditions['vel'].unsqueeze(1) + + # In case the input is batched because of the robot in online use we repeat this to match the batch size of x. 
+ if p_0.size()[0] != x.size()[0]: + p_0 = p_0.repeat(x.size()[0], 1, 1) + v_0 = v_0.repeat(x.size()[0], 1, 1) + + phi_0 = torch.atan2(v_0[..., 1], v_0[..., 0]) + + phi_0 = phi_0 + torch.tanh(self.p0_model(torch.cat((x, phi_0), dim=-1))) + + u = torch.stack([control_samples[..., 0], control_samples[..., 1]], dim=0) + x = torch.stack([p_0[..., 0], p_0[..., 1], phi_0, torch.norm(v_0, dim=-1)], dim = 0).squeeze(dim=-1) + + mus_list = [] + for t in range(ph): + x = self.dynamic(x, u[..., t]) + mus_list.append(torch.stack((x[0], x[1]), dim=-1)) + + pos_mus = torch.stack(mus_list, dim=2) + return pos_mus + + def compute_control_jacobian(self, sample_batch_dim, components, x, u): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + F = torch.zeros(sample_batch_dim + [components, 4, 2], + device=self.device, + dtype=torch.float32) + + phi = x[2] + v = x[3] + dphi = u[0] + a = u[1] + + mask = torch.abs(dphi) <= 1e-2 + dphi = ~mask * dphi + (mask) * 1 + + phi_p_omega_dt = phi + dphi * self.dt + dsin_domega = (torch.sin(phi_p_omega_dt) - torch.sin(phi)) / dphi + dcos_domega = (torch.cos(phi_p_omega_dt) - torch.cos(phi)) / dphi + + F[..., 0, 0] = ((v / dphi) * torch.cos(phi_p_omega_dt) * self.dt + - (v / dphi) * dsin_domega + - (2 * a / dphi ** 2) * torch.sin(phi_p_omega_dt) * self.dt + - (2 * a / dphi ** 2) * dcos_domega + + (a / dphi) * torch.cos(phi_p_omega_dt) * self.dt ** 2) + F[..., 0, 1] = (1 / dphi) * dcos_domega + (1 / dphi) * torch.sin(phi_p_omega_dt) * self.dt + + F[..., 1, 0] = ((v / dphi) * dcos_domega + - (2 * a / dphi ** 2) * dsin_domega + + (2 * a / dphi ** 2) * torch.cos(phi_p_omega_dt) * self.dt + + (v / dphi) * torch.sin(phi_p_omega_dt) * self.dt + + (a / dphi) * torch.sin(phi_p_omega_dt) * self.dt ** 2) + F[..., 1, 1] = (1 / dphi) * dsin_domega - (1 / dphi) * torch.cos(phi_p_omega_dt) * self.dt + + F[..., 2, 0] = self.dt + + F[..., 3, 1] = self.dt + + F_sm = torch.zeros(sample_batch_dim + [components, 4, 2], + device=self.device, + dtype=torch.float32) + + F_sm[..., 0, 1] = (torch.cos(phi) * self.dt ** 2) / 2 + + F_sm[..., 1, 1] = (torch.sin(phi) * self.dt ** 2) / 2 + + F_sm[..., 3, 1] = self.dt + + return torch.where(~mask.unsqueeze(-1).unsqueeze(-1), F, F_sm) + + def compute_jacobian(self, sample_batch_dim, components, x, u): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + one = torch.tensor(1) + F = torch.zeros(sample_batch_dim + [components, 4, 4], + device=self.device, + dtype=torch.float32) + + phi = x[2] + v = x[3] + dphi = u[0] + a = u[1] + + mask = torch.abs(dphi) <= 1e-2 + dphi = ~mask * dphi + (mask) * 1 + + phi_p_omega_dt = phi + dphi * self.dt + dsin_domega = (torch.sin(phi_p_omega_dt) - torch.sin(phi)) / dphi + dcos_domega = (torch.cos(phi_p_omega_dt) - torch.cos(phi)) / dphi + + F[..., 0, 0] = one + F[..., 1, 1] = one + F[..., 2, 2] = one + F[..., 3, 3] = one + + F[..., 0, 2] = v * dcos_domega - (a / dphi) * dsin_domega + (a / dphi) * torch.cos(phi_p_omega_dt) * self.dt + F[..., 0, 3] = dsin_domega + + F[..., 1, 2] = v * dsin_domega + (a / dphi) * dcos_domega + (a / dphi) * torch.sin(phi_p_omega_dt) * self.dt + F[..., 1, 3] = -dcos_domega + + F_sm = torch.zeros(sample_batch_dim + [components, 4, 4], + device=self.device, + dtype=torch.float32) + + F_sm[..., 0, 0] = one + F_sm[..., 1, 1] = one + F_sm[..., 2, 2] = one + F_sm[..., 3, 3] = one + + F_sm[..., 0, 2] = -v * torch.sin(phi) * self.dt - (a * torch.sin(phi) * self.dt ** 2) / 2 + F_sm[..., 0, 3] = torch.cos(phi) * self.dt + + F_sm[..., 1, 2] = v * 
torch.cos(phi) * self.dt + (a * torch.cos(phi) * self.dt ** 2) / 2 + F_sm[..., 1, 3] = torch.sin(phi) * self.dt + + return torch.where(~mask.unsqueeze(-1).unsqueeze(-1), F, F_sm) + + def integrate_distribution(self, control_dist_dphi_a, x): + r""" + TODO: Boris: Add docstring + :param x: + :param u: + :return: + """ + sample_batch_dim = list(control_dist_dphi_a.mus.shape[0:2]) + ph = control_dist_dphi_a.mus.shape[-3] + p_0 = self.initial_conditions['pos'].unsqueeze(1) + v_0 = self.initial_conditions['vel'].unsqueeze(1) + + # In case the input is batched because of the robot in online use we repeat this to match the batch size of x. + if p_0.size()[0] != x.size()[0]: + p_0 = p_0.repeat(x.size()[0], 1, 1) + v_0 = v_0.repeat(x.size()[0], 1, 1) + + phi_0 = torch.atan2(v_0[..., 1], v_0[..., 0]) + + phi_0 = phi_0 + torch.tanh(self.p0_model(torch.cat((x, phi_0), dim=-1))) + + dist_sigma_matrix = control_dist_dphi_a.get_covariance_matrix() + pos_dist_sigma_matrix_t = torch.zeros(sample_batch_dim + [control_dist_dphi_a.components, 4, 4], + device=self.device) + + u = torch.stack([control_dist_dphi_a.mus[..., 0], control_dist_dphi_a.mus[..., 1]], dim=0) + x = torch.stack([p_0[..., 0], p_0[..., 1], phi_0, torch.norm(v_0, dim=-1)], dim=0) + + pos_dist_sigma_matrix_list = [] + mus_list = [] + for t in range(ph): + F_t = self.compute_jacobian(sample_batch_dim, control_dist_dphi_a.components, x, u[:, :, :, t]) + G_t = self.compute_control_jacobian(sample_batch_dim, control_dist_dphi_a.components, x, u[:, :, :, t]) + dist_sigma_matrix_t = dist_sigma_matrix[:, :, t] + pos_dist_sigma_matrix_t = (F_t.matmul(pos_dist_sigma_matrix_t.matmul(F_t.transpose(-2, -1))) + + G_t.matmul(dist_sigma_matrix_t.matmul(G_t.transpose(-2, -1)))) + pos_dist_sigma_matrix_list.append(pos_dist_sigma_matrix_t[..., :2, :2]) + + x = self.dynamic(x, u[:, :, :, t]) + mus_list.append(torch.stack((x[0], x[1]), dim=-1)) + + pos_dist_sigma_matrix = torch.stack(pos_dist_sigma_matrix_list, dim=2) + pos_mus = torch.stack(mus_list, dim=2) + return GMM2D.from_log_pis_mus_cov_mats(control_dist_dphi_a.log_pis, pos_mus, pos_dist_sigma_matrix) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/mgcvae.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/mgcvae.py new file mode 100644 index 000000000..b731e89e8 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/mgcvae.py @@ -0,0 +1,1161 @@ +import warnings +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from model.components import * +from model.model_utils import * +import model.dynamics as dynamic_module +from environment.scene_graph import DirectedEdge + + +class MultimodalGenerativeCVAE(torch.nn.Module): + def __init__(self, + env, + node_type, + model_registrar, + hyperparams, + device, + edge_types, + log_writer=None): + super().__init__() + self.hyperparams = hyperparams + self.env = env + self.node_type = node_type + self.model_registrar = model_registrar + self.log_writer = log_writer + self.device = device + self.edge_types = [edge_type for edge_type in edge_types if edge_type[0] is node_type] + self.curr_iter = 0 + + self.node_modules = dict() + self.node_modules = torch.nn.ModuleDict() + + self.min_hl = self.hyperparams['minimum_history_length'] + self.max_hl = self.hyperparams['maximum_history_length'] + self.ph = self.hyperparams['prediction_horizon'] + self.state = self.hyperparams['state'] + self.pred_state = self.hyperparams['pred_state'][node_type] + self.state_length 
= int(np.sum([len(entity_dims) for entity_dims in self.state[node_type].values()])) + if self.hyperparams['incl_robot_node']: + self.robot_state_length = int( + np.sum([len(entity_dims) for entity_dims in self.state[env.robot_type].values()]) + ) + self.pred_state_length = int(np.sum([len(entity_dims) for entity_dims in self.pred_state.values()])) + + edge_types_str = [DirectedEdge.get_str_from_types(*edge_type) for edge_type in self.edge_types] + self.create_graphical_model(edge_types_str) + + dynamic_class = getattr(dynamic_module, hyperparams['dynamic'][self.node_type]['name']) + dyn_limits = hyperparams['dynamic'][self.node_type]['limits'] + self.dynamic = dynamic_class(self.env.scenes[0].dt, dyn_limits, device, + self.model_registrar, self.x_size, self.node_type) + + def eval(self): + super().eval() + for key in self.node_modules.keys(): + self.node_modules[key].eval() + + def set_curr_iter(self, curr_iter): + self.curr_iter = curr_iter + + def add_submodule(self, name, model_if_absent): + self.node_modules[name] = self.model_registrar.get_model(name, model_if_absent) + + def clear_submodules(self): + self.node_modules.clear() + + def create_node_models(self): + ############################ + # Node History Encoder # + ############################ + self.add_submodule(self.node_type + '/node_history_encoder', + model_if_absent=nn.LSTM(input_size=self.state_length, + hidden_size=self.hyperparams['enc_rnn_dim_history'], + batch_first=True)) + + ########################### + # Node Future Encoder # + ########################### + # We'll create this here, but then later check if in training mode. + # Based on that, we'll factor this into the computation graph (or not). + self.add_submodule(self.node_type + '/node_future_encoder', + model_if_absent=nn.LSTM(input_size=self.pred_state_length, + hidden_size=self.hyperparams['enc_rnn_dim_future'], + bidirectional=True, + batch_first=True)) + # These are related to how you initialize states for the node future encoder. + self.add_submodule(self.node_type + '/node_future_encoder/initial_h', + model_if_absent=nn.Linear(self.state_length, + self.hyperparams['enc_rnn_dim_future'])) + self.add_submodule(self.node_type + '/node_future_encoder/initial_c', + model_if_absent=nn.Linear(self.state_length, + self.hyperparams['enc_rnn_dim_future'])) + + ############################ + # Robot Future Encoder # + ############################ + # We'll create this here, but then later check if we're next to the robot. + # Based on that, we'll factor this into the computation graph (or not). + if self.hyperparams['incl_robot_node']: + self.add_submodule('robot_future_encoder', + model_if_absent=nn.LSTM(input_size=self.robot_state_length, + hidden_size=self.hyperparams['enc_rnn_dim_future'], + bidirectional=True, + batch_first=True)) + # These are related to how you initialize states for the robot future encoder. + self.add_submodule('robot_future_encoder/initial_h', + model_if_absent=nn.Linear(self.robot_state_length, + self.hyperparams['enc_rnn_dim_future'])) + self.add_submodule('robot_future_encoder/initial_c', + model_if_absent=nn.Linear(self.robot_state_length, + self.hyperparams['enc_rnn_dim_future'])) + + if self.hyperparams['edge_encoding']: + ############################## + # Edge Influence Encoder # + ############################## + # NOTE: The edge influence encoding happens during calls + # to forward or incremental_forward, so we don't create + # a model for it here for the max and sum variants. 
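+ # Only the 'bi-rnn' and 'attention' variants need trainable submodules here;
+ # 'sum', 'mean' and 'max' are computed directly in encode_total_edge_influence.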
+ if self.hyperparams['edge_influence_combine_method'] == 'bi-rnn': + self.add_submodule(self.node_type + '/edge_influence_encoder', + model_if_absent=nn.LSTM(input_size=self.hyperparams['enc_rnn_dim_edge'], + hidden_size=self.hyperparams['enc_rnn_dim_edge_influence'], + bidirectional=True, + batch_first=True)) + + # Four times because we're trying to mimic a bi-directional + # LSTM's output (which, here, is c and h from both ends). + self.eie_output_dims = 4 * self.hyperparams['enc_rnn_dim_edge_influence'] + + elif self.hyperparams['edge_influence_combine_method'] == 'attention': + # Chose additive attention because of https://arxiv.org/pdf/1703.03906.pdf + # We calculate an attention context vector using the encoded edges as the "encoder" + # (that we attend _over_) + # and the node history encoder representation as the "decoder state" (that we attend _on_). + self.add_submodule(self.node_type + '/edge_influence_encoder', + model_if_absent=AdditiveAttention( + encoder_hidden_state_dim=self.hyperparams['enc_rnn_dim_edge_influence'], + decoder_hidden_state_dim=self.hyperparams['enc_rnn_dim_history'])) + + self.eie_output_dims = self.hyperparams['enc_rnn_dim_edge_influence'] + + ################### + # Map Encoder # + ################### + if self.hyperparams['use_map_encoding']: + if self.node_type in self.hyperparams['map_encoder']: + me_params = self.hyperparams['map_encoder'][self.node_type] + self.add_submodule(self.node_type + '/map_encoder', + model_if_absent=CNNMapEncoder(me_params['map_channels'], + me_params['hidden_channels'], + me_params['output_size'], + me_params['masks'], + me_params['strides'], + me_params['patch_size'])) + + ################################ + # Discrete Latent Variable # + ################################ + self.latent = DiscreteLatent(self.hyperparams, self.device) + + ###################################################################### + # Various Fully-Connected Layers from Encoder to Latent Variable # + ###################################################################### + # Node History Encoder + x_size = self.hyperparams['enc_rnn_dim_history'] + if self.hyperparams['edge_encoding']: + # Edge Encoder + x_size += self.eie_output_dims + if self.hyperparams['incl_robot_node']: + # Future Conditional Encoder + x_size += 4 * self.hyperparams['enc_rnn_dim_future'] + if self.hyperparams['use_map_encoding'] and self.node_type in self.hyperparams['map_encoder']: + # Map Encoder + x_size += self.hyperparams['map_encoder'][self.node_type]['output_size'] + + z_size = self.hyperparams['N'] * self.hyperparams['K'] + + if self.hyperparams['p_z_x_MLP_dims'] is not None: + self.add_submodule(self.node_type + '/p_z_x', + model_if_absent=nn.Linear(x_size, self.hyperparams['p_z_x_MLP_dims'])) + hx_size = self.hyperparams['p_z_x_MLP_dims'] + else: + hx_size = x_size + + self.add_submodule(self.node_type + '/hx_to_z', + model_if_absent=nn.Linear(hx_size, self.latent.z_dim)) + + if self.hyperparams['q_z_xy_MLP_dims'] is not None: + self.add_submodule(self.node_type + '/q_z_xy', + # Node Future Encoder + model_if_absent=nn.Linear(x_size + 4 * self.hyperparams['enc_rnn_dim_future'], + self.hyperparams['q_z_xy_MLP_dims'])) + hxy_size = self.hyperparams['q_z_xy_MLP_dims'] + else: + # Node Future Encoder + hxy_size = x_size + 4 * self.hyperparams['enc_rnn_dim_future'] + + self.add_submodule(self.node_type + '/hxy_to_z', + model_if_absent=nn.Linear(hxy_size, self.latent.z_dim)) + + #################### + # Decoder LSTM # + #################### + if 
self.hyperparams['incl_robot_node']: + decoder_input_dims = self.pred_state_length + self.robot_state_length + z_size + x_size + else: + decoder_input_dims = self.pred_state_length + z_size + x_size + + self.add_submodule(self.node_type + '/decoder/state_action', + model_if_absent=nn.Sequential( + nn.Linear(self.state_length, self.pred_state_length))) + + self.add_submodule(self.node_type + '/decoder/rnn_cell', + model_if_absent=nn.GRUCell(decoder_input_dims, self.hyperparams['dec_rnn_dim'])) + self.add_submodule(self.node_type + '/decoder/initial_h', + model_if_absent=nn.Linear(z_size + x_size, self.hyperparams['dec_rnn_dim'])) + + ################### + # Decoder GMM # + ################### + self.add_submodule(self.node_type + '/decoder/proj_to_GMM_log_pis', + model_if_absent=nn.Linear(self.hyperparams['dec_rnn_dim'], + self.hyperparams['GMM_components'])) + self.add_submodule(self.node_type + '/decoder/proj_to_GMM_mus', + model_if_absent=nn.Linear(self.hyperparams['dec_rnn_dim'], + self.hyperparams['GMM_components'] * self.pred_state_length)) + self.add_submodule(self.node_type + '/decoder/proj_to_GMM_log_sigmas', + model_if_absent=nn.Linear(self.hyperparams['dec_rnn_dim'], + self.hyperparams['GMM_components'] * self.pred_state_length)) + self.add_submodule(self.node_type + '/decoder/proj_to_GMM_corrs', + model_if_absent=nn.Linear(self.hyperparams['dec_rnn_dim'], + self.hyperparams['GMM_components'])) + + self.x_size = x_size + self.z_size = z_size + + def create_edge_models(self, edge_types): + for edge_type in edge_types: + neighbor_state_length = int( + np.sum([len(entity_dims) for entity_dims in self.state[edge_type.split('->')[1]].values()])) + if self.hyperparams['edge_state_combine_method'] == 'pointnet': + self.add_submodule(edge_type + '/pointnet_encoder', + model_if_absent=nn.Sequential( + nn.Linear(self.state_length, 2 * self.state_length), + nn.ReLU(), + nn.Linear(2 * self.state_length, 2 * self.state_length), + nn.ReLU())) + + edge_encoder_input_size = 2 * self.state_length + self.state_length + + elif self.hyperparams['edge_state_combine_method'] == 'attention': + self.add_submodule(self.node_type + '/edge_attention_combine', + model_if_absent=TemporallyBatchedAdditiveAttention( + encoder_hidden_state_dim=self.state_length, + decoder_hidden_state_dim=self.state_length)) + edge_encoder_input_size = self.state_length + neighbor_state_length + + else: + edge_encoder_input_size = self.state_length + neighbor_state_length + + self.add_submodule(edge_type + '/edge_encoder', + model_if_absent=nn.LSTM(input_size=edge_encoder_input_size, + hidden_size=self.hyperparams['enc_rnn_dim_edge'], + batch_first=True)) + + def create_graphical_model(self, edge_types): + """ + Creates or queries all trainable components. + + :param edge_types: List containing strings for all possible edge types for the node type. 
+ :return: None + """ + self.clear_submodules() + + ############################ + # Everything but Edges # + ############################ + self.create_node_models() + + ##################### + # Edge Encoders # + ##################### + if self.hyperparams['edge_encoding']: + self.create_edge_models(edge_types) + + for name, module in self.node_modules.items(): + module.to(self.device) + + def create_new_scheduler(self, name, annealer, annealer_kws, creation_condition=True): + value_scheduler = None + rsetattr(self, name + '_scheduler', value_scheduler) + if creation_condition: + annealer_kws['device'] = self.device + value_annealer = annealer(annealer_kws) + rsetattr(self, name + '_annealer', value_annealer) + + # This is the value that we'll update on each call of + # step_annealers(). + rsetattr(self, name, value_annealer(0).clone().detach()) + dummy_optimizer = optim.Optimizer([rgetattr(self, name)], {'lr': value_annealer(0).clone().detach()}) + rsetattr(self, name + '_optimizer', dummy_optimizer) + + value_scheduler = CustomLR(dummy_optimizer, + value_annealer) + rsetattr(self, name + '_scheduler', value_scheduler) + + self.schedulers.append(value_scheduler) + self.annealed_vars.append(name) + + def set_annealing_params(self): + self.schedulers = list() + self.annealed_vars = list() + + self.create_new_scheduler(name='kl_weight', + annealer=sigmoid_anneal, + annealer_kws={ + 'start': self.hyperparams['kl_weight_start'], + 'finish': self.hyperparams['kl_weight'], + 'center_step': self.hyperparams['kl_crossover'], + 'steps_lo_to_hi': self.hyperparams['kl_crossover'] / self.hyperparams[ + 'kl_sigmoid_divisor'] + }) + + self.create_new_scheduler(name='latent.temp', + annealer=exp_anneal, + annealer_kws={ + 'start': self.hyperparams['tau_init'], + 'finish': self.hyperparams['tau_final'], + 'rate': self.hyperparams['tau_decay_rate'] + }) + + self.create_new_scheduler(name='latent.z_logit_clip', + annealer=sigmoid_anneal, + annealer_kws={ + 'start': self.hyperparams['z_logit_clip_start'], + 'finish': self.hyperparams['z_logit_clip_final'], + 'center_step': self.hyperparams['z_logit_clip_crossover'], + 'steps_lo_to_hi': self.hyperparams['z_logit_clip_crossover'] / self.hyperparams[ + 'z_logit_clip_divisor'] + }, + creation_condition=self.hyperparams['use_z_logit_clipping']) + + def step_annealers(self): + # This should manage all of the step-wise changed + # parameters automatically. + for idx, annealed_var in enumerate(self.annealed_vars): + if rgetattr(self, annealed_var + '_scheduler') is not None: + # First we step the scheduler. + with warnings.catch_warnings(): # We use a dummy optimizer: Warning because no .step() was called on it + warnings.simplefilter("ignore") + rgetattr(self, annealed_var + '_scheduler').step() + + # Then we set the annealed vars' value. 
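+ # (The dummy optimizer's learning rate acts as the container for the annealed
+ # value, so param_groups[0]['lr'] below holds the value after the scheduler step.)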
+ rsetattr(self, annealed_var, rgetattr(self, annealed_var + '_optimizer').param_groups[0]['lr']) + + self.summarize_annealers() + + def summarize_annealers(self): + if self.log_writer is not None: + for annealed_var in self.annealed_vars: + if rgetattr(self, annealed_var) is not None: + self.log_writer.add_scalar('%s/%s' % (str(self.node_type), annealed_var.replace('.', '/')), + rgetattr(self, annealed_var), self.curr_iter) + + def obtain_encoded_tensors(self, + mode, + inputs, + inputs_st, + packed_inputs_st, + labels, + labels_st, + first_history_indices, + neighbors, + neighbors_edge_value, + robot, + map) -> (torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor): + """ + Encodes input and output tensors for node and robot. + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param inputs: Input tensor including the state for each agent over time [bs, t, state]. + :param inputs_st: Standardized input tensor. + :param labels: Label tensor including the label output for each agent over time [bs, t, pred_state]. + :param labels_st: Standardized label tensor. + :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs] + :param neighbors: Preprocessed dict (indexed by edge type) of list of neighbor states over time. + [[bs, t, neighbor state]] + :param neighbors_edge_value: Preprocessed edge values for all neighbor nodes [[N]] + :param robot: Standardized robot state over time. [bs, t, robot_state] + :param map: Tensor of Map information. [bs, channels, x, y] + :return: tuple(x, x_nr_t, y_e, y_r, y, n_s_t0) + WHERE + - x: Encoded input / condition tensor to the CVAE x_e. + - x_r_t: Robot state (if robot is in scene). + - y_e: Encoded label / future of the node. + - y_r: Encoded future of the robot. + - y: Label / future of the node. + - n_s_t0: Standardized current state of the node. 
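+ Note: this copy returns the encoded node history directly (see the early
+ ``return`` after the history encoder below); the remaining encoding steps from
+ the original implementation are left in place but are not executed.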
+ """ + + x, x_r_t, y_e, y_r, y = None, None, None, None, None + initial_dynamics = dict() + + batch_size = inputs.shape[0] + + ######################################### + # Provide basic information to encoders # + ######################################### + node_history = inputs + node_present_state = inputs[:, -1] + node_pos = inputs[:, -1, 0:2] + node_vel = inputs[:, -1, 2:4] + + node_history_st = packed_inputs_st + node_present_state_st = inputs_st[:, -1] + node_pos_st = inputs_st[:, -1, 0:2] + node_vel_st = inputs_st[:, -1, 2:4] + + n_s_t0 = node_present_state_st + + initial_dynamics['pos'] = node_pos + initial_dynamics['vel'] = node_vel + + self.dynamic.set_initial_condition(initial_dynamics) + + if self.hyperparams['incl_robot_node']: + x_r_t, y_r = robot[..., 0, :], robot[..., 1:, :] + + ################## + # Encode History # + ################## + node_history_encoded = self.encode_node_history(mode, + node_history_st, + first_history_indices) + + return node_history_encoded + + ################## + # Encode Present # + ################## + node_present = node_present_state_st # [bs, state_dim] + + ################## + # Encode Future # + ################## + if mode != ModeKeys.PREDICT: + y = labels_st + + ############################## + # Encode Node Edges per Type # + ############################## + if self.hyperparams['edge_encoding']: + node_edges_encoded = list() + for edge_type in self.edge_types: + # Encode edges for given edge type + encoded_edges_type = self.encode_edge(mode, + node_history, + node_history_st, + edge_type, + neighbors[edge_type], + neighbors_edge_value[edge_type], + first_history_indices) + node_edges_encoded.append(encoded_edges_type) # List of [bs/nbs, enc_rnn_dim] + ##################### + # Encode Node Edges # + ##################### + total_edge_influence = self.encode_total_edge_influence(mode, + node_edges_encoded, + node_history_encoded, + batch_size) + + ################ + # Map Encoding # + ################ + if self.hyperparams['use_map_encoding'] and self.node_type in self.hyperparams['map_encoder']: + if self.log_writer and (self.curr_iter + 1) % 500 == 0: + map_clone = map.clone() + map_patch = self.hyperparams['map_encoder'][self.node_type]['patch_size'] + map_clone[:, :, map_patch[1] - 5:map_patch[1] + 5, map_patch[0] - 5:map_patch[0] + 5] = 1. + self.log_writer.add_images(f"{self.node_type}/cropped_maps", map_clone, + self.curr_iter, dataformats='NCWH') + + encoded_map = self.node_modules[self.node_type + '/map_encoder'](map * 2. - 1., (mode == ModeKeys.TRAIN)) + do = self.hyperparams['map_encoder'][self.node_type]['dropout'] + encoded_map = F.dropout(encoded_map, do, training=(mode == ModeKeys.TRAIN)) + + ###################################### + # Concatenate Encoder Outputs into x # + ###################################### + x_concat_list = list() + + # Every node has an edge-influence encoder (which could just be zero). + if self.hyperparams['edge_encoding']: + x_concat_list.append(total_edge_influence) # [bs/nbs, 4*enc_rnn_dim] + + # Every node has a history encoder. 
+ x_concat_list.append(node_history_encoded) # [bs/nbs, enc_rnn_dim_history] + + if self.hyperparams['incl_robot_node']: + robot_future_encoder = self.encode_robot_future(mode, x_r_t, y_r) + x_concat_list.append(robot_future_encoder) + + if self.hyperparams['use_map_encoding'] and self.node_type in self.hyperparams['map_encoder']: + if self.log_writer: + self.log_writer.add_scalar(f"{self.node_type}/encoded_map_max", + torch.max(torch.abs(encoded_map)), self.curr_iter) + x_concat_list.append(encoded_map) + + x = torch.cat(x_concat_list, dim=1) + + if mode == ModeKeys.TRAIN or mode == ModeKeys.EVAL: + y_e = self.encode_node_future(mode, node_present, y) + + return x, x_r_t, y_e, y_r, y, n_s_t0 + + def encode_node_history(self, mode, node_hist, first_history_indices): + """ + Encodes the nodes history. + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param node_hist: Historic and current state of the node. [bs, mhl, state] + :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs] + :return: Encoded node history tensor. [bs, enc_rnn_dim] + """ + outputs = run_lstm_on_variable_length_seqs(self.node_modules[self.node_type + '/node_history_encoder'], + # outputs, _ = run_lstm_on_variable_length_seqs(self.node_modules[self.node_type + '/node_history_encoder'], + original_seqs=node_hist, + lower_indices=first_history_indices) + + return outputs + + outputs = F.dropout(outputs, + p=1. - self.hyperparams['rnn_kwargs']['dropout_keep_prob'], + training=(mode == ModeKeys.TRAIN)) # [bs, max_time, enc_rnn_dim] + + last_index_per_sequence = -(first_history_indices + 1) + + return outputs[torch.arange(first_history_indices.shape[0]), last_index_per_sequence] + + def encode_edge(self, + mode, + node_history, + node_history_st, + edge_type, + neighbors, + neighbors_edge_value, + first_history_indices): + + max_hl = self.hyperparams['maximum_history_length'] + + edge_states_list = list() # list of [#of neighbors, max_ht, state_dim] + for i, neighbor_states in enumerate(neighbors): # Get neighbors for timestep in batch + if len(neighbor_states) == 0: # There are no neighbors for edge type # TODO necessary? + neighbor_state_length = int( + np.sum([len(entity_dims) for entity_dims in self.state[edge_type[1]].values()]) + ) + edge_states_list.append(torch.zeros((1, max_hl + 1, neighbor_state_length), device=self.device)) + else: + edge_states_list.append(torch.stack(neighbor_states, dim=0).to(self.device)) + + if self.hyperparams['edge_state_combine_method'] == 'sum': + # Used in Structural-RNN to combine edges as well. + op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.sum(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams['dynamic_edges'] == 'yes': + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_value in neighbors_edge_value: + op_applied_edge_mask_list.append(torch.clamp(torch.sum(edge_value.to(self.device), + dim=0, keepdim=True), max=1.)) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + elif self.hyperparams['edge_state_combine_method'] == 'max': + # Used in NLP, e.g. max over word embeddings in a sentence. 
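+ # Note: torch.max(tensor, dim=0) returns a (values, indices) pair, so the result
+ # below would need its .values field before torch.stack if this branch is used.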
+ op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.max(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams['dynamic_edges'] == 'yes': + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_value in neighbors_edge_value: + op_applied_edge_mask_list.append(torch.clamp(torch.max(edge_value.to(self.device), + dim=0, keepdim=True), max=1.)) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + elif self.hyperparams['edge_state_combine_method'] == 'mean': + # Used in NLP, e.g. mean over word embeddings in a sentence. + op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.mean(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams['dynamic_edges'] == 'yes': + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_value in neighbors_edge_value: + op_applied_edge_mask_list.append(torch.clamp(torch.mean(edge_value.to(self.device), + dim=0, keepdim=True), max=1.)) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + joint_history = torch.cat([combined_neighbors, node_history_st], dim=-1) + + outputs, _ = run_lstm_on_variable_length_seqs( + self.node_modules[DirectedEdge.get_str_from_types(*edge_type) + '/edge_encoder'], + original_seqs=joint_history, + lower_indices=first_history_indices + ) + + outputs = F.dropout(outputs, + p=1. - self.hyperparams['rnn_kwargs']['dropout_keep_prob'], + training=(mode == ModeKeys.TRAIN)) # [bs, max_time, enc_rnn_dim] + + last_index_per_sequence = -(first_history_indices + 1) + ret = outputs[torch.arange(last_index_per_sequence.shape[0]), last_index_per_sequence] + if self.hyperparams['dynamic_edges'] == 'yes': + return ret * combined_edge_masks + else: + return ret + + def encode_total_edge_influence(self, mode, encoded_edges, node_history_encoder, batch_size): + if self.hyperparams['edge_influence_combine_method'] == 'sum': + stacked_encoded_edges = torch.stack(encoded_edges, dim=0) + combined_edges = torch.sum(stacked_encoded_edges, dim=0) + + elif self.hyperparams['edge_influence_combine_method'] == 'mean': + stacked_encoded_edges = torch.stack(encoded_edges, dim=0) + combined_edges = torch.mean(stacked_encoded_edges, dim=0) + + elif self.hyperparams['edge_influence_combine_method'] == 'max': + stacked_encoded_edges = torch.stack(encoded_edges, dim=0) + combined_edges = torch.max(stacked_encoded_edges, dim=0) + + elif self.hyperparams['edge_influence_combine_method'] == 'bi-rnn': + if len(encoded_edges) == 0: + combined_edges = torch.zeros((batch_size, self.eie_output_dims), device=self.device) + + else: + # axis=1 because then we get size [batch_size, max_time, depth] + encoded_edges = torch.stack(encoded_edges, dim=1) + + _, state = self.node_modules[self.node_type + '/edge_influence_encoder'](encoded_edges) + combined_edges = unpack_RNN_state(state) + combined_edges = F.dropout(combined_edges, + p=1. 
- self.hyperparams['rnn_kwargs']['dropout_keep_prob'], + training=(mode == ModeKeys.TRAIN)) + + elif self.hyperparams['edge_influence_combine_method'] == 'attention': + # Used in Social Attention (https://arxiv.org/abs/1710.04689) + if len(encoded_edges) == 0: + combined_edges = torch.zeros((batch_size, self.eie_output_dims), device=self.device) + + else: + # axis=1 because then we get size [batch_size, max_time, depth] + encoded_edges = torch.stack(encoded_edges, dim=1) + combined_edges, _ = self.node_modules[self.node_type + '/edge_influence_encoder'](encoded_edges, + node_history_encoder) + combined_edges = F.dropout(combined_edges, + p=1. - self.hyperparams['rnn_kwargs']['dropout_keep_prob'], + training=(mode == ModeKeys.TRAIN)) + + return combined_edges + + def encode_node_future(self, mode, node_present, node_future) -> torch.Tensor: + """ + Encodes the node future (during training) using a bi-directional LSTM + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param node_present: Current state of the node. [bs, state] + :param node_future: Future states of the node. [bs, ph, state] + :return: Encoded future. + """ + initial_h_model = self.node_modules[self.node_type + '/node_future_encoder/initial_h'] + initial_c_model = self.node_modules[self.node_type + '/node_future_encoder/initial_c'] + + # Here we're initializing the forward hidden states, + # but zeroing the backward ones. + initial_h = initial_h_model(node_present) + initial_h = torch.stack([initial_h, torch.zeros_like(initial_h, device=self.device)], dim=0) + + initial_c = initial_c_model(node_present) + initial_c = torch.stack([initial_c, torch.zeros_like(initial_c, device=self.device)], dim=0) + + initial_state = (initial_h, initial_c) + + _, state = self.node_modules[self.node_type + '/node_future_encoder'](node_future, initial_state) + state = unpack_RNN_state(state) + state = F.dropout(state, + p=1. - self.hyperparams['rnn_kwargs']['dropout_keep_prob'], + training=(mode == ModeKeys.TRAIN)) + + return state + + def encode_robot_future(self, mode, robot_present, robot_future) -> torch.Tensor: + """ + Encodes the robot future (during training) using a bi-directional LSTM + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param robot_present: Current state of the robot. [bs, state] + :param robot_future: Future states of the robot. [bs, ph, state] + :return: Encoded future. + """ + initial_h_model = self.node_modules['robot_future_encoder/initial_h'] + initial_c_model = self.node_modules['robot_future_encoder/initial_c'] + + # Here we're initializing the forward hidden states, + # but zeroing the backward ones. + initial_h = initial_h_model(robot_present) + initial_h = torch.stack([initial_h, torch.zeros_like(initial_h, device=self.device)], dim=0) + + initial_c = initial_c_model(robot_present) + initial_c = torch.stack([initial_c, torch.zeros_like(initial_c, device=self.device)], dim=0) + + initial_state = (initial_h, initial_c) + + _, state = self.node_modules['robot_future_encoder'](robot_future, initial_state) + state = unpack_RNN_state(state) + state = F.dropout(state, + p=1. - self.hyperparams['rnn_kwargs']['dropout_keep_prob'], + training=(mode == ModeKeys.TRAIN)) + + return state + + def q_z_xy(self, mode, x, y_e) -> torch.Tensor: + r""" + .. math:: q_\phi(z \mid \mathbf{x}_i, \mathbf{y}_i) + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param x: Input / Condition tensor. + :param y_e: Encoded future tensor. 
+ :return: Latent distribution of the CVAE. + """ + xy = torch.cat([x, y_e], dim=1) + + if self.hyperparams['q_z_xy_MLP_dims'] is not None: + dense = self.node_modules[self.node_type + '/q_z_xy'] + h = F.dropout(F.relu(dense(xy)), + p=1. - self.hyperparams['MLP_dropout_keep_prob'], + training=(mode == ModeKeys.TRAIN)) + + else: + h = xy + + to_latent = self.node_modules[self.node_type + '/hxy_to_z'] + return self.latent.dist_from_h(to_latent(h), mode) + + def p_z_x(self, mode, x): + r""" + .. math:: p_\theta(z \mid \mathbf{x}_i) + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param x: Input / Condition tensor. + :return: Latent distribution of the CVAE. + """ + if self.hyperparams['p_z_x_MLP_dims'] is not None: + dense = self.node_modules[self.node_type + '/p_z_x'] + h = F.dropout(F.relu(dense(x)), + p=1. - self.hyperparams['MLP_dropout_keep_prob'], + training=(mode == ModeKeys.TRAIN)) + + else: + h = x + + to_latent = self.node_modules[self.node_type + '/hx_to_z'] + return self.latent.dist_from_h(to_latent(h), mode) + + def project_to_GMM_params(self, tensor) -> (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor): + """ + Projects tensor to parameters of a GMM with N components and D dimensions. + + :param tensor: Input tensor. + :return: tuple(log_pis, mus, log_sigmas, corrs) + WHERE + - log_pis: Weight (logarithm) of each GMM component. [N] + - mus: Mean of each GMM component. [N, D] + - log_sigmas: Standard Deviation (logarithm) of each GMM component. [N, D] + - corrs: Correlation between the GMM components. [N] + """ + log_pis = self.node_modules[self.node_type + '/decoder/proj_to_GMM_log_pis'](tensor) + mus = self.node_modules[self.node_type + '/decoder/proj_to_GMM_mus'](tensor) + log_sigmas = self.node_modules[self.node_type + '/decoder/proj_to_GMM_log_sigmas'](tensor) + corrs = torch.tanh(self.node_modules[self.node_type + '/decoder/proj_to_GMM_corrs'](tensor)) + return log_pis, mus, log_sigmas, corrs + + def p_y_xz(self, mode, x, x_nr_t, y_r, n_s_t0, z_stacked, prediction_horizon, + num_samples, num_components=1, gmm_mode=False): + r""" + .. math:: p_\psi(\mathbf{y}_i \mid \mathbf{x}_i, z) + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param x: Input / Condition tensor. + :param x_nr_t: Joint state of node and robot (if robot is in scene). + :param y: Future tensor. + :param y_r: Encoded future tensor. + :param n_s_t0: Standardized current state of the node. + :param z_stacked: Stacked latent state. [num_samples_z * num_samples_gmm, bs, latent_state] + :param prediction_horizon: Number of prediction timesteps. + :param num_samples: Number of samples from the latent space. + :param num_components: Number of GMM components. + :param gmm_mode: If True: The mode of the GMM is sampled. + :return: GMM2D. If mode is Predict, also samples from the GMM. 
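+ Note: z_stacked is flattened to (num_samples * num_components * bs, z_dim) and x
+ is tiled to match, so every latent sample is decoded against the same condition.
+ Example (sketch; assumes mode == ModeKeys.PREDICT):
+ >>> y_dist, sampled_future = model.p_y_xz(ModeKeys.PREDICT, x, x_nr_t, y_r, n_s_t0, z_stacked, ph, num_samples=1, gmm_mode=True)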
+ """ + ph = prediction_horizon + pred_dim = self.pred_state_length + + z = torch.reshape(z_stacked, (-1, self.latent.z_dim)) + zx = torch.cat([z, x.repeat(num_samples * num_components, 1)], dim=1) + + cell = self.node_modules[self.node_type + '/decoder/rnn_cell'] + initial_h_model = self.node_modules[self.node_type + '/decoder/initial_h'] + + initial_state = initial_h_model(zx) + + log_pis, mus, log_sigmas, corrs, a_sample = [], [], [], [], [] + + # Infer initial action state for node from current state + a_0 = self.node_modules[self.node_type + '/decoder/state_action'](n_s_t0) + + state = initial_state + if self.hyperparams['incl_robot_node']: + input_ = torch.cat([zx, + a_0.repeat(num_samples * num_components, 1), + x_nr_t.repeat(num_samples * num_components, 1)], dim=1) + else: + input_ = torch.cat([zx, a_0.repeat(num_samples * num_components, 1)], dim=1) + + for j in range(ph): + h_state = cell(input_, state) + log_pi_t, mu_t, log_sigma_t, corr_t = self.project_to_GMM_params(h_state) + + gmm = GMM2D(log_pi_t, mu_t, log_sigma_t, corr_t) # [k;bs, pred_dim] + + if mode == ModeKeys.PREDICT and gmm_mode: + a_t = gmm.mode() + else: + a_t = gmm.rsample() + + if num_components > 1: + if mode == ModeKeys.PREDICT: + log_pis.append(self.latent.p_dist.logits.repeat(num_samples, 1, 1)) + else: + log_pis.append(self.latent.q_dist.logits.repeat(num_samples, 1, 1)) + else: + log_pis.append( + torch.ones_like(corr_t.reshape(num_samples, num_components, -1).permute(0, 2, 1).reshape(-1, 1)) + ) + + mus.append( + mu_t.reshape( + num_samples, num_components, -1, 2 + ).permute(0, 2, 1, 3).reshape(-1, 2 * num_components) + ) + log_sigmas.append( + log_sigma_t.reshape( + num_samples, num_components, -1, 2 + ).permute(0, 2, 1, 3).reshape(-1, 2 * num_components)) + corrs.append( + corr_t.reshape( + num_samples, num_components, -1 + ).permute(0, 2, 1).reshape(-1, num_components)) + + if self.hyperparams['incl_robot_node']: + dec_inputs = [zx, a_t, y_r[:, j].repeat(num_samples * num_components, 1)] + else: + dec_inputs = [zx, a_t] + input_ = torch.cat(dec_inputs, dim=1) + state = h_state + + log_pis = torch.stack(log_pis, dim=1) + mus = torch.stack(mus, dim=1) + log_sigmas = torch.stack(log_sigmas, dim=1) + corrs = torch.stack(corrs, dim=1) + + a_dist = GMM2D(torch.reshape(log_pis, [num_samples, -1, ph, num_components]), + torch.reshape(mus, [num_samples, -1, ph, num_components * pred_dim]), + torch.reshape(log_sigmas, [num_samples, -1, ph, num_components * pred_dim]), + torch.reshape(corrs, [num_samples, -1, ph, num_components])) + + if self.hyperparams['dynamic'][self.node_type]['distribution']: + y_dist = self.dynamic.integrate_distribution(a_dist, x) + else: + y_dist = a_dist + + if mode == ModeKeys.PREDICT: + if gmm_mode: + a_sample = a_dist.mode() + else: + a_sample = a_dist.rsample() + sampled_future = self.dynamic.integrate_samples(a_sample, x) + return y_dist, sampled_future + else: + return y_dist + + def encoder(self, mode, x, y_e, num_samples=None): + """ + Encoder of the CVAE. + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param x: Input / Condition tensor. + :param y_e: Encoded future tensor. + :param num_samples: Number of samples from the latent space during Prediction. + :return: tuple(z, kl_obj) + WHERE + - z: Samples from the latent space. 
+ - kl_obj: KL Divergenze between q and p + """ + if mode == ModeKeys.TRAIN: + sample_ct = self.hyperparams['k'] + elif mode == ModeKeys.EVAL: + sample_ct = self.hyperparams['k_eval'] + elif mode == ModeKeys.PREDICT: + sample_ct = num_samples + if num_samples is None: + raise ValueError("num_samples cannot be None with mode == PREDICT.") + + self.latent.q_dist = self.q_z_xy(mode, x, y_e) + self.latent.p_dist = self.p_z_x(mode, x) + + z = self.latent.sample_q(sample_ct, mode) + + if mode == ModeKeys.TRAIN: + kl_obj = self.latent.kl_q_p(self.log_writer, '%s' % str(self.node_type), self.curr_iter) + if self.log_writer is not None: + self.log_writer.add_scalar('%s/%s' % (str(self.node_type), 'kl'), kl_obj, self.curr_iter) + else: + kl_obj = None + + return z, kl_obj + + def decoder(self, mode, x, x_nr_t, y, y_r, n_s_t0, z, labels, prediction_horizon, num_samples): + """ + Decoder of the CVAE. + + :param mode: Mode in which the model is operated. E.g. Train, Eval, Predict. + :param x: Input / Condition tensor. + :param x: Input / Condition tensor. + :param x_nr_t: Joint state of node and robot (if robot is in scene). + :param y: Future tensor. + :param y_r: Encoded future tensor. + :param n_s_t0: Standardized current state of the node. + :param z: Stacked latent state. + :param prediction_horizon: Number of prediction timesteps. + :param num_samples: Number of samples from the latent space. + :return: Log probability of y over p. + """ + + num_components = self.hyperparams['N'] * self.hyperparams['K'] + y_dist = self.p_y_xz(mode, x, x_nr_t, y_r, n_s_t0, z, + prediction_horizon, num_samples, num_components=num_components) + log_p_yt_xz = torch.clamp(y_dist.log_prob(labels), max=self.hyperparams['log_p_yt_xz_max']) + if self.hyperparams['log_histograms'] and self.log_writer is not None: + self.log_writer.add_histogram('%s/%s' % (str(self.node_type), 'log_p_yt_xz'), log_p_yt_xz, self.curr_iter) + + log_p_y_xz = torch.sum(log_p_yt_xz, dim=2) + return log_p_y_xz + + def train_loss(self, + inputs, + inputs_st, + first_history_indices, + labels, + labels_st, + neighbors, + neighbors_edge_value, + robot, + map, + prediction_horizon) -> torch.Tensor: + """ + Calculates the training loss for a batch. + + :param inputs: Input tensor including the state for each agent over time [bs, t, state]. + :param inputs_st: Standardized input tensor. + :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs] + :param labels: Label tensor including the label output for each agent over time [bs, t, pred_state]. + :param labels_st: Standardized label tensor. + :param neighbors: Preprocessed dict (indexed by edge type) of list of neighbor states over time. + [[bs, t, neighbor state]] + :param neighbors_edge_value: Preprocessed edge values for all neighbor nodes [[N]] + :param robot: Standardized robot state over time. [bs, t, robot_state] + :param map: Tensor of Map information. [bs, channels, x, y] + :param prediction_horizon: Number of prediction timesteps. 
+ :return: Scalar tensor -> nll loss + """ + mode = ModeKeys.TRAIN + + x, x_nr_t, y_e, y_r, y, n_s_t0 = self.obtain_encoded_tensors(mode=mode, + inputs=inputs, + inputs_st=inputs_st, + labels=labels, + labels_st=labels_st, + first_history_indices=first_history_indices, + neighbors=neighbors, + neighbors_edge_value=neighbors_edge_value, + robot=robot, + map=map) + + z, kl = self.encoder(mode, x, y_e) + log_p_y_xz = self.decoder(mode, x, x_nr_t, y, y_r, n_s_t0, z, + labels, # Loss is calculated on unstandardized label + prediction_horizon, + self.hyperparams['k']) + + log_p_y_xz_mean = torch.mean(log_p_y_xz, dim=0) # [nbs] + log_likelihood = torch.mean(log_p_y_xz_mean) + + mutual_inf_q = mutual_inf_mc(self.latent.q_dist) + mutual_inf_p = mutual_inf_mc(self.latent.p_dist) + + ELBO = log_likelihood - self.kl_weight * kl + 1. * mutual_inf_p + loss = -ELBO + + if self.hyperparams['log_histograms'] and self.log_writer is not None: + self.log_writer.add_histogram('%s/%s' % (str(self.node_type), 'log_p_y_xz'), + log_p_y_xz_mean, + self.curr_iter) + + if self.log_writer is not None: + self.log_writer.add_scalar('%s/%s' % (str(self.node_type), 'mutual_information_q'), + mutual_inf_q, + self.curr_iter) + self.log_writer.add_scalar('%s/%s' % (str(self.node_type), 'mutual_information_p'), + mutual_inf_p, + self.curr_iter) + self.log_writer.add_scalar('%s/%s' % (str(self.node_type), 'log_likelihood'), + log_likelihood, + self.curr_iter) + self.log_writer.add_scalar('%s/%s' % (str(self.node_type), 'loss'), + loss, + self.curr_iter) + if self.hyperparams['log_histograms']: + self.latent.summarize_for_tensorboard(self.log_writer, str(self.node_type), self.curr_iter) + return loss + + def eval_loss(self, + inputs, + inputs_st, + first_history_indices, + labels, + labels_st, + neighbors, + neighbors_edge_value, + robot, + map, + prediction_horizon) -> torch.Tensor: + """ + Calculates the evaluation loss for a batch. + + :param inputs: Input tensor including the state for each agent over time [bs, t, state]. + :param inputs_st: Standardized input tensor. + :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs] + :param labels: Label tensor including the label output for each agent over time [bs, t, pred_state]. + :param labels_st: Standardized label tensor. + :param neighbors: Preprocessed dict (indexed by edge type) of list of neighbor states over time. + [[bs, t, neighbor state]] + :param neighbors_edge_value: Preprocessed edge values for all neighbor nodes [[N]] + :param robot: Standardized robot state over time. [bs, t, robot_state] + :param map: Tensor of Map information. [bs, channels, x, y] + :param prediction_horizon: Number of prediction timesteps. 
+ :return: tuple(nll_q_is, nll_p, nll_exact, nll_sampled) + """ + + mode = ModeKeys.EVAL + + x, x_nr_t, y_e, y_r, y, n_s_t0 = self.obtain_encoded_tensors(mode=mode, + inputs=inputs, + inputs_st=inputs_st, + labels=labels, + labels_st=labels_st, + first_history_indices=first_history_indices, + neighbors=neighbors, + neighbors_edge_value=neighbors_edge_value, + robot=robot, + map=map) + + num_components = self.hyperparams['N'] * self.hyperparams['K'] + ### Importance sampled NLL estimate + z, _ = self.encoder(mode, x, y_e) # [k_eval, nbs, N*K] + z = self.latent.sample_p(1, mode, full_dist=True) + y_dist, _ = self.p_y_xz(ModeKeys.PREDICT, x, x_nr_t, y_r, n_s_t0, z, + prediction_horizon, num_samples=1, num_components=num_components) + # We use unstandardized labels to compute the loss + log_p_yt_xz = torch.clamp(y_dist.log_prob(labels), max=self.hyperparams['log_p_yt_xz_max']) + log_p_y_xz = torch.sum(log_p_yt_xz, dim=2) + log_p_y_xz_mean = torch.mean(log_p_y_xz, dim=0) # [nbs] + log_likelihood = torch.mean(log_p_y_xz_mean) + nll = -log_likelihood + + return nll + + def predict(self, + inputs, + inputs_st, + packed_inputs_st, + first_history_indices, + neighbors, + neighbors_edge_value, + robot, + map, + prediction_horizon, + num_samples, + z_mode=False, + gmm_mode=False, + full_dist=True, + all_z_sep=False): + """ + Predicts the future of a batch of nodes. + + :param inputs: Input tensor including the state for each agent over time [bs, t, state]. + :param inputs_st: Standardized input tensor. + :param first_history_indices: First timestep (index) in scene for which data is available for a node [bs] + :param neighbors: Preprocessed dict (indexed by edge type) of list of neighbor states over time. + [[bs, t, neighbor state]] + :param neighbors_edge_value: Preprocessed edge values for all neighbor nodes [[N]] + :param robot: Standardized robot state over time. [bs, t, robot_state] + :param map: Tensor of Map information. [bs, channels, x, y] + :param prediction_horizon: Number of prediction timesteps. + :param num_samples: Number of samples from the latent space. + :param z_mode: If True: Select the most likely latent state. + :param gmm_mode: If True: The mode of the GMM is sampled. + :param all_z_sep: Samples each latent mode individually without merging them into a GMM. + :param full_dist: Samples all latent states and merges them into a GMM as output. 
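+        :param packed_inputs_st: Standardized input sequences pre-packed by the caller (e.g. a torch.nn.utils.rnn.PackedSequence); forwarded unchanged to obtain_encoded_tensors.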
+ :return: + """ + mode = ModeKeys.PREDICT + + # x, x_nr_t, _, y_r, _, n_s_t0 = self.obtain_encoded_tensors(mode=mode, + out = self.obtain_encoded_tensors(mode=mode, + inputs=inputs, + inputs_st=inputs_st, + packed_inputs_st=packed_inputs_st, + labels=None, + labels_st=None, + first_history_indices=first_history_indices, + neighbors=neighbors, + neighbors_edge_value=neighbors_edge_value, + robot=robot, + map=map) + # return x, n_s_t0 + return out + + self.latent.p_dist = self.p_z_x(mode, x) + z, num_samples, num_components = self.latent.sample_p(num_samples, + mode, + most_likely_z=z_mode, + full_dist=full_dist, + all_z_sep=all_z_sep) + + _, our_sampled_future = self.p_y_xz(mode, x, x_nr_t, y_r, n_s_t0, z, + prediction_horizon, + num_samples, + num_components, + gmm_mode) + + return our_sampled_future diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_registrar.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_registrar.py new file mode 100644 index 000000000..111a8ab3e --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_registrar.py @@ -0,0 +1,75 @@ +import os +import torch +import torch.nn as nn + + +def get_model_device(model): + return next(model.parameters()).device + + +class ModelRegistrar(nn.Module): + def __init__(self, model_dir, device): + super(ModelRegistrar, self).__init__() + self.model_dict = nn.ModuleDict() + self.model_dir = model_dir + self.device = device + + def forward(self): + raise NotImplementedError('Although ModelRegistrar is a nn.Module, it is only to store parameters.') + + def get_model(self, name, model_if_absent=None): + # 4 cases: name in self.model_dict and model_if_absent is None (OK) + # name in self.model_dict and model_if_absent is not None (OK) + # name not in self.model_dict and model_if_absent is not None (OK) + # name not in self.model_dict and model_if_absent is None (NOT OK) + + if name in self.model_dict: + return self.model_dict[name] + + elif model_if_absent is not None: + self.model_dict[name] = model_if_absent.to(self.device) + return self.model_dict[name] + + else: + raise ValueError(f'{name} was never initialized in this Registrar!') + + def get_name_match(self, name): + ret_model_list = nn.ModuleList() + for key in self.model_dict.keys(): + if name in key: + ret_model_list.append(self.model_dict[key]) + return ret_model_list + + def get_all_but_name_match(self, name): + ret_model_list = nn.ModuleList() + for key in self.model_dict.keys(): + if name not in key: + ret_model_list.append(self.model_dict[key]) + return ret_model_list + + def print_model_names(self): + print(self.model_dict.keys()) + + def save_models(self, curr_iter): + # Create the model directiory if it's not present. 
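+        # Ensure the save directory exists before torch.save writes into it
+        # (os.makedirs with exist_ok=True is a no-op when it is already there).
+        os.makedirs(self.model_dir, exist_ok=True)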
+ save_path = os.path.join(self.model_dir, + 'model_registrar-%d.pt' % curr_iter) + + torch.save(self.model_dict, save_path) + + def load_models(self, iter_num): + self.model_dict.clear() + + save_path = os.path.join(self.model_dir, + 'model_registrar-%d.pt' % iter_num) + + print('') + print('Loading from ' + save_path) + self.model_dict = torch.load(save_path, map_location=self.device) + print('Loaded!') + print('') + + def to(self, device): + for name, model in self.model_dict.items(): + if get_model_device(model) != device: + model.to(device) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_utils.py new file mode 100644 index 000000000..688d99261 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/model_utils.py @@ -0,0 +1,130 @@ +import torch +import torch.nn.utils.rnn as rnn +from enum import Enum +import functools +import numpy as np +import math + + +class ModeKeys(Enum): + TRAIN = 1 + EVAL = 2 + PREDICT = 3 + + +def cyclical_lr(stepsize, min_lr=3e-4, max_lr=3e-3, decay=1.): + # Lambda function to calculate the LR + lr_lambda = lambda it: min_lr + (max_lr - min_lr) * relative(it, stepsize) * decay**it + + # Additional function to see where on the cycle we are + def relative(it, stepsize): + cycle = math.floor(1 + it / (2 * stepsize)) + x = abs(it / stepsize - 2 * cycle + 1) + return max(0, (1 - x)) + + return lr_lambda + + +def to_one_hot(labels, n_labels): + return torch.eye(n_labels, device=labels.device)[labels] + + +def exp_anneal(anneal_kws): + device = anneal_kws['device'] + start = torch.tensor(anneal_kws['start'], device=device) + finish = torch.tensor(anneal_kws['finish'], device=device) + rate = torch.tensor(anneal_kws['rate'], device=device) + return lambda step: finish - (finish - start)*torch.pow(rate, torch.tensor(step, dtype=torch.float, device=device)) + + +def sigmoid_anneal(anneal_kws): + device = anneal_kws['device'] + start = torch.tensor(anneal_kws['start'], device=device) + finish = torch.tensor(anneal_kws['finish'], device=device) + center_step = torch.tensor(anneal_kws['center_step'], device=device, dtype=torch.float) + steps_lo_to_hi = torch.tensor(anneal_kws['steps_lo_to_hi'], device=device, dtype=torch.float) + return lambda step: start + (finish - start)*torch.sigmoid((torch.tensor(float(step), device=device) - center_step) * (1./steps_lo_to_hi)) + + +class CustomLR(torch.optim.lr_scheduler.LambdaLR): + def __init__(self, optimizer, lr_lambda, last_epoch=-1): + super(CustomLR, self).__init__(optimizer, lr_lambda, last_epoch) + + def get_lr(self): + return [lmbda(self.last_epoch) + for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)] + + +def mutual_inf_mc(x_dist): + dist = x_dist.__class__ + H_y = dist(probs=x_dist.probs.mean(dim=0)).entropy() + return (H_y - x_dist.entropy().mean(dim=0)).sum() + + +def run_lstm_on_variable_length_seqs(lstm_module, original_seqs, lower_indices=None, upper_indices=None, total_length=None): + # breakpoint() + # bs, tf = original_seqs.shape[:2] + # if lower_indices is None: + # lower_indices = torch.zeros(bs, dtype=torch.int) + # if upper_indices is None: + # upper_indices = torch.ones(bs, dtype=torch.int) * (tf - 1) + # if total_length is None: + # total_length = max(upper_indices) + 1 + # # This is done so that we can just pass in self.prediction_timesteps + # # (which we want to INCLUDE, so this will exclude the next timestep). 
+ # inclusive_break_indices = upper_indices + 1 + + # pad_list = list() + # for i, seq_len in enumerate(inclusive_break_indices): + # pad_list.append(original_seqs[i, lower_indices[i]:seq_len]) + + # packed_seqs = rnn.pack_sequence(pad_list, enforce_sorted=False) + # return packed_seqs # TypeError: int() argument must be a string, a bytes-like object or a real number, not 'Any' + + packed_seqs = original_seqs + packed_output, (h_n, c_n) = lstm_module(packed_seqs) + return packed_output # TypeError: object of type 'Call' has no len() + output, _ = rnn.pad_packed_sequence(packed_output, + batch_first=True, + total_length=total_length) + + return output, (h_n, c_n) + + +def extract_subtensor_per_batch_element(tensor, indices): + batch_idxs = torch.arange(start=0, end=len(indices)) + + batch_idxs = batch_idxs[~torch.isnan(indices)] + indices = indices[~torch.isnan(indices)] + if indices.size == 0: + return None + else: + indices = indices.long() + if tensor.is_cuda: + batch_idxs = batch_idxs.to(tensor.get_device()) + indices = indices.to(tensor.get_device()) + return tensor[batch_idxs, indices] + + +def unpack_RNN_state(state_tuple): + # PyTorch returned LSTM states have 3 dims: + # (num_layers * num_directions, batch, hidden_size) + + state = torch.cat(state_tuple, dim=0).permute(1, 0, 2) + # Now state is (batch, 2 * num_layers * num_directions, hidden_size) + + state_size = state.size() + return torch.reshape(state, (-1, state_size[1] * state_size[2])) + + +def rsetattr(obj, attr, val): + pre, _, post = attr.rpartition('.') + return setattr(rgetattr(obj, pre) if pre else obj, post, val) + + +# using wonder's beautiful simplification: +# https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427 +def rgetattr(obj, attr, *args): + def _getattr(obj, attr): + return getattr(obj, attr, *args) + return functools.reduce(_getattr, [obj] + attr.split('.')) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/__init__.py new file mode 100644 index 000000000..a1c907062 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/__init__.py @@ -0,0 +1,2 @@ +from .online_trajectron import OnlineTrajectron +from .online_mgcvae import OnlineMultimodalGenerativeCVAE diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_mgcvae.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_mgcvae.py new file mode 100644 index 000000000..c614c37a4 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_mgcvae.py @@ -0,0 +1,430 @@ +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from collections import defaultdict, Counter +from model.components import * +from model.model_utils import * +from model.dataset import get_relative_robot_traj +import model.dynamics as dynamic_module +from model.mgcvae import MultimodalGenerativeCVAE +from environment.scene_graph import DirectedEdge +from environment.node_type import NodeType + + +class OnlineMultimodalGenerativeCVAE(MultimodalGenerativeCVAE): + def __init__(self, + env, + node, + model_registrar, + hyperparams, + device): + self.hyperparams = hyperparams + self.node = node + self.node_type = self.node.type + + if len(env.scenes) != 1: + raise ValueError("Passed in Environment has number of 
scenes != 1") + self.robot = env.scenes[0].robot + self.model_registrar = model_registrar + self.device = device + + self.node_modules = dict() + self.env = env + self.scene_graph = None + + self.state = self.hyperparams['state'] + self.pred_state = self.hyperparams['pred_state'][self.node.type] + self.state_length = int(np.sum([len(entity_dims) for entity_dims in self.state[self.node.type].values()])) + if self.hyperparams['incl_robot_node']: + self.robot_state_length = int( + np.sum([len(entity_dims) for entity_dims in self.state[self.robot.type].values()])) + self.pred_state_length = int(np.sum([len(entity_dims) for entity_dims in self.pred_state.values()])) + + self.curr_hidden_states = dict() + self.edge_types = Counter() + + self.create_graphical_model() + + dynamic_class = getattr(dynamic_module, self.hyperparams['dynamic'][self.node_type]['name']) + dyn_limits = hyperparams['dynamic'][self.node_type]['limits'] + self.dynamic = dynamic_class(self.env.scenes[0].dt, dyn_limits, device, + self.model_registrar, self.x_size, self.node_type) + + def create_graphical_model(self): + """ + Creates or queries all trainable components. + + :return: None + """ + self.clear_submodules() + + ############################ + # Everything but Edges # + ############################ + self.create_node_models() + + for name, module in self.node_modules.items(): + module.to(self.device) + + def update_graph(self, new_scene_graph, new_neighbors, removed_neighbors): + self.scene_graph = new_scene_graph + + if self.node in new_neighbors: + for edge_type, new_neighbor_nodes in new_neighbors[self.node].items(): + self.add_edge_model(edge_type) + self.edge_types += Counter({edge_type: len(new_neighbor_nodes)}) + + if self.node in removed_neighbors: + for edge_type, removed_neighbor_nodes in removed_neighbors[self.node].items(): + self.remove_edge_model(edge_type) + self.edge_types -= Counter({edge_type: len(removed_neighbor_nodes)}) + + def get_edge_to(self, other_node): + return DirectedEdge(self.node, other_node) + + def add_edge_model(self, edge_type): + if self.hyperparams['edge_encoding']: + if edge_type + '/edge_encoder' not in self.node_modules: + neighbor_state_length = int( + np.sum([len(entity_dims) for entity_dims in + self.state[self._get_other_node_type_from_edge(edge_type)].values()])) + if self.hyperparams['edge_state_combine_method'] == 'pointnet': + self.add_submodule(edge_type + '/pointnet_encoder', + model_if_absent=nn.Sequential( + nn.Linear(self.state_length, 2 * self.state_length), + nn.ReLU(), + nn.Linear(2 * self.state_length, 2 * self.state_length), + nn.ReLU())) + + edge_encoder_input_size = 2 * self.state_length + self.state_length + + elif self.hyperparams['edge_state_combine_method'] == 'attention': + self.add_submodule(self.node.type + '/edge_attention_combine', + model_if_absent=TemporallyBatchedAdditiveAttention( + encoder_hidden_state_dim=self.state_length, + decoder_hidden_state_dim=self.state_length)) + edge_encoder_input_size = self.state_length + neighbor_state_length + + else: + edge_encoder_input_size = self.state_length + neighbor_state_length + + self.add_submodule(edge_type + '/edge_encoder', + model_if_absent=nn.LSTM(input_size=edge_encoder_input_size, + hidden_size=self.hyperparams['enc_rnn_dim_edge'], + batch_first=True)) + + def _get_other_node_type_from_edge(self, edge_type_str): + n2_type_str = edge_type_str.split('->')[1] + return NodeType(n2_type_str, self.env.node_type_list.index(n2_type_str) + 1) + + def _get_edge_type_from_str(self, edge_type_str): + 
n1_type_str, n2_type_str = edge_type_str.split('->') + return (NodeType(n1_type_str, self.env.node_type_list.index(n1_type_str) + 1), + NodeType(n2_type_str, self.env.node_type_list.index(n2_type_str) + 1)) + + def remove_edge_model(self, edge_type): + if self.hyperparams['edge_encoding']: + if len(self.scene_graph.get_neighbors(self.node, self._get_other_node_type_from_edge(edge_type))) == 0: + del self.node_modules[edge_type + '/edge_encoder'] + + def obtain_encoded_tensors(self, + mode, + inputs, + inputs_st, + inputs_np, + robot_present_and_future, + maps): + x, x_r_t, y_r = None, None, None + batch_size = 1 + + our_inputs = inputs[self.node] + our_inputs_st = inputs_st[self.node] + + initial_dynamics = dict() + initial_dynamics['pos'] = our_inputs[:, 0:2] # TODO: Generalize + initial_dynamics['vel'] = our_inputs[:, 2:4] # TODO: Generalize + self.dynamic.set_initial_condition(initial_dynamics) + + ######################################### + # Provide basic information to encoders # + ######################################### + if self.hyperparams['incl_robot_node'] and self.robot is not None: + robot_present_and_future_st = get_relative_robot_traj(self.env, self.state, + our_inputs, robot_present_and_future, + self.node.type, self.robot.type) + x_r_t = robot_present_and_future_st[..., 0, :] + y_r = robot_present_and_future_st[..., 1:, :] + + ################## + # Encode History # + ################## + node_history_encoded = self.encode_node_history(our_inputs_st) + + ############################## + # Encode Node Edges per Type # + ############################## + total_edge_influence = None + if self.hyperparams['edge_encoding']: + node_edges_encoded = list() + for edge_type in self.edge_types: + connected_nodes_batched = list() + edge_masks_batched = list() + + # We get all nodes which are connected to the current node for the current timestep + connected_nodes_batched.append(self.scene_graph.get_neighbors(self.node, + self._get_other_node_type_from_edge( + edge_type))) + + if self.hyperparams['dynamic_edges'] == 'yes': + # We get the edge masks for the current node at the current timestep + edge_masks_for_node = self.scene_graph.get_edge_scaling(self.node) + edge_masks_batched.append(torch.tensor(edge_masks_for_node, dtype=torch.float, device=self.device)) + + # Encode edges for given edge type + encoded_edges_type = self.encode_edge(inputs, + inputs_st, + inputs_np, + edge_type, + connected_nodes_batched, + edge_masks_batched) + node_edges_encoded.append(encoded_edges_type) # List of [bs/nbs, enc_rnn_dim] + + ##################### + # Encode Node Edges # + ##################### + total_edge_influence = self.encode_total_edge_influence(mode, + node_edges_encoded, + node_history_encoded, + batch_size) + + self.TD = {'node_history_encoded': node_history_encoded, + 'total_edge_influence': total_edge_influence} + + ################ + # Map Encoding # + ################ + if self.hyperparams['use_map_encoding'] and self.node_type in self.hyperparams['map_encoder']: + if self.node not in maps: + # This means the node was removed (it is only being kept around because of the edge removal filter). + me_params = self.hyperparams['map_encoder'][self.node_type] + self.TD['encoded_map'] = torch.zeros((1, me_params['output_size'])) + else: + encoded_map = self.node_modules[self.node_type + '/map_encoder'](maps[self.node] * 2. 
- 1., + (mode == ModeKeys.TRAIN)) + do = self.hyperparams['map_encoder'][self.node_type]['dropout'] + encoded_map = F.dropout(encoded_map, do, training=(mode == ModeKeys.TRAIN)) + self.TD['encoded_map'] = encoded_map + + ###################################### + # Concatenate Encoder Outputs into x # + ###################################### + return self.create_encoder_rep(mode, self.TD, x_r_t, y_r) + + def create_encoder_rep(self, mode, + TD, + robot_present_st, + robot_future_st): + # Unpacking TD + node_history_encoded = TD['node_history_encoded'] + if self.hyperparams['edge_encoding']: + total_edge_influence = TD['total_edge_influence'] + if self.hyperparams['use_map_encoding'] and self.node_type in self.hyperparams['map_encoder']: + encoded_map = TD['encoded_map'] + + if (self.hyperparams['incl_robot_node'] + and self.robot is not None + and robot_future_st is not None + and robot_present_st is not None): + robot_future_encoder = self.encode_robot_future(mode, robot_present_st, robot_future_st) + + # Tiling for multiple samples + # This tiling is done because: + # a) we must consider the prediction case where there are many candidate robot future actions, + # b) the edge and history encoders are all the same regardless of which candidate future robot action + # we're evaluating. + node_history_encoded = TD['node_history_encoded'].repeat(robot_future_st.size()[0], 1) + if self.hyperparams['edge_encoding']: + total_edge_influence = TD['total_edge_influence'].repeat(robot_future_st.size()[0], 1) + if self.hyperparams['use_map_encoding'] and self.node_type in self.hyperparams['map_encoder']: + encoded_map = TD['encoded_map'].repeat(robot_future_st.size()[0], 1) + + elif self.hyperparams['incl_robot_node'] and self.robot is not None: + # Four times because we're trying to mimic a bi-directional RNN's output (which is c and h from both ends). + robot_future_encoder = torch.zeros([1, 4 * self.hyperparams['enc_rnn_dim_future']], device=self.device) + + x_concat_list = list() + + # Every node has an edge-influence encoder (which could just be zero). + if self.hyperparams['edge_encoding']: + x_concat_list.append(total_edge_influence) # [bs/nbs, 4*enc_rnn_dim] + + # Every node has a history encoder. 
+ x_concat_list.append(node_history_encoded) # [bs/nbs, enc_rnn_dim_history] + + if self.hyperparams['incl_robot_node'] and self.robot is not None: + x_concat_list.append(robot_future_encoder) # [bs/nbs, 4*enc_rnn_dim_history] + + if self.hyperparams['use_map_encoding'] and self.node_type in self.hyperparams['map_encoder']: + x_concat_list.append(encoded_map) # [bs/nbs, CNN output size] + + return torch.cat(x_concat_list, dim=1) + + def encode_node_history(self, inputs_st): + new_state = torch.unsqueeze(inputs_st, dim=1) # [bs, 1, state_dim] + if self.node.type + '/node_history_encoder' not in self.curr_hidden_states: + outputs, self.curr_hidden_states[self.node.type + '/node_history_encoder'] = self.node_modules[ + self.node.type + '/node_history_encoder'](new_state) + else: + outputs, self.curr_hidden_states[self.node.type + '/node_history_encoder'] = self.node_modules[ + self.node.type + '/node_history_encoder'](new_state, self.curr_hidden_states[ + self.node.type + '/node_history_encoder']) + + return outputs[:, 0, :] + + def encode_edge(self, inputs, inputs_st, inputs_np, edge_type, connected_nodes, edge_masks): + edge_type_tuple = self._get_edge_type_from_str(edge_type) + edge_states_list = list() # list of [#of neighbors, max_ht, state_dim] + neighbor_states = list() + + orig_rel_state = inputs[self.node].cpu().numpy() + for node in connected_nodes[0]: + neighbor_state_np = inputs_np[node] + + # Make State relative to node + _, std = self.env.get_standardize_params(self.state[node.type], node_type=node.type) + std[0:2] = self.env.attention_radius[edge_type_tuple] + + # TODO: This all makes the unsafe assumption that the first n dims + # refer to the same quantities even for different agent types! + equal_dims = np.min((neighbor_state_np.shape[-1], orig_rel_state.shape[-1])) + rel_state = np.zeros_like(neighbor_state_np) + rel_state[..., :equal_dims] = orig_rel_state[..., :equal_dims] + neighbor_state_np_st = self.env.standardize(neighbor_state_np, + self.state[node.type], + node_type=node.type, + mean=rel_state, + std=std) + + neighbor_state = torch.tensor(neighbor_state_np_st).float().to(self.device) + neighbor_states.append(neighbor_state) + + if len(neighbor_states) == 0: # There are no neighbors for edge type # TODO necessary? + neighbor_state_length = int(np.sum([len(entity_dims) for entity_dims in self.state[edge_type[1]].values()])) + edge_states_list.append(torch.zeros((1, 1, neighbor_state_length), device=self.device)) + else: + edge_states_list.append(torch.stack(neighbor_states, dim=0)) + + if self.hyperparams['edge_state_combine_method'] == 'sum': + # Used in Structural-RNN to combine edges as well. + op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.sum(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams['dynamic_edges'] == 'yes': + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_mask in edge_masks: + op_applied_edge_mask_list.append(torch.clamp(torch.sum(edge_mask, dim=0, keepdim=True), max=1.)) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + elif self.hyperparams['edge_state_combine_method'] == 'max': + # Used in NLP, e.g. max over word embeddings in a sentence. 
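+                # Element-wise max over the neighbor dimension; note that torch.max(..., dim=0)
+                # returns a (values, indices) namedtuple rather than a bare tensor.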
+ op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.max(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams['dynamic_edges'] == 'yes': + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_mask in edge_masks: + op_applied_edge_mask_list.append(torch.clamp(torch.max(edge_mask, dim=0, keepdim=True), max=1.)) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + elif self.hyperparams['edge_state_combine_method'] == 'mean': + # Used in NLP, e.g. mean over word embeddings in a sentence. + op_applied_edge_states_list = list() + for neighbors_state in edge_states_list: + op_applied_edge_states_list.append(torch.mean(neighbors_state, dim=0)) + combined_neighbors = torch.stack(op_applied_edge_states_list, dim=0) + if self.hyperparams['dynamic_edges'] == 'yes': + # Should now be (bs, time, 1) + op_applied_edge_mask_list = list() + for edge_mask in edge_masks: + op_applied_edge_mask_list.append(torch.clamp(torch.mean(edge_mask, dim=0, keepdim=True), max=1.)) + combined_edge_masks = torch.stack(op_applied_edge_mask_list, dim=0) + + joint_history = torch.cat([combined_neighbors, torch.unsqueeze(inputs_st[self.node], dim=0)], dim=-1) + + if edge_type + '/edge_encoder' not in self.curr_hidden_states: + outputs, self.curr_hidden_states[edge_type + '/edge_encoder'] = self.node_modules[ + edge_type + '/edge_encoder'](joint_history) + else: + outputs, self.curr_hidden_states[edge_type + '/edge_encoder'] = self.node_modules[ + edge_type + '/edge_encoder'](joint_history, self.curr_hidden_states[edge_type + '/edge_encoder']) + + if self.hyperparams['dynamic_edges'] == 'yes': + return outputs[:, 0, :] * combined_edge_masks + else: + return outputs[:, 0, :] # [bs, enc_rnn_dim] + + def encoder_forward(self, inputs, inputs_st, inputs_np, robot_present_and_future=None, maps=None): + # Always predicting with the online model. + mode = ModeKeys.PREDICT + + self.x = self.obtain_encoded_tensors(mode, + inputs, + inputs_st, + inputs_np, + robot_present_and_future, + maps) + self.n_s_t0 = inputs_st[self.node] + + self.latent.p_dist = self.p_z_x(mode, self.x) + + # robot_future_st is optional here since you can use the same one from encoder_forward, + # but if it's given then we'll re-run that part of the model (if the node is adjacent to the robot). + def decoder_forward(self, prediction_horizon, + num_samples, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False): + # Always predicting with the online model. 
+ mode = ModeKeys.PREDICT + + x_nr_t, y_r = None, None + if (self.hyperparams['incl_robot_node'] + and self.robot is not None + and robot_present_and_future is not None): + our_inputs = torch.tensor(self.node.get(np.array([self.node.last_timestep]), + self.state[self.node.type], + padding=0.0), + dtype=torch.float, + device=self.device) + robot_present_and_future_st = get_relative_robot_traj(self.env, self.state, + our_inputs, robot_present_and_future, + self.node.type, self.robot.type) + x_nr_t = robot_present_and_future_st[..., 0, :] + y_r = robot_present_and_future_st[..., 1:, :] + self.x = self.create_encoder_rep(mode, self.TD, x_nr_t, y_r) + self.latent.p_dist = self.p_z_x(mode, self.x) + + # Making sure n_s_t0 has the same batch size as x_nr_t + self.n_s_t0 = self.n_s_t0[[0]].repeat(x_nr_t.size()[0], 1) + + z, num_samples, num_components = self.latent.sample_p(num_samples, + mode, + most_likely_z=z_mode, + full_dist=full_dist, + all_z_sep=all_z_sep) + + y_dist, our_sampled_future = self.p_y_xz(mode, self.x, x_nr_t, y_r, self.n_s_t0, z, + prediction_horizon, + num_samples, + num_components, + gmm_mode) + + return y_dist, our_sampled_future diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_trajectron.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_trajectron.py new file mode 100644 index 000000000..f1c5063be --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/online/online_trajectron.py @@ -0,0 +1,304 @@ +import torch +import numpy as np +from collections import Counter +from model.trajectron import Trajectron +from model.online.online_mgcvae import OnlineMultimodalGenerativeCVAE +from model.model_utils import ModeKeys +from environment import RingBuffer, TemporalSceneGraph, SceneGraph, derivative_of + + +class OnlineTrajectron(Trajectron): + def __init__(self, model_registrar, + hyperparams, device): + super(OnlineTrajectron, self).__init__(model_registrar=model_registrar, + hyperparams=hyperparams, + log_writer=False, + device=device) + self.node_data = dict() + self.scene_graph = None + self.RING_CAPACITY = max(len(self.hyperparams['edge_removal_filter']), + len(self.hyperparams['edge_addition_filter']), + self.hyperparams['maximum_history_length']) + 1 + self.rel_states = dict() + self.removed_nodes = Counter() + + def __repr__(self): + return f"OnlineTrajectron(# nodes: {len(self.nodes)}, device: {self.device}, hyperparameters: {str(self.hyperparams)}) " + + def _add_node_model(self, node): + if node in self.nodes: + raise ValueError('%s was already added to this graph!' % str(node)) + + self.nodes.add(node) + self.node_models_dict[node] = OnlineMultimodalGenerativeCVAE(self.env, + node, + self.model_registrar, + self.hyperparams, + self.device) + + def update_removed_nodes(self): + for node in list(self.removed_nodes.keys()): + if self.removed_nodes[node] >= len(self.hyperparams['edge_removal_filter']): + del self.node_data[node] + del self.removed_nodes[node] + + def _remove_node_model(self, node): + if node not in self.nodes: + raise ValueError('%s is not in this graph!' % str(node)) + + self.nodes.remove(node) + del self.node_models_dict[node] + + def set_environment(self, env, init_timestep=0): + self.env = env + self.scene_graph = SceneGraph(edge_radius=self.env.attention_radius) + self.nodes.clear() + self.node_data.clear() + self.node_models_dict.clear() + + # Fast-forwarding ourselves to the initial timestep, without running any of the underlying models. 
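+        # run_models=False below: only the per-node observation buffers and the scene graph are
+        # brought up to date; no encoder or decoder passes are executed during this replay.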
+ for timestep in range(init_timestep + 1): + self.incremental_forward(self.env.scenes[0].get_clipped_input_dict(timestep, self.hyperparams['state']), + maps=None, run_models=False) + + def incremental_forward(self, new_inputs_dict, + maps, + prediction_horizon=0, + num_samples=0, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False, + run_models=True): + # The way this function works is by appending the new datapoints to the + # ends of each of the LSTMs in the graph. Then, we recalculate the + # encoder's output vector h_x and feed that into the decoder to sample new outputs. + mode = ModeKeys.PREDICT + + # No grad since we're predicting always, as evidenced by the line above. + with torch.no_grad(): + for node, new_input in new_inputs_dict.items(): + if node not in self.node_data: + self.node_data[node] = RingBuffer(capacity=self.RING_CAPACITY, + dtype=(float, sum(len(self.state[node.type][k]) + for k in self.state[node.type]))) + self.node_data[node].append(new_input) + + if node in self.removed_nodes: + del self.removed_nodes[node] + + # Nodes in self.node_data that aren't in new_inputs_dict were just removed. + newly_removed_nodes = (set(self.node_data.keys()) - set(self.removed_nodes.keys())) - set( + new_inputs_dict.keys()) + + # We update self.removed_nodes with the newly removed nodes as well as all existing removed nodes to get + # the time since their last removal increased by one. + self.removed_nodes.update(newly_removed_nodes | set(self.removed_nodes.keys())) + + # For any nodes that are older than the length of the edge_removal_filter, we can safely clear their data. + self.update_removed_nodes() + + # Any remaining removed nodes that aren't yet old enough for data clearing simply have NaNs appended so + # that when it's passed through the LSTMs, the hidden state keeps propagating but the input plays no role + # (the NaNs get converted to zeros later on). + for node in self.removed_nodes: + self.node_data[node].append(np.full((1, self.node_data[node].shape[1]), np.nan)) + + for node in self.node_data: + node.overwrite_data(self.node_data[node], None, + forward_in_time_on_next_overwrite=(self.node_data[node].shape[0] + == self.RING_CAPACITY)) + + temp_scene_dict = {k: v[:, 0:2] for k, v in self.node_data.items()} + if not temp_scene_dict: + new_scene_graph = SceneGraph(edge_radius=self.env.attention_radius) + else: + new_scene_graph = TemporalSceneGraph.create_from_temp_scene_dict( + temp_scene_dict, + self.env.attention_radius, + duration=self.RING_CAPACITY, + edge_addition_filter=self.hyperparams['edge_addition_filter'], + edge_removal_filter=self.hyperparams['edge_removal_filter'], + online=True).to_scene_graph(t=self.RING_CAPACITY - 1) + + if self.hyperparams['dynamic_edges'] == 'yes': + new_nodes, removed_nodes, new_neighbors, removed_neighbors = new_scene_graph - self.scene_graph + + # Aside from updating the scene graph, this for loop updates the graph model + # structure of all affected nodes. + not_removed_nodes = [node for node in self.nodes if node not in removed_nodes] + self.scene_graph = new_scene_graph + for node in not_removed_nodes: + self.node_models_dict[node].update_graph(new_scene_graph, new_neighbors, removed_neighbors) + + # These next 2 for loops add or remove entire node models. 
+ for node in new_nodes: + if (node.is_robot and self.hyperparams['incl_robot_node']) or node.type not in self.pred_state.keys(): + # Only deal with Models for NodeTypes we want to predict + continue + + self._add_node_model(node) + self.node_models_dict[node].update_graph(new_scene_graph, new_neighbors, removed_neighbors) + + for node in removed_nodes: + if (node.is_robot and self.hyperparams['incl_robot_node']) or node.type not in self.pred_state.keys(): + continue + + self._remove_node_model(node) + + # This actually updates the node models with the newly observed data. + if run_models: + inputs = dict() + inputs_st = dict() + inputs_np = dict() + + iter_list = list(self.node_models_dict.keys()) + [node for node in new_inputs_dict + if node.type not in self.pred_state.keys()] + if self.env.scenes[0].robot is not None: + iter_list.append(self.env.scenes[0].robot) + + for node in iter_list: + input_np = node.get(np.array([node.last_timestep, node.last_timestep]), self.state[node.type]) + + _, std = self.env.get_standardize_params(self.state[node.type.name], node.type) + std[0:2] = self.env.attention_radius[(node.type, node.type)] + rel_state = np.zeros_like(input_np) + rel_state[:, 0:2] = input_np[:, 0:2] + input_st = self.env.standardize(input_np, + self.state[node.type.name], + node.type, + mean=rel_state) + self.rel_states[node] = rel_state + + # Converting NaNs to zeros. + input_np[np.isnan(input_np)] = 0 + input_st[np.isnan(input_st)] = 0 + + # Convert to torch tensors + inputs[node] = torch.tensor(input_np, dtype=torch.float, device=self.device) + inputs_st[node] = torch.tensor(input_st, dtype=torch.float, device=self.device) + inputs_np[node] = input_np + + # We want tensors of shape (1, ph + 1, state_dim) where the first 1 is the batch size. + if (self.hyperparams['incl_robot_node'] + and self.env.scenes[0].robot is not None + and robot_present_and_future is not None): + if len(robot_present_and_future.shape) == 2: + robot_present_and_future = robot_present_and_future[np.newaxis, :] + + assert robot_present_and_future.shape[1] == prediction_horizon + 1 + robot_present_and_future = torch.tensor(robot_present_and_future, + dtype=torch.float, device=self.device) + + for node in self.node_models_dict: + self.node_models_dict[node].encoder_forward(inputs, + inputs_st, + inputs_np, + robot_present_and_future, + maps) + + # If num_predicted_timesteps or num_samples == 0 then do not run the decoder at all, + # just update the encoder LSTMs. 
+ if prediction_horizon == 0 or num_samples == 0: + return + + return self.sample_model(prediction_horizon, + num_samples, + robot_present_and_future=robot_present_and_future, + z_mode=z_mode, + gmm_mode=gmm_mode, + full_dist=full_dist, + all_z_sep=all_z_sep) + + def _run_decoder(self, node, + num_predicted_timesteps, + num_samples, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False): + model = self.node_models_dict[node] + prediction_dist, predictions_uns = model.decoder_forward(num_predicted_timesteps, + num_samples, + robot_present_and_future=robot_present_and_future, + z_mode=z_mode, + gmm_mode=gmm_mode, + full_dist=full_dist, + all_z_sep=all_z_sep) + + predictions_np = predictions_uns.cpu().detach().numpy() + + # Return will be of shape (batch_size, num_samples, num_predicted_timesteps, 2) + return prediction_dist, np.transpose(predictions_np, (1, 0, 2, 3)) + + def sample_model(self, num_predicted_timesteps, + num_samples, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False): + # Just start from the encoder output (minus the + # robot future) and get num_samples of + # num_predicted_timesteps-length trajectories. + if num_predicted_timesteps == 0 or num_samples == 0: + return + + mode = ModeKeys.PREDICT + + # We want tensors of shape (1, ph + 1, state_dim) where the first 1 is the batch size. + if self.hyperparams['incl_robot_node'] and self.env.scenes[ + 0].robot is not None and robot_present_and_future is not None: + if len(robot_present_and_future.shape) == 2: + robot_present_and_future = robot_present_and_future[np.newaxis, :] + + assert robot_present_and_future.shape[1] == num_predicted_timesteps + 1 + + # No grad since we're predicting always, as evidenced by the line above. + with torch.no_grad(): + predictions_dict = dict() + prediction_dists = dict() + for node in set(self.nodes) - set(self.removed_nodes.keys()): + if node.is_robot: + continue + + prediction_dists[node], predictions_dict[node] = self._run_decoder(node, num_predicted_timesteps, + num_samples, + robot_present_and_future, + z_mode, + gmm_mode, + full_dist, + all_z_sep) + + return prediction_dists, predictions_dict + + def forward(self, init_env, + init_timestep, + input_dicts, # After the initial environment + num_predicted_timesteps, + num_samples, + robot_present_and_future=None, + z_mode=False, + gmm_mode=False, + full_dist=False, + all_z_sep=False): + # This is the standard forward prediction function, + # if you have some historical data and just want to + # predict forward some number of timesteps. + + # Setting us back to the initial scene graph we had. + self.set_environment(init_env, init_timestep) + + # Looping through and applying updates to the model. 
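+        # Each replay below is encoder-only (prediction_horizon and num_samples keep their
+        # defaults of 0); trajectories are sampled once afterwards via sample_model.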
+ for i in range(len(input_dicts)): + self.incremental_forward(input_dicts[i]) + + return self.sample_model(num_predicted_timesteps, + num_samples, + robot_present_and_future=robot_present_and_future, + z_mode=z_mode, + gmm_mode=gmm_mode, + full_dist=full_dist, + all_z_sep=all_z_sep) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/trajectron.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/trajectron.py new file mode 100644 index 000000000..eccde3eb2 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model/trajectron.py @@ -0,0 +1,201 @@ +import torch +import numpy as np +from model.mgcvae import MultimodalGenerativeCVAE +from model.dataset import get_timesteps_data, restore + + +class Trajectron(torch.nn.Module): + def __init__(self, model_registrar, + hyperparams, log_writer, + device): + super(Trajectron, self).__init__() + self.hyperparams = hyperparams + self.log_writer = log_writer + self.device = device + self.curr_iter = 0 + + self.model_registrar = model_registrar + # self.node_models_dict = dict() + self.node_models_dict = torch.nn.ModuleDict() + self.nodes = set() + + self.env = None + + self.min_ht = self.hyperparams['minimum_history_length'] + self.max_ht = self.hyperparams['maximum_history_length'] + self.ph = self.hyperparams['prediction_horizon'] + self.state = self.hyperparams['state'] + self.state_length = dict() + for state_type in self.state.keys(): + self.state_length[state_type] = int( + np.sum([len(entity_dims) for entity_dims in self.state[state_type].values()]) + ) + self.pred_state = self.hyperparams['pred_state'] + + def eval(self): + super().eval() + for key in self.node_models_dict.keys(): + self.node_models_dict[key].eval() + + def set_environment(self, env): + self.env = env + + self.node_models_dict.clear() + edge_types = env.get_edge_types() + + for node_type in env.NodeType: + # Only add a Model for NodeTypes we want to predict + if node_type in self.pred_state.keys(): + self.node_models_dict[str(node_type)] = MultimodalGenerativeCVAE(env, + node_type, + self.model_registrar, + self.hyperparams, + self.device, + edge_types, + log_writer=self.log_writer) + + def set_curr_iter(self, curr_iter): + self.curr_iter = curr_iter + for node_str, model in self.node_models_dict.items(): + model.set_curr_iter(curr_iter) + + def set_annealing_params(self): + for node_str, model in self.node_models_dict.items(): + model.set_annealing_params() + + def step_annealers(self, node_type=None): + if node_type is None: + for node_type in self.node_models_dict: + self.node_models_dict[node_type].step_annealers() + else: + self.node_models_dict[node_type].step_annealers() + + def train_loss(self, batch, node_type): + (first_history_index, + x_t, y_t, x_st_t, y_st_t, + neighbors_data_st, + neighbors_edge_value, + robot_traj_st_t, + map) = batch + + x = x_t.to(self.device) + y = y_t.to(self.device) + x_st_t = x_st_t.to(self.device) + y_st_t = y_st_t.to(self.device) + if robot_traj_st_t is not None: + robot_traj_st_t = robot_traj_st_t.to(self.device) + if type(map) == torch.Tensor: + map = map.to(self.device) + + # Run forward pass + model = self.node_models_dict[node_type] + loss = model.train_loss(inputs=x, + inputs_st=x_st_t, + first_history_indices=first_history_index, + labels=y, + labels_st=y_st_t, + neighbors=restore(neighbors_data_st), + neighbors_edge_value=restore(neighbors_edge_value), + robot=robot_traj_st_t, + map=map, + prediction_horizon=self.ph) + + return loss + + def eval_loss(self, 
batch, node_type): + (first_history_index, + x_t, y_t, x_st_t, y_st_t, + neighbors_data_st, + neighbors_edge_value, + robot_traj_st_t, + map) = batch + + x = x_t.to(self.device) + y = y_t.to(self.device) + x_st_t = x_st_t.to(self.device) + y_st_t = y_st_t.to(self.device) + if robot_traj_st_t is not None: + robot_traj_st_t = robot_traj_st_t.to(self.device) + if type(map) == torch.Tensor: + map = map.to(self.device) + + # Run forward pass + model = self.node_models_dict[node_type] + nll = model.eval_loss(inputs=x, + inputs_st=x_st_t, + first_history_indices=first_history_index, + labels=y, + labels_st=y_st_t, + neighbors=restore(neighbors_data_st), + neighbors_edge_value=restore(neighbors_edge_value), + robot=robot_traj_st_t, + map=map, + prediction_horizon=self.ph) + + return nll.cpu().detach().numpy() + + def predict(self, + scene, + timesteps, + ph, + num_samples=1, + min_future_timesteps=0, + min_history_timesteps=1, + z_mode=False, + gmm_mode=False, + full_dist=True, + all_z_sep=False): + + predictions_dict = {} + for node_type in self.env.NodeType: + if node_type not in self.pred_state: + continue + + model = self.node_models_dict[node_type] + + # Get Input data for node type and given timesteps + batch = get_timesteps_data(env=self.env, scene=scene, t=timesteps, node_type=node_type, state=self.state, + pred_state=self.pred_state, edge_types=model.edge_types, + min_ht=min_history_timesteps, max_ht=self.max_ht, min_ft=min_future_timesteps, + max_ft=min_future_timesteps, hyperparams=self.hyperparams) + # There are no nodes of type present for timestep + if batch is None: + continue + (first_history_index, + x_t, y_t, x_st_t, y_st_t, + neighbors_data_st, + neighbors_edge_value, + robot_traj_st_t, + map), nodes, timesteps_o = batch + + x = x_t.to(self.device) + x_st_t = x_st_t.to(self.device) + if robot_traj_st_t is not None: + robot_traj_st_t = robot_traj_st_t.to(self.device) + if type(map) == torch.Tensor: + map = map.to(self.device) + + # Run forward pass + predictions = model.predict(inputs=x, + inputs_st=x_st_t, + first_history_indices=first_history_index, + neighbors=neighbors_data_st, + neighbors_edge_value=neighbors_edge_value, + robot=robot_traj_st_t, + map=map, + prediction_horizon=ph, + num_samples=num_samples, + z_mode=z_mode, + gmm_mode=gmm_mode, + full_dist=full_dist, + all_z_sep=all_z_sep) + + predictions_np = predictions.cpu().detach().numpy() + + # Assign predictions to node + for i, ts in enumerate(timesteps_o): + if ts not in predictions_dict.keys(): + predictions_dict[ts] = dict() + predictions_dict[ts][nodes[i]] = np.transpose(predictions_np[:, [i]], (1, 0, 2, 3)) + + return predictions_dict diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/model_dir/config.json b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model_dir/config.json new file mode 100644 index 000000000..f38943d36 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/model_dir/config.json @@ -0,0 +1 @@ +{"batch_size": 256, "grad_clip": 1.0, "learning_rate_style": "exp", "learning_rate": 0.001, "min_learning_rate": 1e-05, "learning_decay_rate": 0.9999, "prediction_horizon": 12, "minimum_history_length": 1, "maximum_history_length": 8, "map_encoder": {"PEDESTRIAN": {"heading_state_index": 5, "patch_size": [50, 10, 50, 90], "map_channels": 3, "hidden_channels": [10, 20, 10, 1], "output_size": 32, "masks": [5, 5, 5, 5], "strides": [1, 1, 1, 1], "dropout": 0.5}}, "k": 1, "k_eval": 1, "kl_min": 0.07, "kl_weight": 100.0, "kl_weight_start": 0, 
"kl_decay_rate": 0.99995, "kl_crossover": 400, "kl_sigmoid_divisor": 4, "rnn_kwargs": {"dropout_keep_prob": 0.75}, "MLP_dropout_keep_prob": 0.9, "enc_rnn_dim_edge": 32, "enc_rnn_dim_edge_influence": 32, "enc_rnn_dim_history": 32, "enc_rnn_dim_future": 32, "dec_rnn_dim": 128, "q_z_xy_MLP_dims": null, "p_z_x_MLP_dims": 32, "GMM_components": 1, "log_p_yt_xz_max": 6, "N": 1, "K": 25, "tau_init": 2.0, "tau_final": 0.05, "tau_decay_rate": 0.997, "use_z_logit_clipping": true, "z_logit_clip_start": 0.05, "z_logit_clip_final": 5.0, "z_logit_clip_crossover": 300, "z_logit_clip_divisor": 5, "dynamic": {"PEDESTRIAN": {"name": "SingleIntegrator", "distribution": true, "limits": {}}}, "state": {"PEDESTRIAN": {"position": ["x", "y"], "velocity": ["x", "y"], "acceleration": ["x", "y"]}}, "pred_state": {"PEDESTRIAN": {"position": ["x", "y"]}}, "log_histograms": false, "scene_freq_mult_eval": false, "node_freq_mult_eval": false, "edge_encoding": false, "incl_robot_node": false, "use_map_encoding": false} \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/test_data_structures.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/test_data_structures.py new file mode 100644 index 000000000..6da777c01 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test/test_data_structures.py @@ -0,0 +1,48 @@ +import numpy as np +import pandas as pd +from data import SingleHeaderNumpyArray, DoubleHeaderNumpyArray + + +def test_single_header_numpy_array(): + x = np.random.rand(10) + y = np.random.rand(10) + + array = SingleHeaderNumpyArray(np.stack((x, y), axis=-1), ['x', 'y']) + + assert (array[:, 'x'] == x).all() + assert (array[:, 'y'] == y).all() + assert (array[3:7, 'y'] == y[3:7]).all() + assert (array.x == x).all() + assert (array.y == y).all() + + +def test_double_header_numpy_array(): + x = np.random.rand(10) + y = np.random.rand(10) + vx = np.random.rand(10) + vy = np.random.rand(10) + + data_dict = {('position', 'x'): x, + ('position', 'y'): y, + ('velocity', 'x'): vx, + ('velocity', 'y'): vy} + + data_columns = pd.MultiIndex.from_product([['position', 'velocity'], ['x', 'y']]) + + node_data = pd.DataFrame(data_dict, columns=data_columns) + + array = DoubleHeaderNumpyArray(node_data.values, list(node_data.columns)) + + test_header_dict = {'position': ['x', 'y'], 'velocity': ['y']} + + assert (array[:, ('position', 'x')] == x).all() + assert (array[:, ('velocity', 'y')] == vy).all() + assert (array[4:7, ('velocity', 'y')] == vy[4:7]).all() + assert (array[:, [('position', 'x'), ('velocity', 'y')]] == np.stack((x, vy), axis=-1)).all() + assert (array[:, [('position', 'y'), ('velocity', 'x')]] == np.stack((y, vx), axis=-1)).all() + assert (array[2:6, [('position', 'y'), ('velocity', 'x')]] == np.stack((y, vx), axis=-1)[2:6]).all() + assert (array[:, test_header_dict] == np.stack((x, y, vy), axis=-1)).all() + assert (array[1:8, test_header_dict] == np.stack((x, y, vy), axis=-1)[1:8]).all() + assert (array.position.x == x).all() + + diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/test_online.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test_online.py new file mode 100644 index 000000000..3e6cae7e4 --- /dev/null +++ 
b/forge/test/models/pytorch/multimodal/trajectron/trajectron/test_online.py @@ -0,0 +1,238 @@ +import os +import time +import json +import torch +import dill +import random +import pathlib +import evaluation +import numpy as np +import visualization as vis +from argument_parser import args +from model.online.online_trajectron import OnlineTrajectron +from model.model_registrar import ModelRegistrar +from environment import Environment, Scene +import matplotlib.pyplot as plt + +if not torch.cuda.is_available() or args.device == 'cpu': + args.device = torch.device('cpu') +else: + if torch.cuda.device_count() == 1: + # If you have CUDA_VISIBLE_DEVICES set, which you should, + # then this will prevent leftover flag arguments from + # messing with the device allocation. + args.device = 'cuda:0' + + args.device = torch.device(args.device) + +if args.eval_device is None: + args.eval_device = 'cpu' + +if args.seed is not None: + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(args.seed) + + +def create_online_env(env, hyperparams, scene_idx, init_timestep): + test_scene = env.scenes[scene_idx] + + online_scene = Scene(timesteps=init_timestep + 1, + map=test_scene.map, + dt=test_scene.dt) + online_scene.nodes = test_scene.get_nodes_clipped_at_time( + timesteps=np.arange(init_timestep - hyperparams['maximum_history_length'], + init_timestep + 1), + state=hyperparams['state']) + online_scene.robot = test_scene.robot + online_scene.calculate_scene_graph(attention_radius=env.attention_radius, + edge_addition_filter=hyperparams['edge_addition_filter'], + edge_removal_filter=hyperparams['edge_removal_filter']) + + return Environment(node_type_list=env.node_type_list, + standardization=env.standardization, + scenes=[online_scene], + attention_radius=env.attention_radius, + robot_type=env.robot_type) + + +def get_maps_for_input(input_dict, scene, hyperparams): + scene_maps = list() + scene_pts = list() + heading_angles = list() + patch_sizes = list() + nodes_with_maps = list() + for node in input_dict: + if node.type in hyperparams['map_encoder']: + x = input_dict[node] + me_hyp = hyperparams['map_encoder'][node.type] + if 'heading_state_index' in me_hyp: + heading_state_index = me_hyp['heading_state_index'] + # We have to rotate the map in the opposit direction of the agent to match them + if type(heading_state_index) is list: # infer from velocity or heading vector + heading_angle = -np.arctan2(x[-1, heading_state_index[1]], + x[-1, heading_state_index[0]]) * 180 / np.pi + else: + heading_angle = -x[-1, heading_state_index] * 180 / np.pi + else: + heading_angle = None + + scene_map = scene.map[node.type] + map_point = x[-1, :2] + + patch_size = hyperparams['map_encoder'][node.type]['patch_size'] + + scene_maps.append(scene_map) + scene_pts.append(map_point) + heading_angles.append(heading_angle) + patch_sizes.append(patch_size) + nodes_with_maps.append(node) + + if heading_angles[0] is None: + heading_angles = None + else: + heading_angles = torch.Tensor(heading_angles) + + maps = scene_maps[0].get_cropped_maps_from_scene_map_batch(scene_maps, + scene_pts=torch.Tensor(scene_pts), + patch_size=patch_sizes[0], + rotation=heading_angles) + + maps_dict = {node: maps[[i]] for i, node in enumerate(nodes_with_maps)} + return maps_dict + + +def main(): + # Choose one of the model directory names under the experiment/*/models folders. 
+ # Possibilities are 'vel_ee', 'int_ee', 'int_ee_me', or 'robot' + model_dir = os.path.join(args.log_dir, 'int_ee') + + # Load hyperparameters from json + config_file = os.path.join(model_dir, args.conf) + if not os.path.exists(config_file): + raise ValueError('Config json not found!') + with open(config_file, 'r') as conf_json: + hyperparams = json.load(conf_json) + + # Add hyperparams from arguments + hyperparams['dynamic_edges'] = args.dynamic_edges + hyperparams['edge_state_combine_method'] = args.edge_state_combine_method + hyperparams['edge_influence_combine_method'] = args.edge_influence_combine_method + hyperparams['edge_addition_filter'] = args.edge_addition_filter + hyperparams['edge_removal_filter'] = args.edge_removal_filter + hyperparams['batch_size'] = args.batch_size + hyperparams['k_eval'] = args.k_eval + hyperparams['offline_scene_graph'] = args.offline_scene_graph + hyperparams['incl_robot_node'] = args.incl_robot_node + hyperparams['edge_encoding'] = not args.no_edge_encoding + hyperparams['use_map_encoding'] = args.map_encoding + + output_save_dir = os.path.join(model_dir, 'pred_figs') + pathlib.Path(output_save_dir).mkdir(parents=True, exist_ok=True) + + eval_data_path = os.path.join(args.data_dir, args.eval_data_dict) + with open(eval_data_path, 'rb') as f: + eval_env = dill.load(f, encoding='latin1') + + if eval_env.robot_type is None and hyperparams['incl_robot_node']: + eval_env.robot_type = eval_env.NodeType[0] # TODO: Make more general, allow the user to specify? + for scene in eval_env.scenes: + scene.add_robot_from_nodes(eval_env.robot_type) + + print('Loaded data from %s' % (eval_data_path,)) + + # Creating a dummy environment with a single scene that contains information about the world. + # When using this code, feel free to use whichever scene index or initial timestep you wish. + scene_idx = 0 + + # You need to have at least acceleration, so you want 2 timesteps of prior data, e.g. [0, 1], + # so that you can immediately start incremental inference from the 3rd timestep onwards. + init_timestep = 1 + + eval_scene = eval_env.scenes[scene_idx] + online_env = create_online_env(eval_env, hyperparams, scene_idx, init_timestep) + + model_registrar = ModelRegistrar(model_dir, args.eval_device) + model_registrar.load_models(iter_num=12) + + trajectron = OnlineTrajectron(model_registrar, + hyperparams, + args.eval_device) + + # If you want to see what different robot futures do to the predictions, uncomment this line as well as + # related "... += adjustment" lines below. + # adjustment = np.stack([np.arange(13)/float(i*2.0) for i in range(6, 12)], axis=1) + + # Here's how you'd incrementally run the model, e.g. with streaming data. 
+ trajectron.set_environment(online_env, init_timestep) + + for timestep in range(init_timestep + 1, eval_scene.timesteps): + input_dict = eval_scene.get_clipped_input_dict(timestep, hyperparams['state']) + + maps = None + if hyperparams['use_map_encoding']: + maps = get_maps_for_input(input_dict, eval_scene, hyperparams) + + robot_present_and_future = None + if eval_scene.robot is not None and hyperparams['incl_robot_node']: + robot_present_and_future = eval_scene.robot.get(np.array([timestep, + timestep + hyperparams['prediction_horizon']]), + hyperparams['state'][eval_scene.robot.type], + padding=0.0) + robot_present_and_future = np.stack([robot_present_and_future, robot_present_and_future], axis=0) + # robot_present_and_future += adjustment + + start = time.time() + dists, preds = trajectron.incremental_forward(input_dict, + maps, + prediction_horizon=6, + num_samples=1, + robot_present_and_future=robot_present_and_future, + full_dist=True) + end = time.time() + print("t=%d: took %.2f s (= %.2f Hz) w/ %d nodes and %d edges" % (timestep, end - start, + 1. / (end - start), len(trajectron.nodes), + trajectron.scene_graph.get_num_edges())) + + detailed_preds_dict = dict() + for node in eval_scene.nodes: + if node in preds: + detailed_preds_dict[node] = preds[node] + + fig, ax = plt.subplots() + vis.visualize_distribution(ax, + dists) + vis.visualize_prediction(ax, + {timestep: preds}, + eval_scene.dt, + hyperparams['maximum_history_length'], + hyperparams['prediction_horizon']) + + if eval_scene.robot is not None and hyperparams['incl_robot_node']: + robot_for_plotting = eval_scene.robot.get(np.array([timestep, + timestep + hyperparams['prediction_horizon']]), + hyperparams['state'][eval_scene.robot.type]) + # robot_for_plotting += adjustment + + ax.plot(robot_for_plotting[1:, 1], robot_for_plotting[1:, 0], + color='r', + linewidth=1.0, alpha=1.0) + + # Current Node Position + circle = plt.Circle((robot_for_plotting[0, 1], + robot_for_plotting[0, 0]), + 0.3, + facecolor='r', + edgecolor='k', + lw=0.5, + zorder=3) + ax.add_artist(circle) + + fig.savefig(os.path.join(output_save_dir, f'pred_{timestep}.pdf'), dpi=300) + plt.close(fig) + + +if __name__ == '__main__': + main() diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/train.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/train.py new file mode 100644 index 000000000..0e40b5bdc --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/train.py @@ -0,0 +1,440 @@ +import torch +from torch import nn, optim, utils +import numpy as np +import os +import time +import dill +import json +import random +import pathlib +import warnings +from tqdm import tqdm +import visualization +import evaluation +import matplotlib.pyplot as plt +from argument_parser import args +from model.trajectron import Trajectron +from model.model_registrar import ModelRegistrar +from model.model_utils import cyclical_lr +from model.dataset import EnvironmentDataset, collate +from tensorboardX import SummaryWriter +# torch.autograd.set_detect_anomaly(True) + +if not torch.cuda.is_available() or args.device == 'cpu': + args.device = torch.device('cpu') +else: + if torch.cuda.device_count() == 1: + # If you have CUDA_VISIBLE_DEVICES set, which you should, + # then this will prevent leftover flag arguments from + # messing with the device allocation. 
+ args.device = 'cuda:0' + + args.device = torch.device(args.device) + +if args.eval_device is None: + args.eval_device = torch.device('cpu') + +# This is needed for memory pinning using a DataLoader (otherwise memory is pinned to cuda:0 by default) +torch.cuda.set_device(args.device) + +if args.seed is not None: + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(args.seed) + + +def main(): + # Load hyperparameters from json + if not os.path.exists(args.conf): + print('Config json not found!') + with open(args.conf, 'r', encoding='utf-8') as conf_json: + hyperparams = json.load(conf_json) + + # Add hyperparams from arguments + hyperparams['dynamic_edges'] = args.dynamic_edges + hyperparams['edge_state_combine_method'] = args.edge_state_combine_method + hyperparams['edge_influence_combine_method'] = args.edge_influence_combine_method + hyperparams['edge_addition_filter'] = args.edge_addition_filter + hyperparams['edge_removal_filter'] = args.edge_removal_filter + hyperparams['batch_size'] = args.batch_size + hyperparams['k_eval'] = args.k_eval + hyperparams['offline_scene_graph'] = args.offline_scene_graph + hyperparams['incl_robot_node'] = args.incl_robot_node + hyperparams['node_freq_mult_train'] = args.node_freq_mult_train + hyperparams['node_freq_mult_eval'] = args.node_freq_mult_eval + hyperparams['scene_freq_mult_train'] = args.scene_freq_mult_train + hyperparams['scene_freq_mult_eval'] = args.scene_freq_mult_eval + hyperparams['scene_freq_mult_viz'] = args.scene_freq_mult_viz + hyperparams['edge_encoding'] = not args.no_edge_encoding + hyperparams['use_map_encoding'] = args.map_encoding + hyperparams['augment'] = args.augment + hyperparams['override_attention_radius'] = args.override_attention_radius + + print('-----------------------') + print('| TRAINING PARAMETERS |') + print('-----------------------') + print('| batch_size: %d' % args.batch_size) + print('| device: %s' % args.device) + print('| eval_device: %s' % args.eval_device) + print('| Offline Scene Graph Calculation: %s' % args.offline_scene_graph) + print('| EE state_combine_method: %s' % args.edge_state_combine_method) + print('| EIE scheme: %s' % args.edge_influence_combine_method) + print('| dynamic_edges: %s' % args.dynamic_edges) + print('| robot node: %s' % args.incl_robot_node) + print('| edge_addition_filter: %s' % args.edge_addition_filter) + print('| edge_removal_filter: %s' % args.edge_removal_filter) + print('| MHL: %s' % hyperparams['minimum_history_length']) + print('| PH: %s' % hyperparams['prediction_horizon']) + print('-----------------------') + + log_writer = None + model_dir = None + if not args.debug: + # Create the log and model directiory if they're not present. 
+ model_dir = os.path.join(args.log_dir, + 'models_' + time.strftime('%d_%b_%Y_%H_%M_%S', time.localtime()) + args.log_tag) + pathlib.Path(model_dir).mkdir(parents=True, exist_ok=True) + + # Save config to model directory + with open(os.path.join(model_dir, 'config.json'), 'w') as conf_json: + json.dump(hyperparams, conf_json) + + log_writer = SummaryWriter(log_dir=model_dir) + + # Load training and evaluation environments and scenes + train_scenes = [] + train_data_path = os.path.join(args.data_dir, args.train_data_dict) + with open(train_data_path, 'rb') as f: + train_env = dill.load(f, encoding='latin1') + + for attention_radius_override in args.override_attention_radius: + node_type1, node_type2, attention_radius = attention_radius_override.split(' ') + train_env.attention_radius[(node_type1, node_type2)] = float(attention_radius) + + if train_env.robot_type is None and hyperparams['incl_robot_node']: + train_env.robot_type = train_env.NodeType[0] # TODO: Make more general, allow the user to specify? + for scene in train_env.scenes: + scene.add_robot_from_nodes(train_env.robot_type) + + train_scenes = train_env.scenes + train_scenes_sample_probs = train_env.scenes_freq_mult_prop if args.scene_freq_mult_train else None + + train_dataset = EnvironmentDataset(train_env, + hyperparams['state'], + hyperparams['pred_state'], + scene_freq_mult=hyperparams['scene_freq_mult_train'], + node_freq_mult=hyperparams['node_freq_mult_train'], + hyperparams=hyperparams, + min_history_timesteps=hyperparams['minimum_history_length'], + min_future_timesteps=hyperparams['prediction_horizon'], + return_robot=not args.incl_robot_node) + train_data_loader = dict() + for node_type_data_set in train_dataset: + if len(node_type_data_set) == 0: + continue + + node_type_dataloader = utils.data.DataLoader(node_type_data_set, + collate_fn=collate, + pin_memory=False if args.device is 'cpu' else True, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.preprocess_workers) + train_data_loader[node_type_data_set.node_type] = node_type_dataloader + + print(f"Loaded training data from {train_data_path}") + + eval_scenes = [] + eval_scenes_sample_probs = None + if args.eval_every is not None: + eval_data_path = os.path.join(args.data_dir, args.eval_data_dict) + with open(eval_data_path, 'rb') as f: + eval_env = dill.load(f, encoding='latin1') + + for attention_radius_override in args.override_attention_radius: + node_type1, node_type2, attention_radius = attention_radius_override.split(' ') + eval_env.attention_radius[(node_type1, node_type2)] = float(attention_radius) + + if eval_env.robot_type is None and hyperparams['incl_robot_node']: + eval_env.robot_type = eval_env.NodeType[0] # TODO: Make more general, allow the user to specify? 
+ for scene in eval_env.scenes: + scene.add_robot_from_nodes(eval_env.robot_type) + + eval_scenes = eval_env.scenes + eval_scenes_sample_probs = eval_env.scenes_freq_mult_prop if args.scene_freq_mult_eval else None + + eval_dataset = EnvironmentDataset(eval_env, + hyperparams['state'], + hyperparams['pred_state'], + scene_freq_mult=hyperparams['scene_freq_mult_eval'], + node_freq_mult=hyperparams['node_freq_mult_eval'], + hyperparams=hyperparams, + min_history_timesteps=hyperparams['minimum_history_length'], + min_future_timesteps=hyperparams['prediction_horizon'], + return_robot=not args.incl_robot_node) + eval_data_loader = dict() + for node_type_data_set in eval_dataset: + if len(node_type_data_set) == 0: + continue + + node_type_dataloader = utils.data.DataLoader(node_type_data_set, + collate_fn=collate, + pin_memory=False if args.eval_device is 'cpu' else True, + batch_size=args.eval_batch_size, + shuffle=True, + num_workers=args.preprocess_workers) + eval_data_loader[node_type_data_set.node_type] = node_type_dataloader + + print(f"Loaded evaluation data from {eval_data_path}") + + # Offline Calculate Scene Graph + if hyperparams['offline_scene_graph'] == 'yes': + print(f"Offline calculating scene graphs") + for i, scene in enumerate(train_scenes): + scene.calculate_scene_graph(train_env.attention_radius, + hyperparams['edge_addition_filter'], + hyperparams['edge_removal_filter']) + print(f"Created Scene Graph for Training Scene {i}") + + for i, scene in enumerate(eval_scenes): + scene.calculate_scene_graph(eval_env.attention_radius, + hyperparams['edge_addition_filter'], + hyperparams['edge_removal_filter']) + print(f"Created Scene Graph for Evaluation Scene {i}") + + model_registrar = ModelRegistrar(model_dir, args.device) + + trajectron = Trajectron(model_registrar, + hyperparams, + log_writer, + args.device) + + trajectron.set_environment(train_env) + trajectron.set_annealing_params() + print('Created Training Model.') + + eval_trajectron = None + if args.eval_every is not None or args.vis_every is not None: + eval_trajectron = Trajectron(model_registrar, + hyperparams, + log_writer, + args.eval_device) + eval_trajectron.set_environment(eval_env) + eval_trajectron.set_annealing_params() + print('Created Evaluation Model.') + + optimizer = dict() + lr_scheduler = dict() + for node_type in train_env.NodeType: + if node_type not in hyperparams['pred_state']: + continue + optimizer[node_type] = optim.Adam([{'params': model_registrar.get_all_but_name_match('map_encoder').parameters()}, + {'params': model_registrar.get_name_match('map_encoder').parameters(), 'lr':0.0008}], lr=hyperparams['learning_rate']) + # Set Learning Rate + if hyperparams['learning_rate_style'] == 'const': + lr_scheduler[node_type] = optim.lr_scheduler.ExponentialLR(optimizer[node_type], gamma=1.0) + elif hyperparams['learning_rate_style'] == 'exp': + lr_scheduler[node_type] = optim.lr_scheduler.ExponentialLR(optimizer[node_type], + gamma=hyperparams['learning_decay_rate']) + + ################################# + # TRAINING # + ################################# + curr_iter_node_type = {node_type: 0 for node_type in train_data_loader.keys()} + for epoch in range(1, args.train_epochs + 1): + model_registrar.to(args.device) + train_dataset.augment = args.augment + for node_type, data_loader in train_data_loader.items(): + curr_iter = curr_iter_node_type[node_type] + pbar = tqdm(data_loader, ncols=80) + for batch in pbar: + trajectron.set_curr_iter(curr_iter) + trajectron.step_annealers(node_type) + 
optimizer[node_type].zero_grad() + train_loss = trajectron.train_loss(batch, node_type) + pbar.set_description(f"Epoch {epoch}, {node_type} L: {train_loss.item():.2f}") + train_loss.backward() + # Clipping gradients. + if hyperparams['grad_clip'] is not None: + nn.utils.clip_grad_value_(model_registrar.parameters(), hyperparams['grad_clip']) + optimizer[node_type].step() + + # Stepping forward the learning rate scheduler and annealers. + lr_scheduler[node_type].step() + + if not args.debug: + log_writer.add_scalar(f"{node_type}/train/learning_rate", + lr_scheduler[node_type].get_lr()[0], + curr_iter) + log_writer.add_scalar(f"{node_type}/train/loss", train_loss, curr_iter) + + curr_iter += 1 + curr_iter_node_type[node_type] = curr_iter + train_dataset.augment = False + if args.eval_every is not None or args.vis_every is not None: + eval_trajectron.set_curr_iter(epoch) + + ################################# + # VISUALIZATION # + ################################# + if args.vis_every is not None and not args.debug and epoch % args.vis_every == 0 and epoch > 0: + max_hl = hyperparams['maximum_history_length'] + ph = hyperparams['prediction_horizon'] + with torch.no_grad(): + # Predict random timestep to plot for train data set + if args.scene_freq_mult_viz: + scene = np.random.choice(train_scenes, p=train_scenes_sample_probs) + else: + scene = np.random.choice(train_scenes) + timestep = scene.sample_timesteps(1, min_future_timesteps=ph) + predictions = trajectron.predict(scene, + timestep, + ph, + min_future_timesteps=ph, + z_mode=True, + gmm_mode=True, + all_z_sep=False, + full_dist=False) + + # Plot predicted timestep for random scene + fig, ax = plt.subplots(figsize=(10, 10)) + visualization.visualize_prediction(ax, + predictions, + scene.dt, + max_hl=max_hl, + ph=ph, + map=scene.map['VISUALIZATION'] if scene.map is not None else None) + ax.set_title(f"{scene.name}-t: {timestep}") + log_writer.add_figure('train/prediction', fig, epoch) + + model_registrar.to(args.eval_device) + # Predict random timestep to plot for eval data set + if args.scene_freq_mult_viz: + scene = np.random.choice(eval_scenes, p=eval_scenes_sample_probs) + else: + scene = np.random.choice(eval_scenes) + timestep = scene.sample_timesteps(1, min_future_timesteps=ph) + predictions = eval_trajectron.predict(scene, + timestep, + ph, + num_samples=20, + min_future_timesteps=ph, + z_mode=False, + full_dist=False) + + # Plot predicted timestep for random scene + fig, ax = plt.subplots(figsize=(10, 10)) + visualization.visualize_prediction(ax, + predictions, + scene.dt, + max_hl=max_hl, + ph=ph, + map=scene.map['VISUALIZATION'] if scene.map is not None else None) + ax.set_title(f"{scene.name}-t: {timestep}") + log_writer.add_figure('eval/prediction', fig, epoch) + + # Predict random timestep to plot for eval data set + predictions = eval_trajectron.predict(scene, + timestep, + ph, + min_future_timesteps=ph, + z_mode=True, + gmm_mode=True, + all_z_sep=True, + full_dist=False) + + # Plot predicted timestep for random scene + fig, ax = plt.subplots(figsize=(10, 10)) + visualization.visualize_prediction(ax, + predictions, + scene.dt, + max_hl=max_hl, + ph=ph, + map=scene.map['VISUALIZATION'] if scene.map is not None else None) + ax.set_title(f"{scene.name}-t: {timestep}") + log_writer.add_figure('eval/prediction_all_z', fig, epoch) + + ################################# + # EVALUATION # + ################################# + if args.eval_every is not None and not args.debug and epoch % args.eval_every == 0 and epoch > 0: + max_hl = 
hyperparams['maximum_history_length'] + ph = hyperparams['prediction_horizon'] + model_registrar.to(args.eval_device) + with torch.no_grad(): + # Calculate evaluation loss + for node_type, data_loader in eval_data_loader.items(): + eval_loss = [] + print(f"Starting Evaluation @ epoch {epoch} for node type: {node_type}") + pbar = tqdm(data_loader, ncols=80) + for batch in pbar: + eval_loss_node_type = eval_trajectron.eval_loss(batch, node_type) + pbar.set_description(f"Epoch {epoch}, {node_type} L: {eval_loss_node_type.item():.2f}") + eval_loss.append({node_type: {'nll': [eval_loss_node_type]}}) + del batch + + evaluation.log_batch_errors(eval_loss, + log_writer, + f"{node_type}/eval_loss", + epoch) + + # Predict batch timesteps for evaluation dataset evaluation + eval_batch_errors = [] + for scene in tqdm(eval_scenes, desc='Sample Evaluation', ncols=80): + timesteps = scene.sample_timesteps(args.eval_batch_size) + + predictions = eval_trajectron.predict(scene, + timesteps, + ph, + num_samples=50, + min_future_timesteps=ph, + full_dist=False) + + eval_batch_errors.append(evaluation.compute_batch_statistics(predictions, + scene.dt, + max_hl=max_hl, + ph=ph, + node_type_enum=eval_env.NodeType, + map=scene.map)) + + evaluation.log_batch_errors(eval_batch_errors, + log_writer, + 'eval', + epoch, + bar_plot=['kde'], + box_plot=['ade', 'fde']) + + # Predict maximum likelihood batch timesteps for evaluation dataset evaluation + eval_batch_errors_ml = [] + for scene in tqdm(eval_scenes, desc='MM Evaluation', ncols=80): + timesteps = scene.sample_timesteps(scene.timesteps) + + predictions = eval_trajectron.predict(scene, + timesteps, + ph, + num_samples=1, + min_future_timesteps=ph, + z_mode=True, + gmm_mode=True, + full_dist=False) + + eval_batch_errors_ml.append(evaluation.compute_batch_statistics(predictions, + scene.dt, + max_hl=max_hl, + ph=ph, + map=scene.map, + node_type_enum=eval_env.NodeType, + kde=False)) + + evaluation.log_batch_errors(eval_batch_errors_ml, + log_writer, + 'eval/ml', + epoch) + + if args.save_every is not None and args.debug is False and epoch % args.save_every == 0: + model_registrar.save_models(epoch) + + +if __name__ == '__main__': + main() diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/__init__.py new file mode 100644 index 000000000..b50caf009 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/__init__.py @@ -0,0 +1,3 @@ +from .trajectory_utils import prediction_output_to_trajectories +from .matrix_utils import block_diag, tile +from .os_utils import maybe_makedirs \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/matrix_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/matrix_utils.py new file mode 100644 index 000000000..87c4efac3 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/matrix_utils.py @@ -0,0 +1,41 @@ +import numpy as np +import torch + + +def attach_dim(v, n_dim_to_prepend=0, n_dim_to_append=0): + return v.reshape( + torch.Size([1] * n_dim_to_prepend) + + v.shape + + torch.Size([1] * n_dim_to_append)) + + +def block_diag(m): + """ + Make a block diagonal matrix along dim=-3 + EXAMPLE: + block_diag(torch.ones(4,3,2)) + should give a 12 x 8 matrix with blocks of 3 x 2 ones. + Prepend batch dimensions if needed. + You can also give a list of matrices. 
+ :type m: torch.Tensor, list + :rtype: torch.Tensor + """ + if type(m) is list: + m = torch.cat([m1.unsqueeze(-3) for m1 in m], -3) + + d = m.dim() + n = m.shape[-3] + siz0 = m.shape[:-3] + siz1 = m.shape[-2:] + m2 = m.unsqueeze(-2) + eye = attach_dim(torch.eye(n, device=m.device).unsqueeze(-2), d - 3, 1) + return (m2 * eye).reshape(siz0 + torch.Size(torch.tensor(siz1) * n)) + + +def tile(a, dim, n_tile, device='cpu'): + init_dim = a.size(dim) + repeat_idx = [1] * a.dim() + repeat_idx[dim] = n_tile + a = a.repeat(*(repeat_idx)) + order_index = torch.LongTensor(np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)])).to(device) + return torch.index_select(a, dim, order_index) \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/os_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/os_utils.py new file mode 100644 index 000000000..038342680 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/os_utils.py @@ -0,0 +1,16 @@ +import os + + +def maybe_makedirs(path_to_create): + """This function will create a directory, unless it exists already, + at which point the function will return. + The exception handling is necessary as it prevents a race condition + from occurring. + Inputs: + path_to_create - A string path to a directory you'd like created. + """ + try: + os.makedirs(path_to_create) + except OSError: + if not os.path.isdir(path_to_create): + raise \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/trajectory_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/trajectory_utils.py new file mode 100644 index 000000000..e355822cf --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/utils/trajectory_utils.py @@ -0,0 +1,48 @@ +import numpy as np + + +def prediction_output_to_trajectories(prediction_output_dict, + dt, + max_h, + ph, + map=None, + prune_ph_to_future=False): + + prediction_timesteps = prediction_output_dict.keys() + + output_dict = dict() + histories_dict = dict() + futures_dict = dict() + + for t in prediction_timesteps: + histories_dict[t] = dict() + output_dict[t] = dict() + futures_dict[t] = dict() + prediction_nodes = prediction_output_dict[t].keys() + for node in prediction_nodes: + predictions_output = prediction_output_dict[t][node] + position_state = {'position': ['x', 'y']} + + history = node.get(np.array([t - max_h, t]), position_state) # History includes current pos + history = history[~np.isnan(history.sum(axis=1))] + + future = node.get(np.array([t + 1, t + ph]), position_state) + future = future[~np.isnan(future.sum(axis=1))] + + if prune_ph_to_future: + predictions_output = predictions_output[:, :, :future.shape[0]] + if predictions_output.shape[2] == 0: + continue + + trajectory = predictions_output + + if map is None: + histories_dict[t][node] = history + output_dict[t][node] = trajectory + futures_dict[t][node] = future + else: + histories_dict[t][node] = map.to_map_points(history) + output_dict[t][node] = map.to_map_points(trajectory) + futures_dict[t][node] = map.to_map_points(future) + + return output_dict, histories_dict, futures_dict diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/__init__.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/__init__.py new file mode 100644 index 000000000..1f92021f3 --- /dev/null +++ 
b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/__init__.py @@ -0,0 +1,2 @@ +from .visualization import visualize_prediction, visualize_distribution +from .visualization_utils import plot_boxplots \ No newline at end of file diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization.py new file mode 100644 index 000000000..08e1fef90 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization.py @@ -0,0 +1,130 @@ +from utils import prediction_output_to_trajectories +from scipy import linalg +import matplotlib.pyplot as plt +import matplotlib.patches as patches +import matplotlib.patheffects as pe +import numpy as np +import seaborn as sns + + +def plot_trajectories(ax, + prediction_dict, + histories_dict, + futures_dict, + line_alpha=0.7, + line_width=0.2, + edge_width=2, + circle_edge_width=0.5, + node_circle_size=0.3, + batch_num=0, + kde=False): + + cmap = ['k', 'b', 'y', 'g', 'r'] + + for node in histories_dict: + history = histories_dict[node] + future = futures_dict[node] + predictions = prediction_dict[node] + + if np.isnan(history[-1]).any(): + continue + + ax.plot(history[:, 0], history[:, 1], 'k--') + + for sample_num in range(prediction_dict[node].shape[1]): + + if kde and predictions.shape[1] >= 50: + line_alpha = 0.2 + for t in range(predictions.shape[2]): + sns.kdeplot(predictions[batch_num, :, t, 0], predictions[batch_num, :, t, 1], + ax=ax, shade=True, shade_lowest=False, + color=np.random.choice(cmap), alpha=0.8) + + ax.plot(predictions[batch_num, sample_num, :, 0], predictions[batch_num, sample_num, :, 1], + color=cmap[node.type.value], + linewidth=line_width, alpha=line_alpha) + + ax.plot(future[:, 0], + future[:, 1], + 'w--', + path_effects=[pe.Stroke(linewidth=edge_width, foreground='k'), pe.Normal()]) + + # Current Node Position + circle = plt.Circle((history[-1, 0], + history[-1, 1]), + node_circle_size, + facecolor='g', + edgecolor='k', + lw=circle_edge_width, + zorder=3) + ax.add_artist(circle) + + ax.axis('equal') + + +def visualize_prediction(ax, + prediction_output_dict, + dt, + max_hl, + ph, + robot_node=None, + map=None, + **kwargs): + + prediction_dict, histories_dict, futures_dict = prediction_output_to_trajectories(prediction_output_dict, + dt, + max_hl, + ph, + map=map) + + assert(len(prediction_dict.keys()) <= 1) + if len(prediction_dict.keys()) == 0: + return + ts_key = list(prediction_dict.keys())[0] + + prediction_dict = prediction_dict[ts_key] + histories_dict = histories_dict[ts_key] + futures_dict = futures_dict[ts_key] + + if map is not None: + ax.imshow(map.as_image(), origin='lower', alpha=0.5) + plot_trajectories(ax, prediction_dict, histories_dict, futures_dict, **kwargs) + + +def visualize_distribution(ax, + prediction_distribution_dict, + map=None, + pi_threshold=0.05, + **kwargs): + if map is not None: + ax.imshow(map.as_image(), origin='lower', alpha=0.5) + + for node, pred_dist in prediction_distribution_dict.items(): + if pred_dist.mus.shape[:2] != (1, 1): + return + + means = pred_dist.mus.squeeze().cpu().numpy() + covs = pred_dist.get_covariance_matrix().squeeze().cpu().numpy() + pis = pred_dist.pis_cat_dist.probs.squeeze().cpu().numpy() + + for timestep in range(means.shape[0]): + for z_val in range(means.shape[1]): + mean = means[timestep, z_val] + covar = covs[timestep, z_val] + pi = pis[timestep, z_val] + + if pi < pi_threshold: +
continue + + v, w = linalg.eigh(covar) + v = 2. * np.sqrt(2.) * np.sqrt(v) + u = w[0] / linalg.norm(w[0]) + + # Plot an ellipse to show the Gaussian component + angle = np.arctan(u[1] / u[0]) + angle = 180. * angle / np.pi # convert to degrees + ell = patches.Ellipse(mean, v[0], v[1], 180. + angle, color='blue' if node.type.name == 'VEHICLE' else 'orange') + ell.set_edgecolor(None) + ell.set_clip_box(ax.bbox) + ell.set_alpha(pi/10) + ax.add_artist(ell) diff --git a/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization_utils.py b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization_utils.py new file mode 100644 index 000000000..8ad700ee2 --- /dev/null +++ b/forge/test/models/pytorch/multimodal/trajectron/trajectron/visualization/visualization_utils.py @@ -0,0 +1,20 @@ +import numpy as np +import pandas as pd +import seaborn as sns + + +def plot_boxplots(ax, perf_dict_for_pd, x_label, y_label): + perf_df = pd.DataFrame.from_dict(perf_dict_for_pd) + our_mean_color = sns.color_palette("muted")[9] + marker_size = 7 + mean_markers = 'X' + with sns.color_palette("muted"): + sns.boxplot(x=x_label, y=y_label, data=perf_df, ax=ax, showfliers=False) + ax.plot([0], [np.mean(perf_df[y_label])], color=our_mean_color, marker=mean_markers, + markeredgecolor='#545454', markersize=marker_size, zorder=10) + + +def plot_barplots(ax, perf_dict_for_pd, x_label, y_label): + perf_df = pd.DataFrame.from_dict(perf_dict_for_pd) + with sns.color_palette("muted"): + sns.barplot(x=x_label, y=y_label, ax=ax, data=perf_df) \ No newline at end of file
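A quick sanity-check sketch for the matrix_utils helpers added in this diff. It is not part of the patch; it only assumes the trajectron directory is on sys.path (so that `utils` resolves to the package above, not some other module of the same name) and that torch is installed. The expected shape follows the block_diag docstring example.

import sys
sys.path.append("forge/test/models/pytorch/multimodal/trajectron/trajectron/")

import torch
from utils import block_diag, tile  # re-exported by utils/__init__.py above

# Four 3x2 blocks stacked along dim=-3 become a 12x8 block-diagonal matrix.
m = torch.ones(4, 3, 2)
assert block_diag(m).shape == torch.Size([12, 8])

# tile repeats each slice along `dim` n_tile times, keeping slices grouped together.
a = torch.tensor([[1, 2], [3, 4]])
print(tile(a, dim=0, n_tile=2))
# tensor([[1, 2],
#         [1, 2],
#         [3, 4],
#         [3, 4]])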
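Similarly, a minimal, hypothetical usage sketch for plot_boxplots from visualization_utils.py: the helper takes any DataFrame-convertible mapping whose keys include the x_label and y_label columns, draws a seaborn boxplot without fliers, and overlays the overall mean as an X marker. The column names, values, and output filename below are made up for illustration; the same sys.path assumption as above applies, plus matplotlib, seaborn, and scipy being installed.

import sys
sys.path.append("forge/test/models/pytorch/multimodal/trajectron/trajectron/")

import numpy as np
import matplotlib.pyplot as plt
from visualization import plot_boxplots  # re-exported by visualization/__init__.py above

# Made-up per-sample displacement errors for two labels, just to exercise the helper.
perf_dict = {
    "method": ["ours"] * 50 + ["baseline"] * 50,
    "ade": list(np.random.rand(50) * 0.4) + list(np.random.rand(50) * 0.7),
}

fig, ax = plt.subplots()
plot_boxplots(ax, perf_dict, x_label="method", y_label="ade")
fig.savefig("ade_boxplot.png", dpi=150)
plt.close(fig)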