diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 21588fe3..b9bc85df 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,8 +8,10 @@ jobs:
fail-fast: true
max-parallel: 15
matrix:
+ # os: [ubuntu-latest, macos-latest, windows-latest, macos-13-xlarge]
+ # For Apple Silicon: https://github.com/actions/runner-images/issues/8439
os: [ubuntu-latest, macos-latest, windows-latest]
- python-version: ['3.8', '3.9', '3.10']
+ python-version: ['3.8', '3.9', '3.10', '3.11']
defaults:
run:
shell: bash
diff --git a/configs/env/cvrp.yaml b/configs/env/cvrp.yaml
index 2b98bd92..e8598620 100644
--- a/configs/env/cvrp.yaml
+++ b/configs/env/cvrp.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.cvrp.CVRPEnv
+_target_: rl4co.envs.CVRPEnv
name: cvrp
num_loc: 20
diff --git a/configs/env/default.yaml b/configs/env/default.yaml
index a9ce9bf3..66fa95a6 100644
--- a/configs/env/default.yaml
+++ b/configs/env/default.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.tsp.TSPEnv
+_target_: rl4co.envs.TSPEnv
name: tsp
num_loc: 20
diff --git a/configs/env/dpp.yaml b/configs/env/dpp.yaml
index 0456187c..51a3c74b 100644
--- a/configs/env/dpp.yaml
+++ b/configs/env/dpp.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.dpp.DPPEnv
+_target_: rl4co.envs.DPPEnv
name: dpp
max_decaps: 20
diff --git a/configs/env/mdpp.yaml b/configs/env/mdpp.yaml
index 57194a58..df790181 100644
--- a/configs/env/mdpp.yaml
+++ b/configs/env/mdpp.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.mdpp.MDPPEnv
+_target_: rl4co.envs.MDPPEnv
name: mdpp
max_decaps: 20
diff --git a/configs/env/mtsp.yaml b/configs/env/mtsp.yaml
index 50cadba5..e24e0dca 100644
--- a/configs/env/mtsp.yaml
+++ b/configs/env/mtsp.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.mtsp.MTSPEnv
+_target_: rl4co.envs.MTSPEnv
name: mtsp
num_loc: 20
diff --git a/configs/env/op.yaml b/configs/env/op.yaml
index e71bcba1..08d8d86d 100644
--- a/configs/env/op.yaml
+++ b/configs/env/op.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.op.OPEnv
+_target_: rl4co.envs.OPEnv
name: op
num_loc: 20
diff --git a/configs/env/pctsp.yaml b/configs/env/pctsp.yaml
index 3e92aac1..a05fc1f7 100644
--- a/configs/env/pctsp.yaml
+++ b/configs/env/pctsp.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.pctsp.PCTSPEnv
+_target_: rl4co.envs.PCTSPEnv
name: pctsp
num_loc: 20
diff --git a/configs/env/pdp.yaml b/configs/env/pdp.yaml
index 71f12e01..ba5236a9 100644
--- a/configs/env/pdp.yaml
+++ b/configs/env/pdp.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.pdp.PDPEnv
+_target_: rl4co.envs.PDPEnv
name: pdp
num_loc: 20
diff --git a/configs/env/sdvrp.yaml b/configs/env/sdvrp.yaml
index cb5f81a0..6ecdd4ce 100644
--- a/configs/env/sdvrp.yaml
+++ b/configs/env/sdvrp.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.sdvrp.SDVRPEnv
+_target_: rl4co.envs.SDVRPEnv
name: sdvrp
num_loc: 20
diff --git a/configs/env/spctsp.yaml b/configs/env/spctsp.yaml
index ac1387e6..1a239237 100644
--- a/configs/env/spctsp.yaml
+++ b/configs/env/spctsp.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.spctsp.SPCTSPEnv
+_target_: rl4co.envs.SPCTSPEnv
name: spctsp
num_loc: 20
diff --git a/configs/env/tsp.yaml b/configs/env/tsp.yaml
index 6d3b87fc..e853a203 100644
--- a/configs/env/tsp.yaml
+++ b/configs/env/tsp.yaml
@@ -1,4 +1,4 @@
-_target_: rl4co.envs.tsp.TSPEnv
+_target_: rl4co.envs.TSPEnv
name: tsp
diff --git a/configs/experiment/base.yaml b/configs/experiment/base.yaml
index 4fae8d25..4cfe47d0 100644
--- a/configs/experiment/base.yaml
+++ b/configs/experiment/base.yaml
@@ -17,9 +17,6 @@ defaults:
# that are automatically generated with seed following Kool et al. (2019).
env:
num_loc: 50
- data_dir: ${paths.root_dir}/data/tsp
- val_file: tsp${env.num_loc}_val_seed4321.npz
- test_file: tsp${env.num_loc}_test_seed1234.npz
# Logging: we use Wandb in this case
logger:
diff --git a/configs/trainer/default.yaml b/configs/trainer/default.yaml
index 84df21f4..e3344212 100644
--- a/configs/trainer/default.yaml
+++ b/configs/trainer/default.yaml
@@ -4,16 +4,9 @@ _target_: rl4co.utils.trainer.RL4COTrainer
default_root_dir: ${paths.output_dir}
gradient_clip_val: 1.0
-accelerator: "gpu"
+accelerator: "auto"
precision: "16-mixed"
-# Fast distributed training: comment out to use on single GPU
-# devices: 1 # change number of devices
-strategy:
- _target_: lightning.pytorch.strategies.DDPStrategy
- find_unused_parameters: True
- gradient_as_bucket_view: True
-
# perform a validation loop every N training epochs
check_val_every_n_epoch: 1
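
The trainer default above switches to `accelerator: "auto"` and drops the hard-coded DDP strategy. Multi-GPU DDP can still be requested explicitly; a minimal sketch, assuming `RL4COTrainer` forwards the standard Lightning `Trainer` arguments used in this config (the device count is hypothetical):

from lightning.pytorch.strategies import DDPStrategy
from rl4co.utils.trainer import RL4COTrainer

# Assumption: RL4COTrainer accepts the usual Lightning Trainer kwargs shown in this config
trainer = RL4COTrainer(
    accelerator="gpu",
    devices=2,  # hypothetical number of GPUs
    strategy=DDPStrategy(find_unused_parameters=True, gradient_as_bucket_view=True),
    precision="16-mixed",
    gradient_clip_val=1.0,
)
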
diff --git a/docs/_theme/rl4co/extensions/lightning.py b/docs/_theme/rl4co/extensions/lightning.py
index d0633b41..8cba7fda 100644
--- a/docs/_theme/rl4co/extensions/lightning.py
+++ b/docs/_theme/rl4co/extensions/lightning.py
@@ -13,10 +13,7 @@
# limitations under the License.
from docutils import nodes
from docutils.statemachine import StringList
-from sphinx.util.docutils import SphinxDirective
-
from pt_lightning_sphinx_theme.extensions.pytorch_tutorials import (
- cardnode,
CustomCalloutItemDirective,
CustomCardItemDirective,
DisplayItemDirective,
@@ -24,7 +21,9 @@
ReactGreeter,
SlackButton,
TwoColumns,
+ cardnode,
)
+from sphinx.util.docutils import SphinxDirective
class tutoriallistnode(nodes.General, nodes.Element):
diff --git a/docs/_theme/rl4co/extensions/pytorch_tutorials.py b/docs/_theme/rl4co/extensions/pytorch_tutorials.py
index 97dbc6e6..12b0e73d 100644
--- a/docs/_theme/rl4co/extensions/pytorch_tutorials.py
+++ b/docs/_theme/rl4co/extensions/pytorch_tutorials.py
@@ -34,9 +34,8 @@
from docutils import nodes
from docutils.parsers.rst import Directive, directives
from docutils.statemachine import StringList
-from sphinx.util.docutils import SphinxDirective
-
from pt_lightning_sphinx_theme.extensions.react import get_react_component_rst
+from sphinx.util.docutils import SphinxDirective
try:
FileNotFoundError
@@ -272,11 +271,23 @@ def run(self):
image_class = ""
if "image_center" in self.options:
- image = ""
+ image = (
+ ""
+ )
image_class = "image-center"
elif "image_right" in self.options:
- image = ""
+ image = (
+ ""
+ )
image_class = "image-right"
else:
image = ""
@@ -371,7 +382,11 @@ def run(self):
raise
# return []
callout_rst = get_react_component_rst(
- "LikeButtonWithTitle", width=width, margin=margin, title=title, padding=padding
+ "LikeButtonWithTitle",
+ width=width,
+ margin=margin,
+ title=title,
+ padding=padding,
)
callout_list = StringList(callout_rst.split("\n"))
callout = nodes.paragraph()
@@ -427,7 +442,9 @@ def run(self):
print(e)
raise
return []
- callout_rst = SLACK_TEMPLATE.format(align=align, title=title, margin=margin, width=width)
+ callout_rst = SLACK_TEMPLATE.format(
+ align=align, title=title, margin=margin, width=width
+ )
callout_list = StringList(callout_rst.split("\n"))
callout = nodes.paragraph()
self.state.nested_parse(callout_list, self.content_offset, callout)
diff --git a/pyproject.toml b/pyproject.toml
index 8b4feee5..bf0fbcf0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,14 +23,14 @@ dynamic = ["version"]
license = {file = "LICENSE"}
-# TODO: allow new Python versions https://github.com/kaist-silab/rl4co/issues/95
-requires-python = ">=3.8, <3.11" # https://github.com/kaist-silab/rl4co/issues/90
+requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
"License :: OSI Approved :: Apache Software License",
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
@@ -41,10 +41,9 @@ classifiers = [
# TODO: allow new TorchRL / TensorDict versions https://github.com/kaist-silab/rl4co/issues/95
dependencies = [
- "torch>=2.0.0,<2.1.0", # Possibly TorchRL problem on Windows with older version
- "torchrl==0.1.1",
- "tensordict==0.1.1",
- "lightning>=2.0.5",
+ "torchrl>=0.2.0",
+ "tensordict>=0.2.0",
+ "lightning>=2.1.0",
"hydra-core",
"hydra-colorlog",
"omegaconf",
diff --git a/rl4co/__init__.py b/rl4co/__init__.py
index 4dc8ce10..95407eb1 100644
--- a/rl4co/__init__.py
+++ b/rl4co/__init__.py
@@ -1 +1 @@
-__version__ = "0.2.4.dev1"
+__version__ = "0.3.0dev0"
diff --git a/rl4co/data/dataset.py b/rl4co/data/dataset.py
index 5d22c603..afcf8bee 100644
--- a/rl4co/data/dataset.py
+++ b/rl4co/data/dataset.py
@@ -1,3 +1,4 @@
+
import torch
from tensordict.tensordict import TensorDict
@@ -5,9 +6,40 @@
class TensorDictDataset(Dataset):
+ """Dataset compatible with TensorDicts.
+ Uses more CPU and has loading performance similar to the list-comprehension-based
+ :class:`TensorDictDatasetList`, but is more than 10x faster to instantiate.
+ """
+
+ def __init__(self, td: TensorDict):
+ self.data = td
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitems__(self, index):
+ # Tricks:
+ # - batched data loading with `__getitems__` for faster loading
+ # - avoid directly indexing TensorDicts for faster loading
+ return TensorDict(
+ {key: item[index] for key, item in self.data.items()},
+ batch_size=torch.Size([len(index)]),
+ _run_checks=False, # faster this way
+ )
+
+ def add_key(self, key, value):
+ return self.data.update({key: value}) # native method
+
+
+def tensordict_collate_fn(x):
+ """Equivalent to collating with `lambda x: x`"""
+ return x
+
+
+class TensorDictDatasetList(Dataset):
"""Dataset compatible with TensorDicts.
It is better to "disassemble" the TensorDict into a list of dicts.
- See :class:`tensordict_collate_fn` for more details.
+ See :class:`tensordict_collate_fn_list` for more details.
Note:
Check out the issue on tensordict for more details:
@@ -16,21 +48,24 @@ class TensorDictDataset(Dataset):
but uses > 3x more CPU.
"""
- def __init__(self, data: TensorDict):
+ def __init__(self, td: TensorDict):
+ self.data_len = td.batch_size[0]
self.data = [
- {key: value[i] for key, value in data.items()} for i in range(data.shape[0])
+ {key: value[i] for key, value in td.items()} for i in range(self.data_len)
]
def __len__(self):
- return len(self.data)
+ return self.data_len
def __getitem__(self, idx):
return self.data[idx]
+ def add_key(self, key, value):
+ return ExtraKeyDataset(self, value, key_name=key)
-def tensordict_collate_fn(batch):
- """Collate function compatible with TensorDicts.
- Reassemble the list of dicts into a TensorDict; seems to be way more efficient than using a TensorDictDataset.
+
+def tensordict_collate_fn_list(batch):
+ """Collate function compatible with TensorDicts that reassembles a list of dicts.
Note:
Check out the issue on tensordict for more details:
@@ -40,24 +75,28 @@ def tensordict_collate_fn(batch):
"""
return TensorDict(
{key: torch.stack([b[key] for b in batch]) for key in batch[0].keys()},
- batch_size=len(batch),
+ batch_size=torch.Size([len(batch)]),
+ device=batch[0].device,
+ _run_checks=False,
)
-class ExtraKeyDataset(Dataset):
+class ExtraKeyDataset(TensorDictDatasetList):
"""Dataset that includes an extra key to add to the data dict.
This is useful for adding a REINFORCE baseline reward to the data dict.
+ Note that this is faster to instantiate than using list comprehension.
"""
- def __init__(self, dataset: TensorDictDataset, extra: torch.Tensor):
+ def __init__(
+ self, dataset: TensorDictDatasetList, extra: torch.Tensor, key_name="extra"
+ ):
+ self.data_len = len(dataset)
+ assert self.data_len == len(extra), "Data and extra must be same length"
self.data = dataset.data
self.extra = extra
- assert len(self.data) == len(self.extra), "Data and extra must be same length"
-
- def __len__(self):
- return len(self.data)
+ self.key_name = key_name
def __getitem__(self, idx):
data = self.data[idx]
- data["extra"] = self.extra[idx]
+ data[self.key_name] = self.extra[idx]
return data
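
The new `TensorDictDataset` keeps the data as a single TensorDict, serves batches through `__getitems__`, and pairs with the identity collate function, while `add_key` either updates the TensorDict natively or, for the list-based variant, wraps the dataset in `ExtraKeyDataset`. A minimal usage sketch; the field names, sizes, and the "extra" key mirror how the REINFORCE rollout baseline uses them, but the tensors are dummy values:

import torch
from tensordict.tensordict import TensorDict
from torch.utils.data import DataLoader

from rl4co.data.dataset import (
    TensorDictDataset,
    TensorDictDatasetList,
    tensordict_collate_fn,
)

# Hypothetical data: 128 instances with 20 2D locations each
td = TensorDict({"locs": torch.rand(128, 20, 2)}, batch_size=[128])
dataset = TensorDictDataset(td)

# `__getitems__` already returns a batched TensorDict, so collation is a no-op
loader = DataLoader(dataset, batch_size=32, collate_fn=tensordict_collate_fn)
for batch in loader:
    print(batch["locs"].shape)  # torch.Size([32, 20, 2])

# Attaching an extra key (e.g. a baseline reward): native update vs. wrapper
dataset.add_key("extra", torch.rand(128))                              # updates the TensorDict in place
wrapped = TensorDictDatasetList(td).add_key("extra", torch.rand(128))  # returns an ExtraKeyDataset
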
diff --git a/rl4co/envs/__init__.py b/rl4co/envs/__init__.py
index 26eeecb0..1b4fce9f 100644
--- a/rl4co/envs/__init__.py
+++ b/rl4co/envs/__init__.py
@@ -1,25 +1,31 @@
# Base environment
-# Main Environments
-from rl4co.envs.atsp import ATSPEnv
from rl4co.envs.common.base import RL4COEnvBase
-from rl4co.envs.cvrp import CVRPEnv
-from rl4co.envs.dpp import DPPEnv
-from rl4co.envs.ffsp import FFSPEnv
-from rl4co.envs.mdpp import MDPPEnv
-from rl4co.envs.mtsp import MTSPEnv
-from rl4co.envs.op import OPEnv
-from rl4co.envs.pctsp import PCTSPEnv
-from rl4co.envs.pdp import PDPEnv
-from rl4co.envs.sdvrp import SDVRPEnv
-from rl4co.envs.smtwtp import SMTWTPEnv
-from rl4co.envs.spctsp import SPCTSPEnv
-from rl4co.envs.tsp import TSPEnv
+
+# EDA
+from rl4co.envs.eda import DPPEnv, MDPPEnv
+
+# Routing
+from rl4co.envs.routing import (
+ ATSPEnv,
+ CVRPEnv,
+ MTSPEnv,
+ OPEnv,
+ PCTSPEnv,
+ PDPEnv,
+ SDVRPEnv,
+ SPCTSPEnv,
+ TSPEnv,
+)
+
+# Scheduling
+from rl4co.envs.scheduling import FFSPEnv, SMTWTPEnv
# Register environments
ENV_REGISTRY = {
"atsp": ATSPEnv,
"cvrp": CVRPEnv,
"dpp": DPPEnv,
+ "ffsp": FFSPEnv,
"mdpp": MDPPEnv,
"mtsp": MTSPEnv,
"op": OPEnv,
diff --git a/rl4co/envs/common/base.py b/rl4co/envs/common/base.py
index 1607d413..85eb52a1 100644
--- a/rl4co/envs/common/base.py
+++ b/rl4co/envs/common/base.py
@@ -1,5 +1,5 @@
from os.path import join as pjoin
-from typing import Optional, Iterable
+from typing import Iterable, Optional
import torch
@@ -40,6 +40,7 @@ def __init__(
val_dataloader_names: list = None,
test_dataloader_names: list = None,
check_solution: bool = True,
+ _torchrl_mode: bool = False, # TODO
seed: int = None,
device: str = "cpu",
**kwargs,
@@ -47,6 +48,7 @@ def __init__(
super().__init__(device=device, batch_size=[])
self.data_dir = data_dir
self.train_file = pjoin(data_dir, train_file) if train_file is not None else None
+ self._torchrl_mode = _torchrl_mode
def get_files(f):
if f is not None:
@@ -85,6 +87,41 @@ def get_multiple_dataloader_names(f, names):
seed = torch.empty((), dtype=torch.int64).random_().item()
self.set_seed(seed)
+ def step(self, td: TensorDict) -> TensorDict:
+ """Step function to call at each step of the episode containing an action.
+ If `_torchrl_mode` is True, we call `_torchrl_step` instead, which sets the
+ `next` key of the TensorDict to the next state - this is the usual way to do it in TorchRL,
+ but it is inefficient in our case.
+ """
+ if not self._torchrl_mode:
+ # Default: just return the TensorDict without further checks; this is faster
+ td = self._step(td)
+ return {"next": td}
+ else:
+ # TorchRL mode: write the next state under the `next` key, as TorchRL expects
+ return self._torchrl_step(td)
+
+ def _torchrl_step(self, td: TensorDict) -> TensorDict:
+ """See :meth:`super().step` for more details.
+ This is the usual way to do it in TorchRL, but it is inefficient in our case.
+
+ Note:
+ Here we clone the TensorDict to avoid a recursion error, since we allow
+ directly updating the TensorDict in the step function
+ """
+ # sanity check
+ self._assert_tensordict_shape(td)
+ next_preset = td.get("next", None)
+
+ next_tensordict = self._step(
+ td.clone()
+ ) # NOTE: we clone to avoid recursion error
+ next_tensordict = self._step_proc_data(next_tensordict)
+ if next_preset is not None:
+ next_tensordict.update(next_preset.exclude(*next_tensordict.keys(True, True)))
+ td.set("next", next_tensordict)
+ return td
+
def _step(self, td: TensorDict) -> TensorDict:
"""Step function to call at each step of the episode containing an action.
Gives the next observation, reward, done
@@ -178,6 +215,13 @@ def _set_seed(self, seed: Optional[int]):
rng = torch.manual_seed(seed)
self.rng = rng
+ def to(self, device):
+ """Override `to` device method for safety against `None` device (may be found in `TensorDict`)"""
+ if device is None:
+ return self
+ else:
+ return super().to(device)
+
def __getstate__(self):
"""Return the state of the environment. By default, we want to avoid pickling
the random number generator directly as it is not allowed by `deepcopy`
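
With the in-place `_step` implementations, the base `step` above simply wraps the updated TensorDict as `{"next": td}` by default, and only uses the full TorchRL convention (clone plus `td.set("next", ...)`) when `_torchrl_mode=True`. A minimal sketch of the default contract; the random feasible action is illustrative only:

import torch
from rl4co.envs import TSPEnv

env = TSPEnv(num_loc=10)  # default: _torchrl_mode=False
td = env.reset(batch_size=[4])
td["action"] = td["action_mask"].float().multinomial(1).squeeze(-1)  # any feasible node
td = env.step(td)["next"]  # default mode: plain dict holding the updated TensorDict
assert td["done"].shape == (4,)
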
diff --git a/rl4co/envs/eda/__init__.py b/rl4co/envs/eda/__init__.py
new file mode 100644
index 00000000..da7f45e2
--- /dev/null
+++ b/rl4co/envs/eda/__init__.py
@@ -0,0 +1,2 @@
+from rl4co.envs.eda.dpp import DPPEnv
+from rl4co.envs.eda.mdpp import MDPPEnv
diff --git a/rl4co/envs/dpp.py b/rl4co/envs/eda/dpp.py
similarity index 95%
rename from rl4co/envs/dpp.py
rename to rl4co/envs/eda/dpp.py
index 8ac8a857..fe88572c 100644
--- a/rl4co/envs/dpp.py
+++ b/rl4co/envs/eda/dpp.py
@@ -100,30 +100,25 @@ def _step(self, td: TensorDict) -> TensorDict:
# Set done if i is greater than max_decaps
done = td["i"] >= self.max_decaps - 1
- # Calculate reward (we set to -inf since we calculate the reward outside based on the actions)
- reward = torch.ones_like(done) * float("-inf")
+ # The reward is calculated outside via get_reward for efficiency, so we set it to 0 here
+ reward = torch.zeros_like(done)
- # The output must be written in a ``"next"`` entry
- return TensorDict(
+ td.update(
{
- "next": {
- "locs": td["locs"],
- "probe": td["probe"],
- "i": td["i"] + 1,
- "action_mask": available,
- "keepout": td["keepout"],
- "reward": reward,
- "done": done,
- }
- },
- td.shape,
+ "i": td["i"] + 1,
+ "action_mask": available,
+ "reward": reward,
+ "done": done,
+ }
)
+ return td
def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict:
# Initialize locations
if batch_size is None:
batch_size = self.batch_size if td is None else td.batch_size
- self.device = td.device if td is not None else self.device
+ device = td.device if td is not None else self.device
+ self.to(device)
# We allow loading the initial observation from a dataset for faster loading
if td is None:
@@ -170,7 +165,6 @@ def _make_spec(self, td_params):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
@@ -297,7 +291,7 @@ def _initial_impedance(self, probe):
return zout
def _decap_simulator(self, probe, solution, keepout=None):
- self.device = solution.device
+ self.to(self.device)
probe = probe.item()
diff --git a/rl4co/envs/mdpp.py b/rl4co/envs/eda/mdpp.py
similarity index 98%
rename from rl4co/envs/mdpp.py
rename to rl4co/envs/eda/mdpp.py
index 633cd5e9..25948fe0 100644
--- a/rl4co/envs/mdpp.py
+++ b/rl4co/envs/eda/mdpp.py
@@ -11,7 +11,7 @@
UnboundedDiscreteTensorSpec,
)
-from rl4co.envs.dpp import DPPEnv
+from rl4co.envs.eda.dpp import DPPEnv
from rl4co.utils.pylogger import get_pylogger
log = get_pylogger(__name__)
@@ -64,8 +64,12 @@ def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict
# Action mask is 0 if both action_mask (e.g. keepout) and probe are 0
action_mask = torch.logical_and(td_reset["action_mask"], ~td_reset["probe"])
# Keepout regions are the inverse of action_mask
- td_reset.set_("keepout", ~td_reset["action_mask"])
- td_reset.set_("action_mask", action_mask)
+ td_reset.update(
+ {
+ "keepout": ~td_reset["action_mask"],
+ "action_mask": action_mask,
+ }
+ )
return td_reset
def _make_spec(self, td_params):
@@ -95,7 +99,6 @@ def _make_spec(self, td_params):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/routing/__init__.py b/rl4co/envs/routing/__init__.py
new file mode 100644
index 00000000..d7b815d5
--- /dev/null
+++ b/rl4co/envs/routing/__init__.py
@@ -0,0 +1,9 @@
+from rl4co.envs.routing.atsp import ATSPEnv
+from rl4co.envs.routing.cvrp import CVRPEnv
+from rl4co.envs.routing.mtsp import MTSPEnv
+from rl4co.envs.routing.op import OPEnv
+from rl4co.envs.routing.pctsp import PCTSPEnv
+from rl4co.envs.routing.pdp import PDPEnv
+from rl4co.envs.routing.sdvrp import SDVRPEnv
+from rl4co.envs.routing.spctsp import SPCTSPEnv
+from rl4co.envs.routing.tsp import TSPEnv
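
The environments are regrouped into `routing`, `eda`, and `scheduling` subpackages and re-exported from `rl4co.envs`, which is why the configs above switch `_target_` from e.g. `rl4co.envs.tsp.TSPEnv` to `rl4co.envs.TSPEnv`. A small sketch of the two access paths (`num_loc=20` is just the value used in the configs):

from rl4co.envs import ENV_REGISTRY, TSPEnv

env = TSPEnv(num_loc=20)       # flat import, matching the new `_target_` paths
env_cls = ENV_REGISTRY["tsp"]  # or look the class up by its registered name
assert env_cls is TSPEnv
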
diff --git a/rl4co/envs/atsp.py b/rl4co/envs/routing/atsp.py
similarity index 90%
rename from rl4co/envs/atsp.py
rename to rl4co/envs/routing/atsp.py
index fe451db9..dcfb720e 100644
--- a/rl4co/envs/atsp.py
+++ b/rl4co/envs/routing/atsp.py
@@ -20,7 +20,7 @@
class ATSPEnv(RL4COEnvBase):
"""
Asymmetric Traveling Salesman Problem environment
- At each step, the agent chooses a city to visit. The reward is the -infinite unless the agent visits all the cities.
+ At each step, the agent chooses a city to visit. The reward is 0 unless the agent visits all the cities.
In that case, the reward is (-)length of the path: maximizing the reward is equivalent to minimizing the path length.
Unlike the TSP, the distance matrix is asymmetric, i.e., the distance from A to B is not necessarily the same as the distance from B to A.
@@ -62,24 +62,20 @@ def _step(td: TensorDict) -> TensorDict:
# We are done there are no unvisited locations
done = torch.count_nonzero(available, dim=-1) <= 0
- # The reward is calculated outside via get_reward for efficiency, so we set it to -inf here
- reward = torch.ones_like(done) * float("-inf")
+ # The reward is calculated outside via get_reward for efficiency, so we set it to 0 here
+ reward = torch.zeros_like(done)
- # The output must be written in a ``"next"`` entry
- return TensorDict(
+ td.update(
{
- "next": {
- "cost_matrix": td["cost_matrix"],
- "first_node": first_node,
- "current_node": current_node,
- "i": td["i"] + 1,
- "action_mask": available,
- "reward": reward,
- "done": done,
- }
+ "first_node": first_node,
+ "current_node": current_node,
+ "i": td["i"] + 1,
+ "action_mask": available,
+ "reward": reward,
+ "done": done,
},
- td.shape,
)
+ return td
def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict:
# Initialize distance matrix
@@ -90,9 +86,8 @@ def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict
batch_size = (
self.batch_size if cost_matrix is None else cost_matrix.shape[:-2]
)
- self.device = device = (
- cost_matrix.device if cost_matrix is not None else self.device
- )
+ device = cost_matrix.device if cost_matrix is not None else self.device
+ self.to(device)
if cost_matrix is None:
cost_matrix = self.generate_data(batch_size=batch_size).to(device)[
"cost_matrix"
@@ -142,7 +137,6 @@ def _make_spec(self, td_params: TensorDict = None):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/cvrp.py b/rl4co/envs/routing/cvrp.py
similarity index 92%
rename from rl4co/envs/cvrp.py
rename to rl4co/envs/routing/cvrp.py
index 2cde5f51..edd79d38 100644
--- a/rl4co/envs/cvrp.py
+++ b/rl4co/envs/routing/cvrp.py
@@ -41,7 +41,7 @@ class CVRPEnv(RL4COEnvBase):
"""Capacitated Vehicle Routing Problem (CVRP) environment.
At each step, the agent chooses a customer to visit depending on the current location and the remaining capacity.
When the agent visits a customer, the remaining capacity is updated. If the remaining capacity is not enough to
- visit any customer, the agent must go back to the depot. The reward is the -infinite unless the agent visits all the cities.
+ visit any customer, the agent must go back to the depot. The reward is 0 unless the agent visits all the cities.
In that case, the reward is (-)length of the path: maximizing the reward is equivalent to minimizing the path length.
Args:
@@ -103,25 +103,19 @@ def _step(self, td: TensorDict) -> TensorDict:
# SECTION: get done
done = visited.sum(-1) == visited.size(-1)
- reward = torch.ones_like(done) * float("-inf")
+ reward = torch.zeros_like(done)
- td_step = TensorDict(
+ td.update(
{
- "next": {
- "locs": td["locs"],
- "demand": td["demand"],
- "current_node": current_node,
- "used_capacity": used_capacity,
- "vehicle_capacity": td["vehicle_capacity"],
- "visited": visited,
- "reward": reward,
- "done": done,
- }
- },
- td.shape,
+ "current_node": current_node,
+ "used_capacity": used_capacity,
+ "visited": visited,
+ "reward": reward,
+ "done": done,
+ }
)
- td_step["next"].set("action_mask", self.get_action_mask(td_step["next"]))
- return td_step
+ td.set("action_mask", self.get_action_mask(td))
+ return td
def _reset(
self,
@@ -134,7 +128,7 @@ def _reset(
td = self.generate_data(batch_size=batch_size)
batch_size = [batch_size] if isinstance(batch_size, int) else batch_size
- self.device = td.device
+ self.to(td.device)
# Create reset TensorDict
td_reset = TensorDict(
@@ -226,15 +220,11 @@ def generate_data(self, batch_size) -> TensorDict:
# Demand sampling Following Kool et al. (2019)
# Generates a slightly different distribution than using torch.randint
demand = (
- (
- torch.FloatTensor(*batch_size, self.num_loc)
- .uniform_(self.min_demand - 1, self.max_demand - 1)
- .int()
- + 1
- )
- .float()
- .to(self.device)
- )
+ torch.FloatTensor(*batch_size, self.num_loc, device=self.device)
+ .uniform_(self.min_demand - 1, self.max_demand - 1)
+ .int()
+ + 1
+ ).float()
# Support for heterogeneous capacity if provided
if not isinstance(self.capacity, torch.Tensor):
@@ -250,6 +240,7 @@ def generate_data(self, batch_size) -> TensorDict:
"capacity": capacity,
},
batch_size=batch_size,
+ device=self.device,
)
@staticmethod
@@ -258,7 +249,7 @@ def load_data(fpath, batch_size=[]):
Normalize demand by capacity to be in [0, 1]
"""
td_load = load_npz_to_tensordict(fpath)
- td_load.set_("demand", td_load["demand"] / td_load["capacity"][:, None])
+ td_load.set("demand", td_load["demand"] / td_load["capacity"][:, None])
return td_load
def _make_spec(self, td_params: TensorDict):
@@ -286,7 +277,6 @@ def _make_spec(self, td_params: TensorDict):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/mpdp.py b/rl4co/envs/routing/mpdp.py
similarity index 93%
rename from rl4co/envs/mpdp.py
rename to rl4co/envs/routing/mpdp.py
index 77e6c7cc..1a83eeb0 100644
--- a/rl4co/envs/mpdp.py
+++ b/rl4co/envs/routing/mpdp.py
@@ -22,7 +22,7 @@ class MPDPEnv(RL4COEnvBase):
The goal is to pick up and deliver all the packages while satisfying the precedence constraints.
When an agent goes back to the depot, a new agent is spawned. In the min-max version, the goal is to minimize the
maximum tour length among all agents.
- The reward is the -infinite unless the agent visits all the cities.
+ The reward is 0 unless the agent visits all the cities.
In that case, the reward is (-)length of the path: maximizing the reward is equivalent to minimizing the path length.
Args:
@@ -111,36 +111,25 @@ def _step(self, td: TensorDict) -> TensorDict:
# Get done and reward
done = visited.all(dim=-1, keepdim=True).squeeze(-1)
- reward = torch.ones_like(done) * float(
- "-inf"
- ) # reward calculated via `get_reward` for now
+ reward = torch.zeros_like(done)
- td_step = TensorDict(
+ td.update(
{
- "next": {
- "locs": td["locs"],
- "visited": visited,
- "lengths": td["lengths"],
- "count_depot": td["count_depot"],
- "agent_idx": agent_idx,
- "cur_coord": cur_coord,
- "to_delivery": to_delivery,
- "left_request": td["left_request"],
- "depot_distance": depot_distance,
- "remain_sum_paired_distance": remain_sum_paired_distance,
- "remain_pickup_max_distance": remain_pickup_max_distance,
- "remain_delivery_max_distance": remain_delivery_max_distance,
- "add_pd_distance": td["add_pd_distance"],
- "longest_lengths": td["longest_lengths"],
- "i": td["i"] + 1,
- "done": done,
- "reward": reward,
- }
- },
- td.shape,
+ "visited": visited,
+ "agent_idx": agent_idx,
+ "cur_coord": cur_coord,
+ "to_delivery": to_delivery,
+ "depot_distance": depot_distance,
+ "remain_sum_paired_distance": remain_sum_paired_distance,
+ "remain_pickup_max_distance": remain_pickup_max_distance,
+ "remain_delivery_max_distance": remain_delivery_max_distance,
+ "i": td["i"] + 1,
+ "done": done,
+ "reward": reward,
+ }
)
- td_step["next"].set("action_mask", self.get_action_mask(td_step["next"]))
- return td_step
+ td.set("action_mask", self.get_action_mask(td))
+ return td
def _reset(
self,
@@ -154,7 +143,7 @@ def _reset(
if td is None or td.is_empty():
td = self.generate_data(batch_size=batch_size)
- self.device = td.device
+ self.to(td.device)
# NOTE: this is a hack to get the agent_num
# agent_num = td["agent_num"][0].item() if agent_num is None else agent_num
@@ -425,7 +414,6 @@ def _make_spec(self, td_params: TensorDict):
dtype=torch.int64,
),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/mtsp.py b/rl4co/envs/routing/mtsp.py
similarity index 94%
rename from rl4co/envs/mtsp.py
rename to rl4co/envs/routing/mtsp.py
index e85fa624..7b835589 100644
--- a/rl4co/envs/mtsp.py
+++ b/rl4co/envs/routing/mtsp.py
@@ -112,26 +112,22 @@ def _step(td: TensorDict) -> TensorDict:
# The reward is the negative of the max_subtour_length (minmax objective)
reward = -max_subtour_length
- # The output must be written in a ``"next"`` entry
- return TensorDict(
+ td.update(
{
- "next": {
- "locs": td["locs"],
- "num_agents": td["num_agents"],
- "max_subtour_length": max_subtour_length,
- "current_length": current_length,
- "agent_idx": cur_agent_idx,
- "first_node": first_node,
- "current_node": current_node,
- "i": td["i"] + 1,
- "action_mask": available,
- "reward": reward,
- "done": done,
- }
- },
- td.shape,
+ "max_subtour_length": max_subtour_length,
+ "current_length": current_length,
+ "agent_idx": cur_agent_idx,
+ "first_node": first_node,
+ "current_node": current_node,
+ "i": td["i"] + 1,
+ "action_mask": available,
+ "reward": reward,
+ "done": done,
+ }
)
+ return td
+
def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict:
# Initialize data
if batch_size is None:
@@ -214,7 +210,6 @@ def _make_spec(self, td_params: TensorDict):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/op.py b/rl4co/envs/routing/op.py
similarity index 94%
rename from rl4co/envs/op.py
rename to rl4co/envs/routing/op.py
index 42d00a9b..70f06263 100644
--- a/rl4co/envs/op.py
+++ b/rl4co/envs/routing/op.py
@@ -90,29 +90,23 @@ def _step(self, td: TensorDict) -> TensorDict:
# Done if went back to depot (except if it's the first step, since we start at the depot)
done = (current_node.squeeze(-1) == 0) & (td["i"] > 0)
- # The reward is calculated outside via get_reward for efficiency, so we set it to -inf here
- reward = torch.ones_like(done) * float("-inf")
+ # The reward is calculated outside via get_reward for efficiency, so we set it to 0 here
+ reward = torch.zeros_like(done)
- td_step = TensorDict(
+ td.update(
{
- "next": {
- "locs": td["locs"],
- "prize": td["prize"],
- "tour_length": tour_length,
- "current_loc": current_loc,
- "max_length": td["max_length"],
- "current_node": current_node,
- "visited": visited,
- "current_total_prize": current_total_prize,
- "i": td["i"] + 1,
- "reward": reward,
- "done": done,
- }
- },
- td.shape,
+ "tour_length": tour_length,
+ "current_loc": current_loc,
+ "current_node": current_node,
+ "visited": visited,
+ "current_total_prize": current_total_prize,
+ "i": td["i"] + 1,
+ "reward": reward,
+ "done": done,
+ }
)
- td_step["next"].set("action_mask", self.get_action_mask(td_step["next"]))
- return td_step
+ td.set("action_mask", self.get_action_mask(td))
+ return td
def _reset(
self,
@@ -124,8 +118,7 @@ def _reset(
batch_size = self.batch_size if td is None else td["locs"].shape[:-2]
if td is None or td.is_empty():
td = self.generate_data(batch_size=batch_size)
- self.device = td.device
-
+ self.to(td.device)
# Add depot to locs
locs_with_depot = torch.cat((td["depot"][:, None, :], td["locs"]), -2)
@@ -322,7 +315,6 @@ def _make_spec(self, td_params: TensorDict):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/pctsp.py b/rl4co/envs/routing/pctsp.py
similarity index 93%
rename from rl4co/envs/pctsp.py
rename to rl4co/envs/routing/pctsp.py
index b4c3204f..ca0f4863 100644
--- a/rl4co/envs/pctsp.py
+++ b/rl4co/envs/routing/pctsp.py
@@ -78,31 +78,26 @@ def _step(self, td: TensorDict) -> TensorDict:
# Update visited
visited = td["visited"].scatter(-1, current_node[..., None], 1)
- # Done and reward. Calculation is done outside hence set -inf
+ # Done and reward
done = (td["i"] > 0) & (current_node == 0)
- reward = torch.ones_like(cur_total_prize) * float("-inf")
- td_step = TensorDict(
+ # The reward is calculated outside via get_reward for efficiency, so we set it to 0 here
+ reward = torch.zeros_like(done)
+
+ # Update state
+ td.update(
{
- "next": {
- "locs": td["locs"],
- "current_node": current_node,
- "expected_prize": td["expected_prize"],
- "real_prize": td["real_prize"],
- "penalty": td["penalty"],
- "cur_total_prize": cur_total_prize,
- "cur_total_penalty": cur_total_penalty,
- "visited": visited,
- "prize_required": td["prize_required"],
- "i": td["i"] + 1,
- "reward": reward,
- "done": done,
- },
- },
- batch_size=td.batch_size,
+ "current_node": current_node,
+ "cur_total_prize": cur_total_prize,
+ "cur_total_penalty": cur_total_penalty,
+ "visited": visited,
+ "i": td["i"] + 1,
+ "reward": reward,
+ "done": done,
+ }
)
- td_step["next"].set("action_mask", self.get_action_mask(td_step["next"]))
- return td_step
+ td.set("action_mask", self.get_action_mask(td))
+ return td
def _reset(
self, td: Optional[TensorDict] = None, batch_size: Optional[list] = None
@@ -111,7 +106,7 @@ def _reset(
batch_size = self.batch_size if td is None else td["locs"].shape[:-2]
if td is None or td.is_empty():
td = self.generate_data(batch_size=batch_size)
- self.device = td.device
+ self.to(td.device)
locs = torch.cat([td["depot"][..., None, :], td["locs"]], dim=-2)
expected_prize = td["deterministic_prize"]
@@ -323,7 +318,6 @@ def _make_spec(self, td_params: TensorDict):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/pdp.py b/rl4co/envs/routing/pdp.py
similarity index 93%
rename from rl4co/envs/pdp.py
rename to rl4co/envs/routing/pdp.py
index 7d6a309d..b845e093 100644
--- a/rl4co/envs/pdp.py
+++ b/rl4co/envs/routing/pdp.py
@@ -71,25 +71,22 @@ def _step(td: TensorDict) -> TensorDict:
# We are done there are no unvisited locations
done = torch.count_nonzero(available, dim=-1) == 0
- # The reward is calculated outside via get_reward for efficiency, so we set it to -inf here
- reward = torch.ones_like(done) * float("-inf")
+ # The reward is calculated outside via get_reward for efficiency, so we set it to 0 here
+ reward = torch.zeros_like(done)
- # The output must be written in a ``"next"`` entry
- return TensorDict(
+ # Update step
+ td.update(
{
- "next": {
- "locs": td["locs"],
- "current_node": current_node,
- "available": available,
- "to_deliver": to_deliver,
- "i": td["i"] + 1,
- "action_mask": action_mask,
- "reward": reward,
- "done": done,
- }
- },
- td.shape,
+ "current_node": current_node,
+ "available": available,
+ "to_deliver": to_deliver,
+ "i": td["i"] + 1,
+ "action_mask": action_mask,
+ "reward": reward,
+ "done": done,
+ }
)
+ return td
def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict:
if batch_size is None:
@@ -98,7 +95,7 @@ def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict
if td is None or td.is_empty():
td = self.generate_data(batch_size=batch_size)
- self.device = td.device
+ self.to(td.device)
locs = torch.cat((td["depot"][:, None, :], td["locs"]), -2)
@@ -170,7 +167,6 @@ def _make_spec(self, td_params: TensorDict):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/sdvrp.py b/rl4co/envs/routing/sdvrp.py
similarity index 89%
rename from rl4co/envs/sdvrp.py
rename to rl4co/envs/routing/sdvrp.py
index 30d582ee..1fbf182d 100644
--- a/rl4co/envs/sdvrp.py
+++ b/rl4co/envs/routing/sdvrp.py
@@ -10,10 +10,11 @@
UnboundedDiscreteTensorSpec,
)
-from rl4co.envs.cvrp import CVRPEnv
from rl4co.utils.ops import gather_by_index
from rl4co.utils.pylogger import get_pylogger
+from .cvrp import CVRPEnv
+
log = get_pylogger(__name__)
@@ -84,27 +85,24 @@ def _step(self, td: TensorDict) -> TensorDict:
-1, current_node, -delivered_demand
)
- # Get done and reward (-inf since we get it outside)
+ # Get done
done = ~(demand_with_depot > 0).any(-1)
- reward = torch.ones_like(done) * float("-inf")
- td_step = TensorDict(
+ # The reward is calculated outside via get_reward for efficiency, so we set it to 0 here
+ reward = torch.zeros_like(done)
+
+ # Update state
+ td.update(
{
- "next": {
- "locs": td["locs"],
- "demand": td["demand"],
- "demand_with_depot": demand_with_depot,
- "current_node": current_node,
- "used_capacity": used_capacity,
- "vehicle_capacity": td["vehicle_capacity"],
- "reward": reward,
- "done": done,
- }
- },
- td.shape,
+ "demand_with_depot": demand_with_depot,
+ "current_node": current_node,
+ "used_capacity": used_capacity,
+ "reward": reward,
+ "done": done,
+ }
)
- td_step["next"].set("action_mask", self.get_action_mask(td_step["next"]))
- return td_step
+ td.set("action_mask", self.get_action_mask(td))
+ return td
def _reset(
self,
@@ -117,7 +115,7 @@ def _reset(
if td is None or td.is_empty():
td = self.generate_data(batch_size=batch_size)
- self.device = td["locs"].device
+ self.to(td.device)
# Create reset TensorDict
reset_td = TensorDict(
@@ -211,7 +209,6 @@ def _make_spec(self, td_params: TensorDict):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/spctsp.py b/rl4co/envs/routing/spctsp.py
similarity index 95%
rename from rl4co/envs/spctsp.py
rename to rl4co/envs/routing/spctsp.py
index e6bfb757..e39a31b1 100644
--- a/rl4co/envs/spctsp.py
+++ b/rl4co/envs/routing/spctsp.py
@@ -1,6 +1,7 @@
-from rl4co.envs.pctsp import PCTSPEnv
from rl4co.utils.pylogger import get_pylogger
+from .pctsp import PCTSPEnv
+
log = get_pylogger(__name__)
diff --git a/rl4co/envs/tsp.py b/rl4co/envs/routing/tsp.py
similarity index 86%
rename from rl4co/envs/tsp.py
rename to rl4co/envs/routing/tsp.py
index 3d7d53a6..7db63541 100644
--- a/rl4co/envs/tsp.py
+++ b/rl4co/envs/routing/tsp.py
@@ -11,7 +11,6 @@
)
from rl4co.envs.common.base import RL4COEnvBase
-from rl4co.envs.common.utils import batch_to_scalar
from rl4co.utils.ops import gather_by_index, get_tour_length
from rl4co.utils.pylogger import get_pylogger
@@ -21,7 +20,7 @@
class TSPEnv(RL4COEnvBase):
"""
Traveling Salesman Problem environment
- At each step, the agent chooses a city to visit. The reward is the -infinite unless the agent visits all the cities.
+ At each step, the agent chooses a city to visit. The reward is 0 unless the agent visits all the cities.
In that case, the reward is (-)length of the path: maximizing the reward is equivalent to minimizing the path length.
Args:
@@ -50,41 +49,38 @@ def __init__(
@staticmethod
def _step(td: TensorDict) -> TensorDict:
current_node = td["action"]
- first_node = current_node if batch_to_scalar(td["i"]) == 0 else td["first_node"]
+ first_node = current_node if td["i"].all() == 0 else td["first_node"]
- # Set not visited to 0 (i.e., we visited the node)
+ # Set not visited to 0 (i.e., we visited the node)
available = td["action_mask"].scatter(
-1, current_node.unsqueeze(-1).expand_as(td["action_mask"]), 0
)
# We are done there are no unvisited locations
- done = torch.count_nonzero(available, dim=-1) <= 0
+ done = torch.sum(available, dim=-1) == 0
- # The reward is calculated outside via get_reward for efficiency, so we set it to -inf here
- reward = torch.ones_like(done) * float("-inf")
+ # The reward is calculated outside via get_reward for efficiency, so we set it to 0 here
+ reward = torch.zeros_like(done)
- # The output must be written in a ``"next"`` entry
- return TensorDict(
+ td.update(
{
- "next": {
- "locs": td["locs"],
- "first_node": first_node,
- "current_node": current_node,
- "i": td["i"] + 1,
- "action_mask": available,
- "reward": reward,
- "done": done,
- }
+ "first_node": first_node,
+ "current_node": current_node,
+ "i": td["i"] + 1,
+ "action_mask": available,
+ "reward": reward,
+ "done": done,
},
- td.shape,
)
+ return td
def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict:
# Initialize locations
init_locs = td["locs"] if td is not None else None
if batch_size is None:
batch_size = self.batch_size if init_locs is None else init_locs.shape[:-2]
- self.device = device = init_locs.device if init_locs is not None else self.device
+ device = init_locs.device if init_locs is not None else self.device
+ self.to(device)
if init_locs is None:
init_locs = self.generate_data(batch_size=batch_size).to(device)["locs"]
batch_size = [batch_size] if isinstance(batch_size, int) else batch_size
@@ -106,6 +102,7 @@ def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict
"current_node": current_node,
"i": i,
"action_mask": available,
+ "reward": torch.zeros((*batch_size, 1), dtype=torch.float32),
},
batch_size=batch_size,
)
@@ -137,7 +134,6 @@ def _make_spec(self, td_params):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
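
Because the per-step reward is now left at 0 and the tour cost is computed afterwards via `get_reward`, a rollout collects the actions and evaluates them at the end. A hedged sketch with random but feasible action selection (batch size and `num_loc` are arbitrary):

import torch
from rl4co.envs import TSPEnv

env = TSPEnv(num_loc=10)
td = env.reset(batch_size=[2])
actions = []
for _ in range(env.num_loc):  # a TSP tour visits every node exactly once
    td["action"] = td["action_mask"].float().multinomial(1).squeeze(-1)
    actions.append(td["action"])
    td = env.step(td)["next"]
reward = env.get_reward(td, torch.stack(actions, dim=1))  # negative tour length, shape [2]
print(reward)
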
diff --git a/rl4co/envs/scheduling/__init__.py b/rl4co/envs/scheduling/__init__.py
new file mode 100644
index 00000000..a9c5144b
--- /dev/null
+++ b/rl4co/envs/scheduling/__init__.py
@@ -0,0 +1,2 @@
+from rl4co.envs.scheduling.ffsp import FFSPEnv
+from rl4co.envs.scheduling.smtwtp import SMTWTPEnv
diff --git a/rl4co/envs/ffsp.py b/rl4co/envs/scheduling/ffsp.py
similarity index 92%
rename from rl4co/envs/ffsp.py
rename to rl4co/envs/scheduling/ffsp.py
index ff97d5d7..fb1dd5fe 100644
--- a/rl4co/envs/ffsp.py
+++ b/rl4co/envs/scheduling/ffsp.py
@@ -139,30 +139,25 @@ def _step(self, td: TensorDict) -> TensorDict:
reward = td["reward"]
- return TensorDict(
+ # Update state
+ td.update(
{
- "next": {
- "stage_table": td["stage_table"],
- "machine_table": td["machine_table"],
- "time_idx": time_idx,
- "sub_time_idx": sub_time_idx,
- "batch_idx": batch_idx,
- "machine_idx": machine_idx,
- "schedule": schedule,
- "machine_wait_step": machine_wait_step,
- "job_location": job_location,
- "job_wait_step": job_wait_step,
- "job_duration": td["job_duration"],
- "reward": reward,
- "finish": finish,
- # Update variables
- "job_mask": job_mask,
- "stage_idx": stage_idx,
- "stage_machine_idx": stage_machine_idx,
- }
- },
- td.shape,
+ "time_idx": time_idx,
+ "sub_time_idx": sub_time_idx,
+ "batch_idx": batch_idx,
+ "machine_idx": machine_idx,
+ "schedule": schedule,
+ "machine_wait_step": machine_wait_step,
+ "job_location": job_location,
+ "job_wait_step": job_wait_step,
+ "reward": reward,
+ "finish": finish,
+ "job_mask": job_mask,
+ "stage_idx": stage_idx,
+ "stage_machine_idx": stage_machine_idx,
+ }
)
+ return td
def _reset(
self, td: Optional[TensorDict] = None, batch_size: Optional[list] = None
@@ -321,7 +316,6 @@ def _make_spec(self, td_params: TensorDict):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/envs/smtwtp.py b/rl4co/envs/scheduling/smtwtp.py
similarity index 90%
rename from rl4co/envs/smtwtp.py
rename to rl4co/envs/scheduling/smtwtp.py
index 41f5e5af..d65d8d07 100644
--- a/rl4co/envs/smtwtp.py
+++ b/rl4co/envs/scheduling/smtwtp.py
@@ -22,7 +22,7 @@ class SMTWTPEnv(RL4COEnvBase):
SMTWTP is a scheduling problem in which a set of jobs must be processed on a single machine.
Each job i has a processing time, a weight, and a due date. The objective is to minimize the sum of the weighted tardiness of all jobs,
where the weighted tardiness of a job is defined as the product of its weight and the duration by which its completion time exceeds its due date.
- At each step, the agent chooses a job to process. The reward is the -infinite unless the agent processes all the jobs.
+ At each step, the agent chooses a job to process. The reward is 0 unless the agent processes all the jobs.
In that case, the reward is (-)objective value of the processing order: maximizing the reward is equivalent to minimizing the objective.
Args:
@@ -80,25 +80,19 @@ def _step(td: TensorDict) -> TensorDict:
# We are done there are no unvisited locations
done = torch.count_nonzero(available, dim=-1) <= 0
- # The reward is calculated outside via get_reward for efficiency, so we set it to -inf here
- reward = torch.ones_like(done) * float("-inf")
+ # The reward is calculated outside via get_reward for efficiency, so we set it to 0 here
+ reward = torch.zeros_like(done)
- # The output must be written in a ``"next"`` entry
- return TensorDict(
+ td.update(
{
- "next": {
- "job_due_time": td["job_due_time"],
- "job_weight": td["job_weight"],
- "job_process_time": td["job_process_time"],
- "current_job": current_job,
- "current_time": current_time,
- "action_mask": available,
- "reward": reward,
- "done": done,
- }
- },
- td.shape,
+ "current_job": current_job,
+ "current_time": current_time,
+ "action_mask": available,
+ "reward": reward,
+ "done": done,
+ }
)
+ return td
def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict:
# Initialization
@@ -106,9 +100,8 @@ def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict
batch_size = self.batch_size if td is None else td["job_due_time"].shape[:-1]
batch_size = [batch_size] if isinstance(batch_size, int) else batch_size
- self.device = device = (
- td["job_due_time"].device if td is not None else self.device
- )
+ device = td["job_due_time"].device if td is not None else self.device
+ self.to(device)
td = self.generate_data(batch_size) if td is None else td
@@ -170,7 +163,6 @@ def _make_spec(self, td_params: TensorDict = None):
),
shape=(),
)
- self.input_spec = self.observation_spec.clone()
self.action_spec = BoundedTensorSpec(
shape=(1,),
dtype=torch.int64,
diff --git a/rl4co/models/__init__.py b/rl4co/models/__init__.py
index c923dbe7..b4b794db 100644
--- a/rl4co/models/__init__.py
+++ b/rl4co/models/__init__.py
@@ -1,7 +1,6 @@
from rl4co.models.zoo.active_search import ActiveSearch
from rl4co.models.zoo.am import AttentionModel, AttentionModelPolicy
from rl4co.models.zoo.common.autoregressive import AutoregressivePolicy
-
from rl4co.models.zoo.common.search import SearchBase
from rl4co.models.zoo.eas import EAS, EASEmb, EASLay
from rl4co.models.zoo.ham import (
diff --git a/rl4co/models/nn/utils.py b/rl4co/models/nn/utils.py
index acdc622e..6ab21c7c 100644
--- a/rl4co/models/nn/utils.py
+++ b/rl4co/models/nn/utils.py
@@ -19,7 +19,9 @@ def get_log_likelihood(log_p, actions, mask, return_sum: bool = True):
if mask is not None:
log_p[~mask] = 0
- assert (log_p > -1000).data.all(), "Logprobs should not be -inf, check sampling procedure!"
+ assert (
+ log_p > -1000
+ ).data.all(), "Logprobs should not be -inf, check sampling procedure!"
# Calculate log_likelihood
if return_sum:
diff --git a/rl4co/models/rl/common/base.py b/rl4co/models/rl/common/base.py
index 3ec0ce22..7a7d5b20 100644
--- a/rl4co/models/rl/common/base.py
+++ b/rl4co/models/rl/common/base.py
@@ -1,5 +1,5 @@
from functools import partial
-from typing import Any, Union, Iterable
+from typing import Any, Iterable, Union
import torch
import torch.nn as nn
@@ -149,7 +149,6 @@ def setup(self, stage="fit"):
self.data_cfg["test_data_size"], phase="test"
)
self.dataloader_names = None
-
self.setup_loggers()
self.post_setup_hook()
@@ -214,12 +213,16 @@ def configure_optimizers(self, parameters=None):
def log_metrics(self, metric_dict: dict, phase: str, dataloader_idx: int = None):
"""Log metrics to logger and progress bar"""
- metrics = getattr(self, f"{phase}_metrics")
+ metrics = getattr(self, f"{phase}_metrics")
dataloader_name = ""
if dataloader_idx is not None and self.dataloader_names is not None:
- dataloader_name = "/" + self.dataloader_names[dataloader_idx]
+ dataloader_name = "/" + self.dataloader_names[dataloader_idx]
metrics = {
- f"{phase}/{k}{dataloader_name}": v.mean() if isinstance(v, torch.Tensor) else v for k, v in metric_dict.items() if k in metrics
+ f"{phase}/{k}{dataloader_name}": v.mean()
+ if isinstance(v, torch.Tensor)
+ else v
+ for k, v in metric_dict.items()
+ if k in metrics
}
log_on_step = self.log_on_step if phase == "train" else False
on_epoch = False if phase == "train" else True
@@ -292,7 +295,10 @@ def _dataloader(self, dataset, batch_size, shuffle=False):
self.dataloader_names = list(dataset.keys())
else:
self.dataloader_names = [f"{i}" for i in range(len(dataset))]
- return [self._dataloader_single(ds, batch_size, shuffle) for ds in dataset.values()]
+ return [
+ self._dataloader_single(ds, batch_size, shuffle)
+ for ds in dataset.values()
+ ]
else:
return self._dataloader_single(dataset, batch_size, shuffle)
diff --git a/rl4co/models/rl/ppo/ppo.py b/rl4co/models/rl/ppo/ppo.py
index 6f3cc950..af7915ce 100644
--- a/rl4co/models/rl/ppo/ppo.py
+++ b/rl4co/models/rl/ppo/ppo.py
@@ -6,6 +6,7 @@
from torch.utils.data import DataLoader
+from rl4co.data.dataset import TensorDictDataset, tensordict_collate_fn
from rl4co.envs.common.base import RL4COEnvBase
from rl4co.models.rl.common.base import RL4COLitModule
from rl4co.utils.pylogger import get_pylogger
@@ -124,8 +125,8 @@ def shared_step(
):
# Evaluate old actions, log probabilities, and rewards
with torch.no_grad():
- td = self.env.reset(batch)
- out = self.policy(td, self.env, phase=phase, return_actions=True)
+ td = self.env.reset(batch)  # note: `td` is reused below for the dataloader, so the policy receives a clone
+ out = self.policy(td.clone(), self.env, phase=phase, return_actions=True)
if phase == "train":
batch_size = out["actions"].shape[0]
@@ -146,12 +147,17 @@ def shared_step(
td.set("reward", out["reward"])
td.set("action", out["actions"])
+ dataset = TensorDictDataset(td)
dataloader = DataLoader(
- td, batch_size=mini_batch_size, shuffle=True, collate_fn=lambda x: x
+ dataset,
+ batch_size=mini_batch_size,
+ shuffle=True,
+ collate_fn=tensordict_collate_fn,
)
for _ in range(self.ppo_cfg["ppo_epochs"]): # PPO inner epoch, K
for sub_td in dataloader:
+ previous_reward = sub_td["reward"].view(-1, 1)
ll, entropy = self.policy.evaluate_action(
sub_td, action=sub_td["action"]
)
@@ -163,7 +169,7 @@ def shared_step(
# Compute the advantage
value_pred = self.critic(sub_td) # [batch, 1]
- adv = sub_td["reward"].view(-1, 1) - value_pred.detach()
+ adv = previous_reward - value_pred.detach()
# Normalize advantage
if self.ppo_cfg["normalize_adv"]:
@@ -181,7 +187,7 @@ def shared_step(
).mean()
# compute value function loss
- value_loss = F.huber_loss(value_pred, sub_td["reward"].view(-1, 1))
+ value_loss = F.huber_loss(value_pred, previous_reward)
# compute total loss
loss = (
diff --git a/rl4co/models/rl/reinforce/baselines.py b/rl4co/models/rl/reinforce/baselines.py
index 8117edd1..8d345660 100644
--- a/rl4co/models/rl/reinforce/baselines.py
+++ b/rl4co/models/rl/reinforce/baselines.py
@@ -6,10 +6,9 @@
from scipy.stats import ttest_rel
from torch.utils.data import DataLoader
-from tqdm.auto import tqdm
from rl4co import utils
-from rl4co.data.dataset import ExtraKeyDataset, tensordict_collate_fn
+from rl4co.data.dataset import tensordict_collate_fn
from rl4co.models.rl.common.critic import CriticNetwork
log = utils.get_pylogger(__name__)
@@ -81,6 +80,7 @@ def eval(self, td, reward, env=None):
class MeanBaseline(REINFORCEBaseline):
"""Mean baseline: return mean of reward as baseline"""
+
def __new__(cls, **kw):
return ExponentialBaseline(beta=0.0, **kw)
@@ -158,13 +158,11 @@ class RolloutBaseline(REINFORCEBaseline):
Args:
bl_alpha: Alpha value for the baseline T-test
- progress_bar: Whether to show progress bar for rollout
"""
- def __init__(self, bl_alpha=0.05, progress_bar=False, **kw):
+ def __init__(self, bl_alpha=0.05, **kw):
super(RolloutBaseline, self).__init__()
self.bl_alpha = bl_alpha
- self.progress_bar = progress_bar
def setup(self, *args, **kw):
self._update_model(*args, **kw)
@@ -235,9 +233,7 @@ def eval_model(batch):
dl = DataLoader(dataset, batch_size=batch_size, collate_fn=tensordict_collate_fn)
- rewards = torch.cat(
- [eval_model(batch) for batch in tqdm(dl, disable=not self.progress_bar)], 0
- )
+ rewards = torch.cat([eval_model(batch) for batch in dl], 0)
return rewards
def wrap_dataset(self, dataset, env, batch_size=64, device="cpu", **kw):
@@ -253,7 +249,7 @@ def wrap_dataset(self, dataset, env, batch_size=64, device="cpu", **kw):
.detach()
.cpu()
)
- return ExtraKeyDataset(dataset, rewards)
+ return dataset.add_key("extra", rewards)
def __getstate__(self):
"""Do not include datasets in state to avoid pickling issues"""
diff --git a/rl4co/models/zoo/__init__.py b/rl4co/models/zoo/__init__.py
index c923dbe7..b4b794db 100644
--- a/rl4co/models/zoo/__init__.py
+++ b/rl4co/models/zoo/__init__.py
@@ -1,7 +1,6 @@
from rl4co.models.zoo.active_search import ActiveSearch
from rl4co.models.zoo.am import AttentionModel, AttentionModelPolicy
from rl4co.models.zoo.common.autoregressive import AutoregressivePolicy
-
from rl4co.models.zoo.common.search import SearchBase
from rl4co.models.zoo.eas import EAS, EASEmb, EASLay
from rl4co.models.zoo.ham import (
diff --git a/rl4co/models/zoo/ham/model.py b/rl4co/models/zoo/ham/model.py
index a95f558b..aa402e84 100644
--- a/rl4co/models/zoo/ham/model.py
+++ b/rl4co/models/zoo/ham/model.py
@@ -7,7 +7,7 @@
class HeterogeneousAttentionModel(REINFORCE):
- """Heterogenous Attention Model for solving the Pickup and Delivery Problem based on
+ """Heterogeneous Attention Model for solving the Pickup and Delivery Problem based on
REINFORCE: https://arxiv.org/abs/2110.02634.
Args:
@@ -20,7 +20,7 @@ class HeterogeneousAttentionModel(REINFORCE):
"""
def __init__(
- self,
+ self,
env: RL4COEnvBase,
policy: HeterogeneousAttentionModelPolicy = None,
baseline: Union[REINFORCEBaseline, str] = "rollout",
diff --git a/rl4co/models/zoo/ham/policy.py b/rl4co/models/zoo/ham/policy.py
index d2ae43c8..a9cd4ea1 100644
--- a/rl4co/models/zoo/ham/policy.py
+++ b/rl4co/models/zoo/ham/policy.py
@@ -1,4 +1,3 @@
-import torch.nn as nn
from rl4co.models.zoo.common.autoregressive import AutoregressivePolicy
from rl4co.models.zoo.ham.encoder import GraphHeterogeneousAttentionEncoder
@@ -41,4 +40,4 @@ def __init__(
num_heads=num_heads,
normalization=normalization,
**kwargs,
- )
\ No newline at end of file
+ )
diff --git a/rl4co/models/zoo/matnet/__init__.py b/rl4co/models/zoo/matnet/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/rl4co/models/zoo/matnet/decoder.py b/rl4co/models/zoo/matnet/decoder.py
new file mode 100644
index 00000000..e703bf5c
--- /dev/null
+++ b/rl4co/models/zoo/matnet/decoder.py
@@ -0,0 +1,52 @@
+from dataclasses import dataclass
+from typing import Tuple, Union
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from rl4co.models.zoo.common.autoregressive.decoder import AutoregressiveDecoder
+from rl4co.utils.ops import batchify, get_num_starts, select_start_nodes, unbatchify
+from tensordict import TensorDict
+from torch import Tensor
+
+
+@dataclass
+class PrecomputedCache:
+ node_embeddings: Tensor
+ graph_context: Union[Tensor, float]
+ glimpse_key: Tensor
+ glimpse_val: Tensor
+ logit_key: Tensor
+
+
+class MatNetDecoder(AutoregressiveDecoder):
+ def _precompute_cache(
+ self, embeddings: Tuple[Tensor, Tensor], num_starts: int = 0, td: TensorDict = None
+ ):
+ col_emb, row_emb = embeddings
+ (
+ glimpse_key_fixed,
+ glimpse_val_fixed,
+ logit_key,
+ ) = self.project_node_embeddings(
+ col_emb
+ ).chunk(3, dim=-1)
+
+ # Optionally disable the graph context from the initial embedding as done in POMO
+ if self.use_graph_context:
+ graph_context = unbatchify(
+ batchify(self.project_fixed_context(col_emb.mean(1)), num_starts),
+ num_starts,
+ )
+ else:
+ graph_context = 0
+
+ # Organize in a dataclass for easy access
+ return PrecomputedCache(
+ node_embeddings=row_emb,
+ graph_context=graph_context,
+ glimpse_key=glimpse_key_fixed,
+ glimpse_val=glimpse_val_fixed,
+ # logit_key=col_emb,
+ logit_key=logit_key,
+ )
\ No newline at end of file
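
`_precompute_cache` above splits the projected column embeddings into glimpse keys/values and logit keys and stores them, together with the row embeddings, in the `PrecomputedCache` dataclass. A small sketch of the resulting layout with dummy shapes (batch 2, 5 nodes, embedding dim 128; the tensors are random and purely illustrative):

import torch
from rl4co.models.zoo.matnet.decoder import PrecomputedCache

b, n, d = 2, 5, 128
cache = PrecomputedCache(
    node_embeddings=torch.rand(b, n, d),  # row embeddings
    graph_context=0,                      # graph context disabled, as in POMO
    glimpse_key=torch.rand(b, n, d),
    glimpse_val=torch.rand(b, n, d),
    logit_key=torch.rand(b, n, d),
)
print(cache.node_embeddings.shape)  # torch.Size([2, 5, 128])
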
diff --git a/rl4co/models/zoo/matnet/encoder.py b/rl4co/models/zoo/matnet/encoder.py
new file mode 100644
index 00000000..273baa31
--- /dev/null
+++ b/rl4co/models/zoo/matnet/encoder.py
@@ -0,0 +1,309 @@
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from rl4co.models.nn.ops import Normalization
+from tensordict import TensorDict
+
+
+class MatNetCrossMHA(nn.Module):
+ def __init__(
+ self,
+ embedding_dim: int,
+ num_heads: int,
+ bias: bool = True,
+ mixer_hidden_dim: int = 16,
+ mix1_init: float = (1 / 2) ** (1 / 2),
+ mix2_init: float = (1 / 16) ** (1 / 2),
+ ):
+ super().__init__()
+ self.embedding_dim = embedding_dim
+ self.num_heads = num_heads
+ assert (
+ self.embedding_dim % num_heads == 0
+ ), "embedding_dim must be divisible by num_heads"
+ self.head_dim = self.embedding_dim // num_heads
+
+ self.Wq = nn.Linear(embedding_dim, embedding_dim, bias=bias)
+ self.Wkv = nn.Linear(embedding_dim, 2 * embedding_dim, bias=bias)
+
+ # Score mixer
+ # Taken from the official MatNet implementation
+ # https://github.com/yd-kwon/MatNet/blob/main/ATSP/ATSP_MatNet/ATSPModel_LIB.py#L72
+        mix_W1 = torch.distributions.Uniform(low=-mix1_init, high=mix1_init).sample(
+ (num_heads, 2, mixer_hidden_dim)
+ )
+        mix_b1 = torch.distributions.Uniform(low=-mix1_init, high=mix1_init).sample(
+ (num_heads, mixer_hidden_dim)
+ )
+ self.mix_W1 = nn.Parameter(mix_W1)
+ self.mix_b1 = nn.Parameter(mix_b1)
+
+        mix_W2 = torch.distributions.Uniform(low=-mix2_init, high=mix2_init).sample(
+ (num_heads, mixer_hidden_dim, 1)
+ )
+        mix_b2 = torch.distributions.Uniform(low=-mix2_init, high=mix2_init).sample(
+ (num_heads, 1)
+ )
+ self.mix_W2 = nn.Parameter(mix_W2)
+ self.mix_b2 = nn.Parameter(mix_b2)
+
+ self.out_proj = nn.Linear(embedding_dim, embedding_dim, bias=bias)
+
+ def forward(self, q_input, kv_input, dmat):
+ """
+
+ Args:
+ q_input (Tensor): [b, m, d]
+ kv_input (Tensor): [b, n, d]
+ dmat (Tensor): [b, m, n]
+
+ Returns:
+ Tensor: [b, m, d]
+ """
+
+ b, m, n = dmat.shape
+
+ q = rearrange(
+ self.Wq(q_input), "b m (h d) -> b h m d", h=self.num_heads
+ ) # [b, h, m, d]
+ k, v = rearrange(
+ self.Wkv(kv_input), "b n (two h d) -> two b h n d", two=2, h=self.num_heads
+ ).unbind(
+ dim=0
+ ) # [b, h, n, d]
+
+ scale = math.sqrt(q.size(-1)) # scale factor
+ attn_scores = torch.matmul(q, k.transpose(2, 3)) / scale # [b, h, m, n]
+ mix_attn_scores = torch.stack(
+ [attn_scores, dmat[:, None, :, :].expand(b, self.num_heads, m, n)], dim=-1
+ ) # [b, h, m, n, 2]
+
+ mix_attn_scores = (
+ (
+ torch.matmul(
+ F.relu(
+ torch.matmul(mix_attn_scores.transpose(1, 2), self.mix_W1)
+ + self.mix_b1[None, None, :, None, :]
+ ),
+ self.mix_W2,
+ )
+ + self.mix_b2[None, None, :, None, :]
+ )
+ .transpose(1, 2)
+ .squeeze(-1)
+ ) # [b, h, m, n]
+
+ attn_probs = F.softmax(mix_attn_scores, dim=-1)
+ out = torch.matmul(attn_probs, v)
+ return self.out_proj(rearrange(out, "b h s d -> b s (h d)"))
+
+
+class MatNetMHA(nn.Module):
+ def __init__(self, embedding_dim: int, num_heads: int, bias: bool = True):
+ super().__init__()
+ self.row_encoding_block = MatNetCrossMHA(embedding_dim, num_heads, bias)
+ self.col_encoding_block = MatNetCrossMHA(embedding_dim, num_heads, bias)
+
+ def forward(self, row_emb, col_emb, dmat):
+ """
+ Args:
+ row_emb (Tensor): [b, m, d]
+ col_emb (Tensor): [b, n, d]
+ dmat (Tensor): [b, m, n]
+
+ Returns:
+ Updated row_emb (Tensor): [b, m, d]
+ Updated col_emb (Tensor): [b, n, d]
+ """
+
+ updated_row_emb = self.row_encoding_block(row_emb, col_emb, dmat)
+ updated_col_emb = self.col_encoding_block(
+ col_emb, row_emb, dmat.transpose(-2, -1)
+ )
+ return updated_row_emb, updated_col_emb
+
+
+class MatNetMHALayer(nn.Module):
+ def __init__(
+ self,
+ embedding_dim: int,
+ num_heads: int,
+ bias: bool = True,
+ feed_forward_hidden: int = 512,
+ normalization: Optional[str] = "instance",
+ ):
+ super().__init__()
+ self.MHA = MatNetMHA(embedding_dim, num_heads, bias)
+
+ self.F_a = nn.ModuleDict(
+ {
+ "norm1": Normalization(embedding_dim, normalization),
+ "ffn": nn.Sequential(
+ nn.Linear(embedding_dim, feed_forward_hidden),
+ nn.ReLU(),
+ nn.Linear(feed_forward_hidden, embedding_dim),
+ ),
+ "norm2": Normalization(embedding_dim, normalization),
+ }
+ )
+
+ self.F_b = nn.ModuleDict(
+ {
+ "norm1": Normalization(embedding_dim, normalization),
+ "ffn": nn.Sequential(
+ nn.Linear(embedding_dim, feed_forward_hidden),
+ nn.ReLU(),
+ nn.Linear(feed_forward_hidden, embedding_dim),
+ ),
+ "norm2": Normalization(embedding_dim, normalization),
+ }
+ )
+
+ def forward(self, row_emb, col_emb, dmat):
+ """
+ Args:
+ row_emb (Tensor): [b, m, d]
+ col_emb (Tensor): [b, n, d]
+ dmat (Tensor): [b, m, n]
+
+ Returns:
+ Updated row_emb (Tensor): [b, m, d]
+ Updated col_emb (Tensor): [b, n, d]
+ """
+
+ row_emb_out, col_emb_out = self.MHA(row_emb, col_emb, dmat)
+
+ row_emb_out = self.F_a["norm1"](row_emb + row_emb_out)
+ row_emb_out = self.F_a["norm2"](row_emb_out + self.F_a["ffn"](row_emb_out))
+
+ col_emb_out = self.F_b["norm1"](col_emb + col_emb_out)
+ col_emb_out = self.F_b["norm2"](col_emb_out + self.F_b["ffn"](col_emb_out))
+ return row_emb_out, col_emb_out
+
+
+class MatNetMHANetwork(nn.Module):
+ def __init__(
+ self,
+ embedding_dim: int = 128,
+ num_heads: int = 8,
+ num_layers: int = 3,
+ normalization: str = "batch",
+ feed_forward_hidden: int = 512,
+ ):
+ super().__init__()
+ self.layers = nn.ModuleList(
+ [
+ MatNetMHALayer(
+ num_heads=num_heads,
+ embedding_dim=embedding_dim,
+ feed_forward_hidden=feed_forward_hidden,
+ normalization=normalization,
+ )
+ for _ in range(num_layers)
+ ]
+ )
+
+ def forward(self, row_emb, col_emb, dmat):
+ """
+ Args:
+ row_emb (Tensor): [b, m, d]
+ col_emb (Tensor): [b, n, d]
+ dmat (Tensor): [b, m, n]
+
+ Returns:
+ Updated row_emb (Tensor): [b, m, d]
+ Updated col_emb (Tensor): [b, n, d]
+ """
+
+ for layer in self.layers:
+ row_emb, col_emb = layer(row_emb, col_emb, dmat)
+ return row_emb, col_emb
+
+
+class MatNetATSPInitEmbedding(nn.Module):
+ """
+ Preparing the initial row and column embeddings for ATSP.
+
+ Reference:
+ https://github.com/yd-kwon/MatNet/blob/782698b60979effe2e7b61283cca155b7cdb727f/ATSP/ATSP_MatNet/ATSPModel.py#L51
+
+
+ """
+
+ def __init__(self, embedding_dim: int, mode: str = "RandomOneHot") -> None:
+ super().__init__()
+
+ self.embedding_dim = embedding_dim
+ assert mode in {
+ "RandomOneHot",
+ "Random",
+ }, "mode must be one of ['RandomOneHot', 'Random']"
+ self.mode = mode
+
+ self.dmat_proj = nn.Linear(1, 2 * embedding_dim, bias=False)
+ self.row_proj = nn.Linear(embedding_dim * 4, embedding_dim, bias=False)
+ self.col_proj = nn.Linear(embedding_dim * 4, embedding_dim, bias=False)
+
+ def forward(self, td: TensorDict):
+ dmat = td["cost_matrix"] # [b, n, n]
+ b, n, _ = dmat.shape
+
+ row_emb = torch.zeros(b, n, self.embedding_dim, device=dmat.device)
+
+ if self.mode == "RandomOneHot":
+ # MatNet uses one-hot encoding for column embeddings
+ # https://github.com/yd-kwon/MatNet/blob/782698b60979effe2e7b61283cca155b7cdb727f/ATSP/ATSP_MatNet/ATSPModel.py#L60
+
+ col_emb = torch.zeros(b, n, self.embedding_dim, device=dmat.device)
+ rand = torch.rand(b, n)
+ rand_idx = rand.argsort(dim=1)
+ b_idx = torch.arange(b)[:, None].expand(b, n)
+ n_idx = torch.arange(n)[None, :].expand(b, n)
+ col_emb[b_idx, n_idx, rand_idx] = 1.0
+
+ elif self.mode == "Random":
+ col_emb = torch.rand(b, n, self.embedding_dim, device=dmat.device)
+ else:
+ raise NotImplementedError
+
+ return row_emb, col_emb, dmat
+
+
+class MatNetEncoder(nn.Module):
+ def __init__(
+ self,
+ embedding_dim: int = 256,
+ num_heads: int = 16,
+ num_layers: int = 5,
+ normalization: str = "instance",
+ feed_forward_hidden: int = 512,
+ init_embedding: nn.Module = None,
+ init_embedding_kwargs: dict = None,
+ ):
+ super().__init__()
+
+ if init_embedding is None:
+ init_embedding = MatNetATSPInitEmbedding(
+                embedding_dim, **(init_embedding_kwargs or {})
+ )
+
+ self.init_embedding = init_embedding
+ self.net = MatNetMHANetwork(
+ embedding_dim=embedding_dim,
+ num_heads=num_heads,
+ num_layers=num_layers,
+ normalization=normalization,
+ feed_forward_hidden=feed_forward_hidden,
+ )
+
+ def forward(self, td):
+ row_emb, col_emb, dmat = self.init_embedding(td)
+ row_emb, col_emb = self.net(row_emb, col_emb, dmat)
+
+ embedding = (row_emb, col_emb)
+ init_embedding = None
+ return embedding, init_embedding # match output signature for the AR policy class
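
As a quick smoke test, the new encoder can be exercised on random cost matrices. This is a sketch assuming the modules added by this patch are importable; only CPU tensors are used.

import torch
from tensordict import TensorDict

from rl4co.models.zoo.matnet.encoder import MatNetEncoder

# Encode a batch of random ATSP cost matrices.
batch, n = 4, 20
td = TensorDict({"cost_matrix": torch.rand(batch, n, n)}, batch_size=[batch])

encoder = MatNetEncoder(
    embedding_dim=256,
    num_heads=16,
    num_layers=5,
    init_embedding_kwargs={"mode": "RandomOneHot"},
)
(row_emb, col_emb), _ = encoder(td)
print(row_emb.shape, col_emb.shape)  # both torch.Size([4, 20, 256])

Passing init_embedding_kwargs explicitly selects the one-hot column initialization used by the original MatNet.
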
diff --git a/rl4co/models/zoo/matnet/model.py b/rl4co/models/zoo/matnet/model.py
new file mode 100644
index 00000000..1af9cace
--- /dev/null
+++ b/rl4co/models/zoo/matnet/model.py
@@ -0,0 +1,39 @@
+from typing import Any, Union
+from rl4co.models.zoo.matnet.policy import MatNetPolicy
+
+import torch.nn as nn
+
+from rl4co.models.zoo.pomo.model import POMO
+from rl4co.envs.common.base import RL4COEnvBase
+
+
+class MatNet(POMO):
+ def __init__(
+ self,
+ env: RL4COEnvBase,
+ policy: Union[nn.Module, MatNetPolicy] = None,
+ optimizer_kwargs: dict = {"lr": 4 * 1e-4, "weight_decay": 1e-6},
+ lr_scheduler: str = "MultiStepLR",
+ lr_scheduler_kwargs: dict = {"milestones": [2001, 2101], "gamma": 0.1},
+ use_dihedral_8: bool = False,
+ num_starts: int = None,
+ train_data_size: int = 10_000,
+ batch_size: int = 200,
+ policy_params: dict = {},
+ model_params: dict = {},
+ ):
+ if policy is None:
+ policy = MatNetPolicy(env_name=env.name, **policy_params)
+
+ super(MatNet, self).__init__(
+ env=env,
+ policy=policy,
+ optimizer_kwargs=optimizer_kwargs,
+ lr_scheduler=lr_scheduler,
+ lr_scheduler_kwargs=lr_scheduler_kwargs,
+ use_dihedral_8=use_dihedral_8,
+ num_starts=num_starts,
+ train_data_size=train_data_size,
+ batch_size=batch_size,
+ **model_params,
+ )
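
A hypothetical end-to-end usage of the new model wrapper is sketched below. It assumes an ATSP environment exposing a cost_matrix field is available (e.g. rl4co.envs.ATSPEnv, which is not added by this diff) and that the package is installed.

from rl4co.envs import ATSPEnv  # assumed to exist; not part of this patch
from rl4co.models.zoo.matnet.model import MatNet
from rl4co.utils import RL4COTrainer

env = ATSPEnv(num_loc=20)
model = MatNet(env, train_data_size=10_000, batch_size=200)

# RL4COTrainer forwards keyword arguments to the underlying Lightning Trainer.
trainer = RL4COTrainer(max_epochs=1, accelerator="cpu", devices=1)
trainer.fit(model)

Since MatNet subclasses POMO, training goes through the multistart/augmentation logic of the POMO shared_step shown further down in this patch.
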
diff --git a/rl4co/models/zoo/matnet/policy.py b/rl4co/models/zoo/matnet/policy.py
new file mode 100644
index 00000000..8b4e1761
--- /dev/null
+++ b/rl4co/models/zoo/matnet/policy.py
@@ -0,0 +1,61 @@
+from rl4co.models.zoo.common.autoregressive import AutoregressivePolicy
+from rl4co.models.zoo.matnet.encoder import MatNetEncoder
+from rl4co.models.zoo.matnet.decoder import MatNetDecoder
+from rl4co.utils.pylogger import get_pylogger
+
+log = get_pylogger(__name__)
+
+
+class MatNetPolicy(AutoregressivePolicy):
+ """MatNet Policy from Kwon et al., 2021.
+ Reference: https://arxiv.org/abs/2106.11113
+
+ Warning:
+ This implementation is under development and subject to change.
+
+ Args:
+ env_name: Name of the environment used to initialize embeddings
+ embedding_dim: Dimension of the node embeddings
+ num_encoder_layers: Number of layers in the encoder
+ num_heads: Number of heads in the attention layers
+ normalization: Normalization type in the attention layers
+ **kwargs: keyword arguments passed to the `AutoregressivePolicy`
+
+    Default parameters are adopted from the original implementation.
+ """
+
+ def __init__(
+ self,
+ env_name: str,
+ embedding_dim: int = 256,
+ num_encoder_layers: int = 5,
+ num_heads: int = 16,
+ normalization: str = "instance",
+ init_embedding_kwargs: dict = {"mode": "RandomOneHot"},
+ use_graph_context: bool = False,
+ **kwargs,
+ ):
+ if env_name not in ["atsp"]:
+ log.error(f"env_name {env_name} is not originally implemented in MatNet")
+
+ super(MatNetPolicy, self).__init__(
+ env_name=env_name,
+ encoder=MatNetEncoder(
+ embedding_dim=embedding_dim,
+ num_heads=num_heads,
+ num_layers=num_encoder_layers,
+ normalization=normalization,
+ init_embedding_kwargs=init_embedding_kwargs,
+ ),
+ decoder=MatNetDecoder(
+ env_name=env_name,
+ embedding_dim=embedding_dim,
+ num_heads=num_heads,
+ use_graph_context=use_graph_context,
+ ),
+ embedding_dim=embedding_dim,
+ num_encoder_layers=num_encoder_layers,
+ num_heads=num_heads,
+ normalization=normalization,
+ **kwargs,
+ )
diff --git a/rl4co/models/zoo/mdam/__init__.py b/rl4co/models/zoo/mdam/__init__.py
index 0dcc6521..2b7a14da 100644
--- a/rl4co/models/zoo/mdam/__init__.py
+++ b/rl4co/models/zoo/mdam/__init__.py
@@ -1,2 +1,2 @@
+from .model import MDAM
from .policy import MDAMPolicy
-from .model import MDAM
\ No newline at end of file
diff --git a/rl4co/models/zoo/mdam/decoder.py b/rl4co/models/zoo/mdam/decoder.py
index 87fd0dee..8e1b9daf 100644
--- a/rl4co/models/zoo/mdam/decoder.py
+++ b/rl4co/models/zoo/mdam/decoder.py
@@ -1,15 +1,14 @@
import math
-from typing import Union
from dataclasses import dataclass
-from tensordict import TensorDict
+from typing import Union
import torch
import torch.nn as nn
import torch.nn.functional as F
+from tensordict import TensorDict
from rl4co.envs import RL4COEnvBase
-
from rl4co.models.nn.attention import LogitAttention
from rl4co.models.nn.env_embeddings import env_context_embedding, env_dynamic_embedding
from rl4co.models.nn.utils import decode_probs, get_log_likelihood
@@ -67,8 +66,7 @@ def __init__(
self.project_node_embeddings = nn.ModuleList(self.project_node_embeddings)
self.project_fixed_context = [
- nn.Linear(embedding_dim, embedding_dim, bias=False)
- for _ in range(num_paths)
+ nn.Linear(embedding_dim, embedding_dim, bias=False) for _ in range(num_paths)
]
self.project_fixed_context = nn.ModuleList(self.project_fixed_context)
@@ -79,8 +77,7 @@ def __init__(
self.project_step_context = nn.ModuleList(self.project_step_context)
self.project_out = [
- nn.Linear(embedding_dim, embedding_dim, bias=False)
- for _ in range(num_paths)
+ nn.Linear(embedding_dim, embedding_dim, bias=False) for _ in range(num_paths)
]
self.project_out = nn.ModuleList(self.project_out)
@@ -108,15 +105,15 @@ def __init__(
self.shrink_size = shrink_size
def forward(
- self,
- td: TensorDict,
- encoded_inputs: torch.Tensor,
- env: Union[str, RL4COEnvBase],
- attn,
- V,
- h_old,
- **decoder_kwargs
- ):
+ self,
+ td: TensorDict,
+ encoded_inputs: torch.Tensor,
+ env: Union[str, RL4COEnvBase],
+ attn,
+ V,
+ h_old,
+ **decoder_kwargs,
+ ):
# SECTION: Decoder first step: calculate for the decoder divergence loss
# Cost list and log likelihood list along with path
output_list = []
@@ -261,7 +258,9 @@ def _get_log_p(self, fixed, td, path_index, normalize=True):
step_context = self.context[path_index](
fixed.node_embeddings, td
) # [batch, embed_dim]
- glimpse_q = fixed.graph_context + step_context.unsqueeze(1).to(fixed.graph_context.device)
+ glimpse_q = fixed.graph_context + step_context.unsqueeze(1).to(
+ fixed.graph_context.device
+ )
# Compute keys and values for the nodes
(
diff --git a/rl4co/models/zoo/mdam/model.py b/rl4co/models/zoo/mdam/model.py
index b0696cf3..5ab935a6 100644
--- a/rl4co/models/zoo/mdam/model.py
+++ b/rl4co/models/zoo/mdam/model.py
@@ -1,4 +1,3 @@
-
from typing import Union
from rl4co.envs.common.base import RL4COEnvBase
@@ -8,10 +7,10 @@
class MDAM(REINFORCE):
- """ Multi-Decoder Attention Model (MDAM) is a model
- to train multiple diverse policies, which effectively increases the chance of finding
+ """Multi-Decoder Attention Model (MDAM) is a model
+ to train multiple diverse policies, which effectively increases the chance of finding
good solutions compared with existing methods that train only one policy.
- Reference link: https://arxiv.org/abs/2012.10638;
+ Reference link: https://arxiv.org/abs/2012.10638;
Implementation reference: https://github.com/liangxinedu/MDAM.
Args:
@@ -24,15 +23,15 @@ class MDAM(REINFORCE):
"""
def __init__(
- self,
- env: RL4COEnvBase,
- policy: MDAMPolicy = None,
- baseline: Union[REINFORCEBaseline, str] = "rollout",
- policy_kwargs={},
- baseline_kwargs={},
- **kwargs
- ):
+ self,
+ env: RL4COEnvBase,
+ policy: MDAMPolicy = None,
+ baseline: Union[REINFORCEBaseline, str] = "rollout",
+ policy_kwargs={},
+ baseline_kwargs={},
+ **kwargs,
+ ):
if policy is None:
- policy = MDAMPolicy(env.name, **policy_kwargs)
+ policy = MDAMPolicy(env.name, **policy_kwargs)
- super().__init__(env, policy, baseline, baseline_kwargs, **kwargs)
\ No newline at end of file
+ super().__init__(env, policy, baseline, baseline_kwargs, **kwargs)
diff --git a/rl4co/models/zoo/mdam/policy.py b/rl4co/models/zoo/mdam/policy.py
index 7a8b7c04..30299dd5 100644
--- a/rl4co/models/zoo/mdam/policy.py
+++ b/rl4co/models/zoo/mdam/policy.py
@@ -1,26 +1,25 @@
-import torch.nn as nn
from typing import Union
from tensordict import TensorDict
-from rl4co.envs import RL4COEnvBase, get_env
+from rl4co.envs import RL4COEnvBase, get_env
from rl4co.models.nn.env_embeddings import env_init_embedding
+from rl4co.models.zoo.common.autoregressive import AutoregressivePolicy
from rl4co.models.zoo.mdam.decoder import Decoder
from rl4co.models.zoo.mdam.encoder import GraphAttentionEncoder
-from rl4co.models.zoo.common.autoregressive import AutoregressivePolicy
from rl4co.utils.pylogger import get_pylogger
log = get_pylogger(__name__)
class MDAMPolicy(AutoregressivePolicy):
- """ Multi-Decoder Attention Model (MDAM) policy.
+ """Multi-Decoder Attention Model (MDAM) policy.
Args:
"""
-
+
def __init__(
- self,
+ self,
env_name: str,
embedding_dim: int = 128,
num_encoder_layers: int = 3,
@@ -35,13 +34,13 @@ def __init__(
embed_dim=embedding_dim,
num_layers=num_encoder_layers,
normalization=normalization,
- **kwargs
+ **kwargs,
),
decoder=Decoder(
env_name=env_name,
embedding_dim=embedding_dim,
num_heads=num_heads,
- **kwargs
+ **kwargs,
),
embedding_dim=embedding_dim,
num_encoder_layers=num_encoder_layers,
@@ -84,4 +83,4 @@ def forward(
"entropy": kl_divergence,
"actions": actions if return_actions else None,
}
- return out
\ No newline at end of file
+ return out
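
For intuition, the quantity MDAM logs under "entropy" is a divergence term between its decoders: the num_paths decoders are encouraged to produce different first-step action distributions. The sketch below is illustrative only, not the exact rl4co computation.

import torch
import torch.nn.functional as F

# Pairwise KL divergence between the first-step policies of the decoder paths.
num_paths, batch, num_nodes = 5, 2, 20
logits = torch.randn(num_paths, batch, num_nodes)  # one logit set per decoder path
log_p = F.log_softmax(logits, dim=-1)

kl = torch.zeros(())
for i in range(num_paths):
    for j in range(num_paths):
        # KL(p_i || p_j), summed over actions and averaged over the batch
        kl = kl + (log_p[i].exp() * (log_p[i] - log_p[j])).sum(-1).mean()
kl_divergence = kl / (num_paths * num_paths)
print(kl_divergence)
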
diff --git a/rl4co/models/zoo/pomo/model.py b/rl4co/models/zoo/pomo/model.py
index eefff8ae..1d707df2 100644
--- a/rl4co/models/zoo/pomo/model.py
+++ b/rl4co/models/zoo/pomo/model.py
@@ -56,7 +56,9 @@ def __init__(
for phase in ["train", "val", "test"]:
self.set_decode_type_multistart(phase)
- def shared_step(self, batch: Any, batch_idx: int, phase: str, dataloader_idx: int = None):
+ def shared_step(
+ self, batch: Any, batch_idx: int, phase: str, dataloader_idx: int = None
+ ):
td = self.env.reset(batch)
n_aug, n_start = self.num_augment, self.num_starts
n_start = get_num_starts(td) if n_start is None else n_start
@@ -102,10 +104,10 @@ def shared_step(self, batch: Any, batch_idx: int, phase: str, dataloader_idx: in
out.update({"max_aug_reward": max_aug_reward})
if out.get("actions", None) is not None:
- actions_ = out["best_multistart_actions"] if n_start > 1 else out["actions"]
- out.update(
- {"best_aug_actions": gather_by_index(actions_, max_idxs)}
+ actions_ = (
+ out["best_multistart_actions"] if n_start > 1 else out["actions"]
)
+ out.update({"best_aug_actions": gather_by_index(actions_, max_idxs)})
metrics = self.log_metrics(out, phase, dataloader_idx=dataloader_idx)
return {"loss": out.get("loss", None), **metrics}
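
The reformatted block above picks one trajectory per instance after the multistart/augmentation rollouts. A shape-level sketch of that selection, with starts and augmentations simplified into a single candidate dimension, is shown below (the actual code goes through unbatchify and gather_by_index).

import torch

# Each instance has several candidate rollouts; keep the best reward and
# the actions that produced it.
batch, n_candidates, seq_len = 4, 16, 20
reward = torch.randn(batch, n_candidates)
actions = torch.randint(0, 20, (batch, n_candidates, seq_len))

max_reward, max_idxs = reward.max(dim=1)
best_actions = actions.gather(
    1, max_idxs[:, None, None].expand(batch, 1, seq_len)
).squeeze(1)
print(max_reward.shape, best_actions.shape)  # torch.Size([4]) torch.Size([4, 20])
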
diff --git a/rl4co/tasks/train.py b/rl4co/tasks/train.py
index 8d04a01c..6b628470 100644
--- a/rl4co/tasks/train.py
+++ b/rl4co/tasks/train.py
@@ -9,11 +9,12 @@
from lightning.pytorch.loggers import Logger
from omegaconf import DictConfig
-pyrootutils.setup_root(__file__, indicator=".gitignore", pythonpath=True)
-
from rl4co import utils
from rl4co.utils import RL4COTrainer
+pyrootutils.setup_root(__file__, indicator=".gitignore", pythonpath=True)
+
+
log = utils.get_pylogger(__name__)
diff --git a/rl4co/utils/ops.py b/rl4co/utils/ops.py
index 2034d0de..669e4a12 100644
--- a/rl4co/utils/ops.py
+++ b/rl4co/utils/ops.py
@@ -6,7 +6,6 @@
from torch import Tensor
-# @torch.jit.script
def _batchify_single(
x: Union[Tensor, TensorDict], repeats: int
) -> Union[Tensor, TensorDict]:
@@ -61,7 +60,6 @@ def unbatchify(
return x
-# @torch.jit.script
def gather_by_index(src, idx, dim=1, squeeze=True):
"""Gather elements from src by index idx along specified dim
@@ -76,13 +74,13 @@ def gather_by_index(src, idx, dim=1, squeeze=True):
return src.gather(dim, idx).squeeze() if squeeze else src.gather(dim, idx)
-# @torch.jit.script
+@torch.jit.script
def get_distance(x: Tensor, y: Tensor):
"""Euclidean distance between two tensors of shape `[..., n, dim]`"""
return (x - y).norm(p=2, dim=-1)
-# @torch.jit.script
+@torch.jit.script
def get_tour_length(ordered_locs):
"""Compute the total tour distance for a batch of ordered tours.
Computes the L2 norm between each pair of consecutive nodes in the tour and sums them up.
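
The now-scripted get_tour_length follows the semantics stated in its docstring. A minimal standalone check of that computation (assuming a closed tour, i.e. including the edge back to the starting node) could look like:

import torch

ordered_locs = torch.rand(3, 10, 2)                       # [batch, n, 2] visiting order
next_locs = torch.roll(ordered_locs, shifts=-1, dims=-2)  # successor of each node
tour_length = (ordered_locs - next_locs).norm(p=2, dim=-1).sum(-1)
print(tour_length.shape)  # torch.Size([3])
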
diff --git a/rl4co/utils/optim_helpers.py b/rl4co/utils/optim_helpers.py
index f784a62b..46367a37 100644
--- a/rl4co/utils/optim_helpers.py
+++ b/rl4co/utils/optim_helpers.py
@@ -1,7 +1,6 @@
import inspect
import torch
-import torch.nn as nn
from torch.optim import Optimizer
diff --git a/rl4co/utils/trainer.py b/rl4co/utils/trainer.py
index a76b4e5f..790437c3 100644
--- a/rl4co/utils/trainer.py
+++ b/rl4co/utils/trainer.py
@@ -68,7 +68,7 @@ def __init__(
except AttributeError:
pass
- # Configure DDP automatically
+ # Configure DDP automatically if multiple GPUs are available
if auto_configure_ddp and strategy == "auto":
if devices == "auto":
n_devices = num_cuda_devices()
@@ -77,7 +77,11 @@ def __init__(
else:
n_devices = devices
if n_devices > 1:
- log.info("Configuring DDP strategy automatically")
+ log.info(
+ "Configuring DDP strategy automatically with {} GPUs".format(
+ n_devices
+ )
+ )
strategy = DDPStrategy(
find_unused_parameters=True, # We set to True due to RL envs
gradient_as_bucket_view=True, # https://pytorch-lightning.readthedocs.io/en/stable/advanced/advanced_gpu.html#ddp-optimizations
@@ -89,7 +93,9 @@ def __init__(
# Check if gradient_clip_val is set to None
if gradient_clip_val is None:
- log.warning("gradient_clip_val is set to None. This may lead to unstable training.")
+ log.warning(
+ "gradient_clip_val is set to None. This may lead to unstable training."
+ )
# We should reload dataloaders every epoch for RL training
if reload_dataloaders_every_n_epochs != 1: