diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 4e0e7f6bcb..02238a51fa 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -20,42 +20,42 @@ jobs: include: - name: cpu-3.10-2.0 container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not vision and not doctest + markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.10-2.1 container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not vision and not doctest + markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.10-2.1-composer container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not vision and not doctest + markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: cpu-doctest container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not vision and doctest + markers: not daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml - name: daily-cpu-3.10-2.0 container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not vision and not doctest + markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: daily-cpu-3.10-2.1 container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not vision and not doctest + markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: daily-cpu-3.10-2.1-composer container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not vision and not doctest + markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: daily-cpu-doctest container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not vision and doctest + markers: daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml name: ${{ matrix.name }} diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index e2c715710e..6eee54cb0b 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -15,17 +15,17 @@ jobs: include: - name: cpu-3.10-2.0 container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 - markers: not daily and not remote and not gpu and not vision and not doctest + markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.10-2.1 container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and not remote and not gpu and not vision and not doctest + markers: not daily and not remote and not gpu 
and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-doctest container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and not remote and not gpu and not vision and doctest + markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml name: ${{ matrix.name }} diff --git a/README.md b/README.md index 17a6e41cfd..8bdda2d3e0 100644 --- a/README.md +++ b/README.md @@ -135,26 +135,55 @@ Here is a code snippet demonstrating our Trainer on the MNIST dataset. ```python +import torch +import torch.nn as nn +import torch.nn.functional as F from torchvision import datasets, transforms from torch.utils.data import DataLoader from composer import Trainer -from composer.models import mnist_model +from composer.models import ComposerClassifier from composer.algorithms import LabelSmoothing, CutMix, ChannelsLast +class Model(nn.Module): + """Toy convolutional neural network architecture in pytorch for MNIST.""" + + def __init__(self, num_classes: int = 10): + super().__init__() + + self.num_classes = num_classes + + self.conv1 = nn.Conv2d(1, 16, (3, 3), padding=0) + self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=0) + self.bn = nn.BatchNorm2d(32) + self.fc1 = nn.Linear(32 * 16, 32) + self.fc2 = nn.Linear(32, num_classes) + + def forward(self, x): + out = self.conv1(x) + out = F.relu(out) + out = self.conv2(out) + out = self.bn(out) + out = F.relu(out) + out = F.adaptive_avg_pool2d(out, (4, 4)) + out = torch.flatten(out, 1, -1) + out = self.fc1(out) + out = F.relu(out) + return self.fc2(out) + transform = transforms.Compose([transforms.ToTensor()]) dataset = datasets.MNIST("data", train=True, download=True, transform=transform) train_dataloader = DataLoader(dataset, batch_size=128) trainer = Trainer( - model=mnist_model(num_classes=10), + model=ComposerClassifier(module=Model(), num_classes=10), train_dataloader=train_dataloader, max_duration="2ep", algorithms=[ LabelSmoothing(smoothing=0.1), CutMix(alpha=1.0), ChannelsLast(), - ] + ], ) trainer.fit() ``` diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md index 274c10ce9c..4943a9db58 100644 --- a/STYLE_GUIDE.md +++ b/STYLE_GUIDE.md @@ -227,22 +227,23 @@ All imports in composer should be absolute -- that is, they do not begin with a 1. If a dependency is not core to Composer (e.g. it is for a model, dataset, algorithm, or some callbacks): 1. It must be specified in a entry of the `extra_deps` dictionary of [setup.py](setup.py). This dictionary groups dependencies that can be conditionally installed. An entry named `foo` - can be installed with `pip install 'mosaicml[foo]'`. For example, running `pip install 'mosaicml[unet]'` - will install everything in `install_requires`, along with `monai` and `scikit-learn`. + can be installed with `pip install 'mosaicml[foo]'`. For example, running `pip install 'mosaicml[system_metrics_monitor]'` + will install everything in `install_requires`, along with `pynvml`. 1. It must also be specified in the `run_constrained` and the `test.requires` section. 1. The import must be conditionally imported in the code. 
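As a rough, hedged sketch of the first requirement, an `extra_deps` entry in [setup.py](setup.py) might look like the following (the group name, package, and version bound here are illustrative, not a copy of the real file); the conditional import itself is shown in the next example.

```python
# Hypothetical sketch of an ``extra_deps`` group in setup.py.
# The actual package list and version pins in the repository may differ.
extra_deps = {}

extra_deps['system_metrics_monitor'] = [
    'pynvml>=11.5.0,<12',  # illustrative version bound
]

# `pip install 'mosaicml[system_metrics_monitor]'` then installs everything in
# `install_requires` plus the packages listed in this group.
```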
For example: ```python + from composer import Callback + from composer.utils import MissingConditionalImportError - def unet(): + class SystemMetricsMonitor(Callback): try: - import monai + import pynvml except ImportError as e: - raise MissingConditionalImportError(extra_deps_group="system_metrics_monitor", - conda_package="monai", + raise MissingConditionalImportError(extra_deps_group="system_metrics_monitor", + conda_package="pynvml", conda_channel="conda-forge",) from e ``` diff --git a/composer/algorithms/blurpool/README.md b/composer/algorithms/blurpool/README.md index f99e1fb275..24b25d221a 100644 --- a/composer/algorithms/blurpool/README.md +++ b/composer/algorithms/blurpool/README.md @@ -56,9 +56,7 @@ def training_loop(model, train_loader): -```python -from composer.models import composer_deeplabv3 - -model = composer_deeplabv3(num_classes=150, - backbone_arch="resnet101", - backbone_weights="IMAGENET1K_V2", - sync_bn=False -) -``` - -## Architecture - -Based on [Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611) - -
-*Figure: DeepLabV3+ architecture diagram (alt text: deeplabv3plus)* -
- - -- **Backbone network**: converts the input image into a feature map. - * Usually ResNet-101 with the strided convolutions converted to dilated convolutions in stage 3 and 4. - * The 3x3 convolutions in stage 3 and 4 have dilation sizes of 2 and 4, respectively, to compensate for the decreased receptive field. - * The average pooling and classification layer are ignored. -- **Spatial Pyramid Pooling**: extracts multi-resolution features from the stage 4 backbone feature map. - * The backbone feature map is processed with four parallel convolution layers with dilations {1, 12, 24, 36} and kernel sizes {1x1, 3x3, 3x3, 3x3}. - * In parallel to the convolutions, global average pool the backbone feature map, then bilinearly upsample to be the same spatial dimension as the feature map. - * Concatenate the outputs from the convolutions and global average pool, then process with a 1x1 convolution. - * The 3x3 convolutions are implemented as depth-wise convolutions to reduce memory and computation cost. -- **Decoder**: converts the output of spatial pyramid pooling (SPP) to class predictions of the same spatial dimension as the input image. - * SPP output is bilinearly upsampled to be the same spatial dimension as the output from the first stage in the backbone network. - * A 1x1 convolution is applied to the first stage activations, then this is concatenated with the upsampled SPP output. - * The concatenation is processed by a 3x3 convolution with dropout followed by a classification layer. - * The predictions are bilinearly upsampled to be the same resolution as the input image. - -## Training Hyperparameters - -We tested two sets of hyperparameters for DeepLabv3+ trained on the ADE20k dataset. - -### Typical ADE20k Model Hyperparameters - -- Model: deeplabv3: - - Initializers: kaiming_normal, bn_ones - - Number of classes: 150 - - Backbone weights: IMAGENET1K_V1 - - Sync BatchNorm -- Optimizer: SGD - - Learning rate: 0.01 - - Momentum: 0.9 - - Weight decay: 5.0e-4 - - Dampening: 0 - - Nesterov: false -- LR schedulers: - - Polynomial: - - Alpha_f: 0.01 - - Power: 0.9 -- Number of epochs: 127 -- Batch size: 16 -- Precision: amp - -| Model | mIoU | Time-to-Train on 8xA100 | -| --- | --- | --- | -| ResNet101-DeepLabv3+ | 44.17 +/- 0.17 | 6.385 hr | - -### Composer ADE20k Model Hyperparameters - -- Model: deeplabv3: - - Initializers: kaiming_normal, bn_ones - - Number of classes: 150 - - Backbone Architecture: resnet101 - - Sync BatchNorm - - Backbone weights: IMAGENET1K_V2 -- Optimizer: Decoupled SGDW - - Learning rate: 0.01 - - Momentum: 0.9 - - Weight decay: 2.0e-5 - - Dampening: 0 - - Nesterov: false -- LR schedulers: - - Cosine decay, t_max: 1dur -- Number of epochs: 128 -- Batch size: 32 -- Precision: amp - -| Model | mIoU | Time-to-Train on 8xA100 | -| --- | --- | --- | -| ResNet101-DeepLabv3+ | 45.764 +/- 0.29 | 4.67 hr | - -Improvements: - -- New PyTorch pretrained weights -- Cosine decay -- Decoupled Weight Decay -- Increase batch size to 32 -- Decrease weight decay to 2e-5 - -## Attribution - -[Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611) by Liang-Chieh Chen, Yukun Zhu, George Papandreou, Florian Schroff, Hartwig Adam - -[OpenMMLab Semantic Segmentation Toolbox and Benchmark](https://github.com/open-mmlab/mmsegmentation) - -[How to Train State-Of-The-Art Models Using TorchVision’s Latest Primitives](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/) by Vasilis
Vryniotis - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.deeplabv3.composer_deeplabv3 - :noindex: -``` diff --git a/composer/models/deeplabv3/__init__.py b/composer/models/deeplabv3/__init__.py deleted file mode 100644 index e3473a3015..0000000000 --- a/composer/models/deeplabv3/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""DeepLabV3 for image segmentation.""" -from composer.models.deeplabv3.model import composer_deeplabv3 as composer_deeplabv3 - -__all__ = ['composer_deeplabv3'] diff --git a/composer/models/deeplabv3/model.py b/composer/models/deeplabv3/model.py deleted file mode 100644 index 876604d3c5..0000000000 --- a/composer/models/deeplabv3/model.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""DeepLabV3 model extending :class:`.ComposerClassifier`.""" - -import functools -import textwrap -import warnings -from typing import Dict, Optional, Sequence - -import torch -import torch.distributed as torch_dist -import torch.nn.functional as F -import torchvision -from packaging import version -from torchmetrics import MetricCollection -from torchvision.models import _utils, resnet - -from composer.loss import DiceLoss, soft_cross_entropy -from composer.metrics import CrossEntropy, MIoU -from composer.models.initializers import Initializer -from composer.models.tasks import ComposerClassifier -from composer.utils import dist - -__all__ = ['deeplabv3', 'composer_deeplabv3'] - -_IMAGENET1K_V1_URL = 'https://download.pytorch.org/models/resnet101-63fe2227.pth' -_IMAGENET1K_V2_URL = 'https://download.pytorch.org/models/resnet101-cd907fc2.pth' - - -class SimpleSegmentationModel(torch.nn.Module): - - def __init__(self, backbone, classifier): - warnings.warn(DeprecationWarning('SimpleSegmentationModel is deprecated and will be removed in v0.18')) - - super().__init__() - self.backbone = backbone - self.classifier = classifier - - def forward(self, x): - input_shape = x.shape[-2:] - features = self.backbone(x) - logits = self.classifier(tuple(features.values())) - logits = F.interpolate(logits, - size=input_shape, - mode='bilinear', - align_corners=False, - recompute_scale_factor=False) - return logits - - -def deeplabv3(num_classes: int, - backbone_arch: str = 'resnet101', - backbone_weights: Optional[str] = None, - sync_bn: bool = True, - use_plus: bool = True, - initializers: Sequence[Initializer] = ()): - """Helper function to build a mmsegmentation DeepLabV3 model. - - Args: - num_classes (int): Number of classes in the segmentation task. - backbone_arch (str, optional): The architecture to use for the backbone. Must be either - [``'resnet50'``, ``'resnet101'``]. Default: ``'resnet101'``. - backbone_weights (str, optional): If specified, the PyTorch pre-trained weights to load for the backbone. - Currently, only ['IMAGENET1K_V1', 'IMAGENET1K_V2'] are supported. Default: ``None``. - sync_bn (bool, optional): If ``True``, replace all BatchNorm layers with SyncBatchNorm layers. - Default: ``True``. - use_plus (bool, optional): If ``True``, use DeepLabv3+ head instead of DeepLabv3. Default: ``True``. - initializers (Sequence[Initializer], optional): Initializers for the model. ``()`` for no initialization. - Default: ``()``. - - Returns: - deeplabv3: A DeepLabV3 :class:`torch.nn.Module`. - - Example: - - .. 
code-block:: python - - from composer.models.deeplabv3.deeplabv3 import deeplabv3 - - pytorch_model = deeplabv3(num_classes=150, backbone_arch='resnet101', backbone_weights=None) - """ - warnings.warn(DeprecationWarning('deeplabv3 is deprecated and will be removed in v0.18')) - - # check that the specified architecture is in the resnet module - if not hasattr(resnet, backbone_arch): - raise ValueError(f'backbone_arch must be part of the torchvision resnet module, got value: {backbone_arch}') - - # change the model weight url if specified - if version.parse(torchvision.__version__) < version.parse('0.13.0'): - pretrained = False - if backbone_weights: - pretrained = True - if backbone_weights == 'IMAGENET1K_V1': - resnet.model_urls[backbone_arch] = _IMAGENET1K_V1_URL # pyright: ignore[reportGeneralTypeIssues] - elif backbone_weights == 'IMAGENET1K_V2': - resnet.model_urls[backbone_arch] = _IMAGENET1K_V2_URL # pyright: ignore[reportGeneralTypeIssues] - else: - ValueError( - textwrap.dedent(f"""\ - `backbone_weights` must be either "IMAGENET1K_V1" or "IMAGENET1K_V2" - if torchvision.__version__ < 0.13.0. `backbone_weights` was {backbone_weights}.""")) - backbone = getattr(resnet, backbone_arch)(pretrained=pretrained, - replace_stride_with_dilation=[False, True, True]) - else: - backbone = getattr(resnet, backbone_arch)(weights=backbone_weights, - replace_stride_with_dilation=[False, True, True]) - - # specify which layers to extract activations from - return_layers = {'layer1': 'layer1', 'layer4': 'layer4'} if use_plus else {'layer4': 'layer4'} - backbone = _utils.IntermediateLayerGetter(backbone, return_layers=return_layers) - - try: - from mmseg.models import ASPPHead, DepthwiseSeparableASPPHead - except ImportError as e: - raise ImportError( - textwrap.dedent("""\ - Either mmcv or mmsegmentation is not installed. To install mmcv, please run pip install mmcv-full==1.4.4 -f - https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html where {cu_version} and - {torch_version} refer to your CUDA and PyTorch versions, respectively. To install mmsegmentation, please - run pip install mmsegmentation==0.22.0 on command-line.""")) from e - - world_size = dist.get_world_size() - if sync_bn and world_size == 1: - warnings.warn('sync_bn was true, but only one process is present for training. 
sync_bn will be ignored.') - - norm_type = 'SyncBN' if sync_bn and world_size > 1 else 'BN' - norm_cfg = {'type': norm_type, 'requires_grad': True} - if use_plus: - # mmseg config: - # https://github.com/open-mmlab/mmsegmentation/blob/master/configs/_base_/models/deeplabv3plus_r50-d8.py - head = DepthwiseSeparableASPPHead(in_channels=2048, - in_index=-1, - channels=512, - dilations=(1, 12, 24, 36), - c1_in_channels=256, - c1_channels=48, - dropout_ratio=0.1, - num_classes=num_classes, - norm_cfg=norm_cfg, - align_corners=False) - else: - # mmseg config: - # https://github.com/open-mmlab/mmsegmentation/blob/master/configs/_base_/models/deeplabv3_r50-d8.py - head = ASPPHead(in_channels=2048, - in_index=-1, - channels=512, - dilations=(1, 12, 24, 36), - dropout_ratio=0.1, - num_classes=num_classes, - norm_cfg=norm_cfg, - align_corners=False) - - model = SimpleSegmentationModel(backbone, head) - - if initializers: - for initializer in initializers: - initializer_fn = Initializer(initializer).get_initializer() - - # Only apply initialization to classifier head if pre-trained weights are used - if backbone_weights is None: - model.apply(initializer_fn) - else: - model.classifier.apply(initializer_fn) - - if sync_bn and world_size > 1: - local_world_size = dist.get_local_world_size() - - # List of ranks for each node, assumes that each node has the same number of ranks - num_nodes = world_size // local_world_size - process_group = None - if num_nodes > 1: - ranks_per_node = [ - list(range(node * local_world_size, (node + 1) * local_world_size)) for node in range(num_nodes) - ] - process_groups = [torch_dist.new_group(ranks) for ranks in ranks_per_node] - process_group = process_groups[dist.get_node_rank()] - - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=process_group) - - return model - - -def composer_deeplabv3(num_classes: int, - backbone_arch: str = 'resnet101', - backbone_weights: Optional[str] = None, - sync_bn: bool = True, - use_plus: bool = True, - ignore_index: int = -1, - cross_entropy_weight: float = 1.0, - dice_weight: float = 0.0, - initializers: Sequence[Initializer] = ()): - """Helper function to create a :class:`.ComposerClassifier` with a DeepLabv3(+) model. Logs - Mean Intersection over Union (MIoU) and Cross Entropy during training and validation. - - From `Rethinking Atrous Convolution for Semantic Image Segmentation `_ - (Chen et al, 2017). - - Args: - num_classes (int): Number of classes in the segmentation task. - backbone_arch (str, optional): The architecture to use for the backbone. Must be either - [``'resnet50'``, ``'resnet101'``]. Default: ``'resnet101'``. - backbone_weights (str, optional): If specified, the PyTorch pre-trained weights to load for the backbone. - Currently, only ['IMAGENET1K_V1', 'IMAGENET1K_V2'] are supported. Default: ``None``. - sync_bn (bool, optional): If ``True``, replace all BatchNorm layers with SyncBatchNorm layers. - Default: ``True``. - use_plus (bool, optional): If ``True``, use DeepLabv3+ head instead of DeepLabv3. Default: ``True``. - ignore_index (int): Class label to ignore when calculating the loss and other metrics. Default: ``-1``. - cross_entropy_weight (float): Weight to scale the cross entropy loss. Default: ``1.0``. - dice_weight (float): Weight to scale the dice loss. Default: ``0.0``. - initializers (List[Initializer], optional): Initializers for the model. ``[]`` for no initialization. - Default: ``[]``. 
- - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a DeepLabv3(+) model. - - Example: - - .. code-block:: python - - from composer.models import composer_deeplabv3 - - model = composer_deeplabv3(num_classes=150, backbone_arch='resnet101', backbone_weights=None) - """ - warnings.warn(DeprecationWarning('composer_deeplabv3 is deprecated and will be removed in v0.18')) - - model = deeplabv3(backbone_arch=backbone_arch, - backbone_weights=backbone_weights, - use_plus=use_plus, - num_classes=num_classes, - sync_bn=sync_bn, - initializers=initializers) - - train_metrics = MetricCollection( - [CrossEntropy(ignore_index=ignore_index), - MIoU(num_classes, ignore_index=ignore_index)]) - val_metrics = MetricCollection( - [CrossEntropy(ignore_index=ignore_index), - MIoU(num_classes, ignore_index=ignore_index)]) - - ce_loss_fn = functools.partial(soft_cross_entropy, ignore_index=ignore_index) - dice_loss_fn = DiceLoss(softmax=True, batch=True, ignore_absent_classes=True) - - def _combo_loss(output, target) -> Dict[str, torch.Tensor]: - loss = {'total': torch.zeros(1, device=output.device, dtype=output.dtype)} - if cross_entropy_weight: - loss['cross_entropy'] = ce_loss_fn(output, target) - loss['total'] += loss['cross_entropy'] * cross_entropy_weight - if dice_weight: - loss['dice'] = dice_loss_fn(output, target) - loss['total'] += loss['dice'] * dice_weight - return loss - - composer_model = ComposerClassifier(module=model, - train_metrics=train_metrics, - val_metrics=val_metrics, - loss_fn=_combo_loss) - return composer_model diff --git a/composer/models/efficientnetb0/README.md b/composer/models/efficientnetb0/README.md deleted file mode 100644 index 9cb1096bc6..0000000000 --- a/composer/models/efficientnetb0/README.md +++ /dev/null @@ -1,78 +0,0 @@ -# EfficientNet -[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`Vision` /`Image Classification` - -The EfficientNet model family is a set of convolutional neural networks that can be used as the basis for a variety of vision tasks, but were initially designed for image classification. The model family was designed to reach the highest accuracy for a given computation budget during inference by simultaneously scaling model depth, model width, and image resolution according to an empirically determined scaling law. - -## Example - -```python -from composer.models import composer_efficientnetb0 - -model = composer_efficientnetb0(num_classes=1000, drop_connect_rate=0.2) -``` - -## Architecture - -The table below from Tan and Le specifies the EfficientNet baseline architecture broken up into separate stages. MBConv indicates a mobile inverted bottleneck with a specific expansion size and kernel size. Resolution is the expected input resolution of the current stage. Number of channels is the number of output channels of the current stage. Number of layers indicates the number of repeated blocks in each stage. Subsequent EfficientNet family members scale the resolution, number of channels, and number of layers according to the resolution, width, and depth scaling parameters defined by Tan and Le. - -![efficientnet_arch.png](https://storage.googleapis.com/docs.mosaicml.com/images/models/efficientnet_arch.png) - -## Family members - -Tan and Le included 8 members in their model family. 
The goal was for each family member to have approximately double the FLOPs of the previous family member. Currently, we only support EfficientNet-B0. - -| Model Family Member | Parameter Count | TPU Repo Accuracy* | Our Accuracy** | Training Time on 8x3080 | -|---------------------|-----------------|--------------------|----------------|-------------------------| -| EfficientNet-B0 | 5.3M | 77.1% | 77.22% | 23.3 hr | -| EfficientNet-B1 | 7.8M | 79.1% | TBA | TBA | -| EfficientNet-B2 | 9.2M | 80.1% | TBA | TBA | -| EfficientNet-B3 | 12M | 81.6% | TBA | TBA | -| EfficientNet-B4 | 19M | 82.9% | TBA | TBA | -| EfficientNet-B5 | 30M | 83.6% | TBA | TBA | -| EfficientNet-B6 | 43M | 84.0% | TBA | TBA | -| EfficientNet-B7 | 66M | 84.3% | TBA | TBA | - -*Includes label smoothing, sample-wise stochastic depth, and AutoAugment - -**Includes label smoothing and sample-wise stochastic depth - -## Default Training Hyperparameters - -We use the following default hyperparameters from the [Nvidia Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet): - -```yaml -optimizer: - rmsprop: - lr: 0.08 - momentum: 0.9 - alpha: 0.9 - eps: 0.01 - weight_decay: 1.0e-5 -schedulers: - - cosine_decay_with_warmup: - t_warmup: "16ep" -train_batch_size: 4096 -max_duration: 400ep -``` - -Our implementation differs from the [Nvidia Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet) in that we: - -- Apply weight decay to batch normalization trainable parameters -- Use `momentum = 0.1` and `eps = 1e-5` as batch normalization parameters - -## Attribution - -Paper: [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le - -Code: [gen-efficientnet-pytorch Github repository](https://github.com/rwightman/gen-efficientnet-pytorch) by Ross Wightman - -Hyperparameters: [DeepLearningExamples Github repository](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet) by Nvidia - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.efficientnetb0.composer_efficientnetb0 - :noindex: -``` diff --git a/composer/models/efficientnetb0/__init__.py b/composer/models/efficientnetb0/__init__.py deleted file mode 100644 index d1101f595c..0000000000 --- a/composer/models/efficientnetb0/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The EfficientNet model family is a set of convolutional neural networks that can be used as the basis for a variety -of vision tasks, but were initially designed for image classification. The model family was designed to reach the -highest accuracy for a given computation budget during inference by simultaneously scaling model depth, model width, and -image resolution according to an empirically determined scaling law. - -See the :doc:`Model Card ` for more details. 
-""" -from composer.models.efficientnetb0.model import composer_efficientnetb0 as composer_efficientnetb0 - -__all__ = ['composer_efficientnetb0'] - -_task = 'Image Classification' -_dataset = 'ImageNet' -_name = 'EfficientNet-B0' -_quality = '76.63' -_metric = 'Top-1 Accuracy' -_ttt = '21h 48m' -_hparams = 'efficientnetb0.yaml' diff --git a/composer/models/efficientnetb0/_layers.py b/composer/models/efficientnetb0/_layers.py deleted file mode 100644 index 1dbf62450d..0000000000 --- a/composer/models/efficientnetb0/_layers.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -from typing import Callable, Optional - -import torch -from torch import nn as nn - - -def round_channels( - channels: float, - width_multiplier: float, - divisor: int = 8, - min_value: Optional[int] = None, -) -> int: - """Round number of channels after scaling with width multiplier. - - This function ensures that channel integers halfway in-between divisors is rounded up. - - Args: - channels (float): Number to round. - width_multiplier (float): Amount to scale `channels`. - divisor (int): Number to make the output divisible by. - min_value (int, optional): Minimum value the output can be. If not specified, defaults - to the ``divisor``. - """ - if not width_multiplier: - return int(channels) - channels *= width_multiplier - - min_value = min_value or divisor - new_channels = max(min_value, int(channels + divisor / 2) // divisor * divisor) - if new_channels < 0.9 * channels: # increase channels if rounding decreases by >10% - new_channels += divisor - return new_channels - - -def calculate_same_padding(kernel_size, dilation, stride): - """Calculates the amount of padding to use to get the "SAME" functionality in Tensorflow.""" - return ((stride - 1) + dilation * (kernel_size - 1)) // 2 - - -def drop_connect(inputs: torch.Tensor, drop_connect_rate: float, training: bool): - """Randomly mask a set of samples. Provides similar regularization as stochastic depth. - - Args: - input (torch.Tensor): Input tensor to mask. - drop_connect_rate (float): Probability of droppping each sample. - training (bool): Whether or not the model is training - """ - if not training: - return inputs - - keep_prob = 1 - drop_connect_rate - rand_tensor = keep_prob + torch.rand( - [inputs.size()[0], 1, 1, 1], - dtype=inputs.dtype, - device=inputs.device, - ) - rand_tensor.floor_() # binarize - output = inputs.div(keep_prob) * rand_tensor - return output - - -class SqueezeExcite(nn.Module): - """Squeeze Excite Layer. - - Args: - in_channels (int): Number of channels in the input tensor. - latent_channels (int): Number of hidden channels. - act_layer (torch.nn.Module): Activation layer to use in block. - """ - - def __init__( - self, - in_channels: int, - latent_channels: int, - act_layer: Callable[..., nn.Module] = nn.ReLU, - ): - super().__init__() - - self.global_avg_pool = nn.AdaptiveAvgPool2d(1) - self.conv_reduce = nn.Conv2d(in_channels, latent_channels, kernel_size=1, bias=True) - self.act1 = act_layer(inplace=True) - self.conv_expand = nn.Conv2d(latent_channels, in_channels, kernel_size=1, bias=True) - self.gate_fn = torch.nn.Sigmoid() - - def forward(self, x: torch.Tensor): - out = self.global_avg_pool(x) - out = self.conv_reduce(out) - out = self.act1(out) - out = self.conv_expand(out) - out = x * self.gate_fn(out) - return out - - -class DepthwiseSeparableConv(nn.Module): - """Depthwise Separable Convolution layer. 
- - Args: - in_channels (int): Number of channels in the input tensor. - out_channels (int): Number of channels in the output tensor. - kernel_size (int): Size of the convolving kernel. - stride (int): Stride of the convolution. - se_ratio (float): How much to scale `in_channels` for the hidden layer - dimensionality of the squeeze-excite module. - drop_connect_rate (float): Probability of dropping a sample before the - identity connection, provides regularization similar to stochastic - depth. - act_layer (torch.nn.Module): Activation layer to use in block. - norm_kwargs (dict): Normalization layer's keyword arguments. - norm_layer (torch.nn.Module): Normalization layer to use in block. - """ - - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: int, - stride: int, - se_ratio: float, - drop_connect_rate: float, - act_layer: Callable[..., nn.Module], - norm_kwargs: dict, - norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d): - super().__init__() - self.drop_connect_rate = drop_connect_rate - self.has_residual = (in_channels == out_channels and stride == 1) - self.has_se = se_ratio > 0.0 - - padding = calculate_same_padding(kernel_size, dilation=1, stride=stride) - self.conv_depthwise = nn.Conv2d(in_channels=in_channels, - out_channels=in_channels, - groups=in_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=False) - self.bn1 = norm_layer(in_channels, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - if self.has_se: - latent_channels = max(1, int(in_channels * se_ratio)) - self.se = SqueezeExcite(in_channels, latent_channels, act_layer) - - self.conv_pointwise = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=1, - bias=False, - ) - self.bn2 = norm_layer(out_channels, **norm_kwargs) - self.act2 = act_layer(inplace=True) - - def forward(self, input: torch.Tensor): - residual = input - - out = self.conv_depthwise(input) - out = self.bn1(out) - out = self.act1(out) - - if self.has_se: - out = self.se(out) - - out = self.conv_pointwise(out) - out = self.bn2(out) - out = self.act2(out) - - if self.has_residual: - if self.drop_connect_rate > 0.0: - out = drop_connect(out, self.drop_connect_rate, self.training) - out += residual - return out - - -class MBConvBlock(nn.Module): - """Mobile Inverted Residual Bottleneck Block. - - This block is implemented as as defined in - `MobileNetV2: Inverted Residuals and Linear Bottlenecks `_ (Sandler et al, 2018). - - Args: - in_channels (int): Number of channels in the input tensor. - out_channels (int): Number of channels in the output tensor. - kernel_size (int): Size of the convolving kernel. - stride (int): Stride of the convolution. - expand_ratio (int): How much to expand the input channels for the - depthwise convolution. - se_ratio (float): How much to scale `in_channels` for the hidden layer - dimensionality of the squeeze-excite module. - drop_connect_rate (float): Probability of dropping a sample before the - identity connection, provides regularization similar to stochastic - depth. - act_layer (torch.nn.Module): Activation layer to use in block. - norm_kwargs (dict): Normalization layer's keyword arguments. - norm_layer (torch.nn.Module): Normalization layer to use in block. 
- """ - - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: int, - stride: int, - expand_ratio: int, - se_ratio: float, - drop_connect_rate: float, - act_layer: Callable[..., nn.Module], - norm_kwargs: dict, - norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d): - super().__init__() - self.drop_connect_rate = drop_connect_rate - self.has_residual = (in_channels == out_channels and stride == 1) - self.has_se = se_ratio > 0.0 - - mid_channels = round_channels(in_channels, expand_ratio) - - # Point-wise convolution expansion - self.conv1x1_expand = nn.Conv2d(in_channels, mid_channels, kernel_size=1, bias=False) - self.bn1 = norm_layer(mid_channels, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Depth-wise Convolution - padding = calculate_same_padding(kernel_size, dilation=1, stride=stride) - self.conv_depthwise = nn.Conv2d(in_channels=mid_channels, - out_channels=mid_channels, - groups=mid_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=False) - self.bn2 = norm_layer(mid_channels, **norm_kwargs) - self.act2 = act_layer(inplace=True) - - # Squeeze and Excitation layer, if specified - if self.has_se: - latent_channels = max(1, int(in_channels * se_ratio)) - self.se = SqueezeExcite(mid_channels, latent_channels, act_layer) - - # Point-wise convolution contraction - self.conv1x1_contract = nn.Conv2d(mid_channels, out_channels, kernel_size=1, bias=False) - self.bn3 = norm_layer(out_channels, **norm_kwargs) - - def forward(self, input: torch.Tensor): - residual = input - - out = self.conv1x1_expand(input) - out = self.bn1(out) - out = self.act1(out) - - out = self.conv_depthwise(out) - out = self.bn2(out) - out = self.act2(out) - - if self.has_se: - out = self.se(out) - - out = self.conv1x1_contract(out) - out = self.bn3(out) - - if self.has_residual: - if self.drop_connect_rate: - out = drop_connect(out, self.drop_connect_rate, self.training) - out += residual - return out diff --git a/composer/models/efficientnetb0/efficientnets.py b/composer/models/efficientnetb0/efficientnets.py deleted file mode 100644 index 7c544a5143..0000000000 --- a/composer/models/efficientnetb0/efficientnets.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""EfficientNet model. - -Adapted from `(Generic) EfficientNets for PyTorch. `_. -""" - -import math -import re -import warnings -from typing import Callable, Optional - -import torch -import torch.nn as nn - -from composer.models.efficientnetb0._layers import (DepthwiseSeparableConv, MBConvBlock, calculate_same_padding, - round_channels) - -__all__ = ['EfficientNet'] - - -class EfficientNet(nn.Module): - """EfficientNet model based on (`Tan et al, 2019 `_). - - Args: - num_classes (int): Size of the EfficientNet output, typically viewed - as the number of classes in a classification task. - width_multiplier (float, optional): How much to scale the EfficientNet-B0 channel - dimension throughout the model. Default: ``1.0``. - depth_multiplier (float, optional): How much to scale the EFficientNet-B0 depth. Default: ``1.0``. - drop_rate (float, optional): Dropout probability for the penultimate activations. Default: ``0.2``. - drop_connect_rate (float, optional): Probability of dropping a sample before the - identity connection, provides regularization similar to stochastic - depth. Default: ``0.2``. - act_layer (torch.nn.Module, optional): Activation layer to use in the model. Default: ``nn.SiLU``. 
- norm_kwargs (dict, optional): Normalization layer's keyword arguments. Default: ``{"momentum": 0.1, "eps": 1e-5}``. - norm_layer (torch.nn.Module, optional): Normalization layer to use in the model. Default: ``nn.BatchNorm2d``. - """ - - # EfficientNet-B0 architecture specification. - # block_strings are decoded into block level hyperparameters. - # r=repeat, k=kernel_size, s=stride, e=expand_ratio, i=in_channels, o=out_channels, se=se_ratio. - _blocks_strings = [ - 'r1_k3_s1_e1_i32_o16_se0.25', - 'r2_k3_s2_e6_i16_o24_se0.25', - 'r2_k5_s2_e6_i24_o40_se0.25', - 'r3_k3_s2_e6_i40_o80_se0.25', - 'r3_k5_s1_e6_i80_o112_se0.25', - 'r4_k5_s2_e6_i112_o192_se0.25', - 'r1_k3_s1_e6_i192_o320_se0.25', - ] - - def __init__(self, - num_classes: int, - width_multiplier: float = 1.0, - depth_multiplier: float = 1.0, - drop_rate: float = 0.2, - drop_connect_rate: float = 0.2, - act_layer: Callable[..., nn.Module] = nn.SiLU, - norm_kwargs: Optional[dict] = None, - norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d): - warnings.warn(DeprecationWarning('EfficientNet is deprecated and will be removed in v0.18')) - - super(EfficientNet, self).__init__() - self.num_classes = num_classes - - if norm_kwargs is None: - norm_kwargs = {'momentum': 0.1, 'eps': 1e-5} - - in_channels = 3 - out_channels = round_channels(32, width_multiplier) - padding = calculate_same_padding(kernel_size=3, dilation=1, stride=2) - self.conv_stem = nn.Conv2d( - in_channels, - out_channels, - kernel_size=3, - stride=2, - padding=padding, - bias=False, - ) - self.bn1 = norm_layer(num_features=out_channels, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Count the number of blocks in the model - block_count = 0. - for block_string in self._blocks_strings: - _, num_repeat = self._decode_block_string(block_string) - block_count += num_repeat - - # Decode block strings and add blocks - block_idx = 0. 
- blocks = [] - block_args = {} - for block_string in self._blocks_strings: - block_args, num_repeat = self._decode_block_string(block_string) - # Scale channels and number of repeated blocks based on multipliers - block_args['in_channels'] = round_channels( - block_args['in_channels'], - width_multiplier, - ) - block_args['out_channels'] = round_channels( - block_args['out_channels'], - width_multiplier, - ) - num_repeat = int(math.ceil(depth_multiplier * num_repeat)) - - # Add activation, normalization layers, and drop connect - block_args['act_layer'] = act_layer - block_args['norm_kwargs'] = norm_kwargs - block_args['norm_layer'] = norm_layer - - # Delete expand_ratio when set to 1 to use depthwise separable convolution layer - if block_args['expand_ratio'] == 1: - del block_args['expand_ratio'] - - for i in range(num_repeat): - # Linearly decay drop_connect_rate across model depth - block_args['drop_connect_rate'] = drop_connect_rate * block_idx / block_count - - if 'expand_ratio' not in block_args: - blocks.append(DepthwiseSeparableConv(**block_args)) - else: - blocks.append(MBConvBlock(**block_args)) - block_idx += 1 - - # Only the first block in a stage can have stride != 1 - if i == 0: - block_args['stride'] = 1 - block_args['in_channels'] = block_args['out_channels'] - - self.blocks = nn.Sequential(*blocks) - - in_channels = block_args['out_channels'] - out_channels = round_channels(1280, width_multiplier) - self.conv_head = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False) - self.bn2 = norm_layer(out_channels, **norm_kwargs) - self.act2 = act_layer(inplace=True) - - self.global_avg_pool = nn.AdaptiveAvgPool2d(1) - self.dropout = nn.Dropout(drop_rate) - self.classifier = nn.Linear(out_channels, num_classes) - - # Initialization from gen-efficientnet-pytorch repo - for m in self.modules(): - if isinstance(m, torch.nn.Conv2d): - fan_out = (m.kernel_size[0] * m.kernel_size[1] * m.out_channels) // m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() - elif isinstance(m, torch.nn.BatchNorm2d): - m.weight.data.fill_(1.0) - m.bias.data.zero_() - elif isinstance(m, torch.nn.Linear): - fan_out = m.weight.size(0) - init_range = 1.0 / math.sqrt(fan_out) - m.weight.data.uniform_(-init_range, init_range) - m.bias.data.zero_() - - def extract_features(self, input: torch.Tensor): - out = self.conv_stem(input) - out = self.bn1(out) - out = self.act1(out) - out = self.blocks(out) - out = self.conv_head(out) - out = self.bn2(out) - out = self.act2(out) - out = self.global_avg_pool(out) - return out.flatten(1) - - def forward(self, input: torch.Tensor): - out = self.extract_features(input) - out = self.dropout(out) - return self.classifier(out) - - @staticmethod - def get_model_from_name(model_name: str, num_classes, drop_connect_rate: float): - """Instantiate an EfficientNet model family member based on the model_name string. - - Args: - model_name: (str): One of ``'efficientnet-b0'`` through ``'efficientnet-b7'``. - num_classes (int): Size of the EfficientNet output, typically viewed as the number of classes in a classification task. - drop_connect_rate (float): Probability of dropping a sample before the identity connection, - provides regularization similar to stochastic depth. 
- """ - - # Coefficients: width, depth, res, dropout - model_arch = { - 'efficientnet-b0': (1.0, 1.0, 224, 0.2), - 'efficientnet-b1': (1.0, 1.1, 240, 0.2), - 'efficientnet-b2': (1.1, 1.2, 260, 0.3), - 'efficientnet-b3': (1.2, 1.4, 300, 0.3), - 'efficientnet-b4': (1.4, 1.8, 380, 0.4), - 'efficientnet-b5': (1.6, 2.2, 456, 0.4), - 'efficientnet-b6': (1.8, 2.6, 528, 0.5), - 'efficientnet-b7': (2.0, 3.1, 600, 0.5), - } - - model_params = model_arch[model_name] - width_multiplier = model_params[0] - depth_multiplier = model_params[1] - drop_rate = model_params[3] - return EfficientNet(num_classes=num_classes, - width_multiplier=width_multiplier, - depth_multiplier=depth_multiplier, - drop_rate=drop_rate, - drop_connect_rate=drop_connect_rate) - - def _decode_block_string(self, block_string: str): - """Decodes an EfficientNet block specification string into a dictionary of keyword arguments for a block in the - architecture.""" - - arg_strings = block_string.split('_') - args = {} - for arg_string in arg_strings: - splits = re.split(r'(\d.*)', arg_string) - if len(splits) >= 2: - key, value = splits[:2] - args[key] = value - num_repeat = int(args['r']) - block_args = { - 'kernel_size': int(args['k']), - 'stride': int(args['s']), - 'expand_ratio': int(args['e']), - 'in_channels': int(args['i']), - 'out_channels': int(args['o']), - 'se_ratio': float(args['se']) if 'se' in args else None, - } - return block_args, num_repeat diff --git a/composer/models/efficientnetb0/model.py b/composer/models/efficientnetb0/model.py deleted file mode 100644 index 67ae193895..0000000000 --- a/composer/models/efficientnetb0/model.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A :class:`.ComposerClassifier` wrapper around the EfficientNet-b0 architecture.""" - -import warnings - -from composer.models.efficientnetb0.efficientnets import EfficientNet -from composer.models.tasks import ComposerClassifier - -__all__ = ['composer_efficientnetb0'] - - -def composer_efficientnetb0(num_classes: int = 1000, drop_connect_rate: float = 0.2) -> ComposerClassifier: - """Helper function to create a :class:`.ComposerClassifier` with an EfficientNet-b0 architecture. - - See `Rethinking Model Scaling for Convolutional Neural Networks `_ - (Tan et al, 2019) for more details. - - Args: - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``. - drop_connect_rate (float, optional): Probability of dropping a sample within a block before identity - connection. Default: ``0.2``. - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a EfficientNet-B0 model. - - - Example: - - .. 
testcode:: - - from composer.models import composer_efficientnetb0 - - model = composer_efficientnetb0() # creates EfficientNet-b0 for image classification - """ - warnings.warn(DeprecationWarning('composer_efficientnetb0 is deprecated and will be removed in v0.18')) - model = EfficientNet.get_model_from_name(model_name='efficientnet-b0', - num_classes=num_classes, - drop_connect_rate=drop_connect_rate) - - composer_model = ComposerClassifier(module=model) - return composer_model diff --git a/composer/models/gpt2/README.md b/composer/models/gpt2/README.md deleted file mode 100644 index 52ee26a97f..0000000000 --- a/composer/models/gpt2/README.md +++ /dev/null @@ -1,81 +0,0 @@ -# GPT-2 -[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`NLP` / ``Autoregressive Language Modeling`` - -The GPT-2 model family is set of transformer-based networks for autoregressive language modeling at various scales. This family was originally proposed by OpenAI, and is trained on the OpenWebText dataset. It is useful for downstream language generation tasks, such as summarization, translation, and dialog. - -Our codebase builds off of the Hugging Face *[Transformers](https://huggingface.co/transformers/)* library. We initialize Huggingface's GPT-2 model with one of our configurations. - -## Example - - - -```python -import transformers -from composer.models import GPT2Model - -model = GPT2Model(module=transformers.AutoModelForCausalLM.from_pretrained("gpt2"), - config=transformers.GPT2Config.from_pretrained("gpt2"), - tokenizer_name="gpt2") -``` - -## Architecture - -GPT-2 consists of a a decoder-only Transformer parameterized by $n_{layer}$, $d_{model}$, $d_{ff}$, $d_{attn}$ and $n_{heads}$. The parameters for each model family member can be seen below: - -| Name | $n_{layer}$ | $d_{model}$ | $d_{ff}$ | $d_{attn}$ | $n_{heads}$ | -|------------|-------------|-------------|----------|------------|-------------| -| GPT-2 52M | 8 | 512 | 2048 | 8 | 8 | -| GPT-2 83M | 10 | 640 | 2560 | 640 | 10 | -| GPT-2 125M | 12 | 768 | 3072 | 768 | 12 | - -## Family Members - -We implement three members of this family at different scales: GPT 52M, GPT 83M, and GPT 125M. These models are named after their parameter counts. We selected these particular configurations because (1) they represent points on the pareto frontier of the scaling law for language models as described by [Kaplan et al. at OpenAI](https://arxiv.org/abs/2001.08361) and (2) they are small enough to rapidly iterate on methods using a single GPU node. - -| Model Family Member | Parameters | Training Hours on 8xA100s | Training Tokens | Final Loss | Predicted Perplexity | Actual Perplexity | -|---------------------|------------|---------------------------|-----------------|------------|----------------------|-------------------| -| GPT-2 52M | 53.9M | 02:44 | 4.6B | 3.43 | 32.54 | 30.88 | -| GPT-2 83M | 85.8M | 04:52 | 5.5B | 3.28 | 27.84 | 26.57 | -| GPT-2 125M | 114M | 08:25 | 6.7B | 3.18 | 24.64 | 24.04 | - - -There are two ways of varying the amount of time necessary to train a model or the cost necessary to do so: varying the size of the model or varying the number of steps (and therefore data) for which the model is trained. With the GPT family of models, we explore both of these axes. 
To develop methods for these models, we generally begin with the smallest members of this model family for initial experimentation and scale up once the ideas have been refined. - -To explore tradeoffs between quality and number of training steps: we have ablated both number of training steps, and number of data points to train on. We do this by checkpointing the model throughout training. - -To explore tradeoffs between quality and the size of the model, we use [Scaling Laws for Neural Language Models](https://arxiv.org/abs/2001.08361) to provide suggestions on model capacity and dataset size, and then sweep hyperparameters such as learning rate and batch size to minimize loss. - - -## Attribution - -The GPT model family is described in *[Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)* by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. - -The Scaling Law that we use to choose the members of this model family are described in *[Scaling Laws for Neural Language Models](https://arxiv.org/abs/2001.08361)* by Jared Kaplan, Sam McCandish, Tom Henighan, Tom B. Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. - -## Default Training Hyperparameters - -Below are hyperparameters we used to train GPT-2 125M on [OpenWebText](https://huggingface.co/datasets/openwebtext). - -```yaml -optimizer: - adamw: - lr: 6.0e-4 - betas: - - 0.9 - - 0.999 - eps: 1.0e-08 - weight_decay: 0.0 -schedulers: - - cosine_decay_with_warmup: - t_warmup: 140ba -train_batch_size: 512 -``` - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.gpt2.GPT2Model - :noindex: -``` diff --git a/composer/models/gpt2/__init__.py b/composer/models/gpt2/__init__.py deleted file mode 100644 index 1ae37b122a..0000000000 --- a/composer/models/gpt2/__init__.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The GPT-2 model family is set of transformer-based networks for autoregressive language modeling at various scales. -This family was originally proposed by OpenAI, and is trained on the OpenWebText dataset. It is useful for downstream -language generation tasks, such as summarization, translation, and dialog. - -See the :doc:`Model Card ` for more details. -""" - -from composer.models.gpt2.model import create_gpt2 as create_gpt2 - -__all__ = ['create_gpt2'] - -_metadata = { - 'gpt2': { - '_task': 'Language Modeling', - '_dataset': 'OpenWebText', - '_name': 'GPT-2 52M', - '_quality': '30.88', - '_metric': 'Perplexity', - '_ttt': '02:44', - '_hparams': 'gpt2_52m.yaml' - }, - 'gpt2 -- TODO RENAME TO GPT2': { - '_task': 'Language Modeling', - '_dataset': 'OpenWebText', - '_name': 'GPT-2 83M', - '_quality': '26.57', - '_metric': 'Perplexity', - '_ttt': '04:52', - '_hparams': 'gpt2_83m.yaml' - }, - 'gpt2 --! TODO RENAME TO GPT2': { - '_task': 'Language Modeling', - '_dataset': 'OpenWebText', - '_name': 'GPT-2 125M', - '_quality': '24.04', - '_metric': 'Perplexity', - '_ttt': '08:25', - '_hparams': 'gpt2_125m.yaml' - } -} diff --git a/composer/models/gpt2/model.py b/composer/models/gpt2/model.py deleted file mode 100644 index ea924b7b99..0000000000 --- a/composer/models/gpt2/model.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""GPT-2 model based on `Hugging Face GPT-2 `_. 
- -Implemented as a wrapper using :class:`.ComposerTrainer`. -""" - -from __future__ import annotations - -import warnings -from typing import Optional - -from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity -from composer.models.huggingface import HuggingFaceModel -from composer.utils.import_helpers import MissingConditionalImportError - -__all__ = ['create_gpt2'] - - -def create_gpt2(use_pretrained: Optional[bool] = False, - pretrained_model_name: Optional[str] = None, - model_config: Optional[dict] = None, - tokenizer_name: Optional[str] = None, - gradient_checkpointing: Optional[bool] = False): - """Implements :class:`~composer.models.huggingface.HuggingFaceModel` to wrap `Hugging Face GPT-2 \ - transformers `_. Logs training and - validation perplexity. - - From `Language Models are Unsupervised Multitask Learners `_ (Radford et al, 2018). - - Args: - - gradient_checkpointing (bool, optional): Use gradient checkpointing. Default: ``False``. - use_pretrained (bool, optional): Whether to initialize the model with the pretrained weights. Default: ``False``. - model_config (dict): A dictionary providing a HuggingFace model configuration. - tokenizer_name (str, optional): Tokenizer name used to preprocess the dataset - and validate the models inputs. - - .. code-block:: - - { - "_name_or_path": "gpt2", - "activation_function": "gelu_new", - "architectures": ["GPT2LMHeadModel"], - "attn_pdrop": 0.1, - "bos_token_id": 50256, - "embd_pdrop": 0.1, - "eos_token_id": 50256, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, - "n_head": 12, - "n_inner": null, - "n_layer": 12, - "n_positions": 1024, - "reorder_and_upcast_attn": false, - "resid_pdrop": 0.1, - "scale_attn_by_inverse_layer_idx": false, - "scale_attn_weights": true, - "summary_activation": null, - "summary_first_dropout": 0.1, - "summary_proj_to_labels": true, - "summary_type": "cls_index", - "summary_use_proj": true, - "task_specific_params": { - "text-generation": { - "do_sample": true, - "max_length": 50 } - }, - "transformers_version": "4.16.0", - "use_cache": true, - "vocab_size": 50257 - } - - To create a GPT-2 model for language modeling pretraining: - - .. 
testcode:: - - from composer.models import create_gpt2 - - composer_model = create_gpt2() - - """ - warnings.warn(DeprecationWarning('create_gpt2 is deprecated and will be removed in v0.18')) - - try: - import transformers - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='nlp', conda_package='transformers') from e - - if not model_config: - model_config = {} - - if not pretrained_model_name: - pretrained_model_name = 'gpt2' - - if use_pretrained: - assert transformers.AutoModelForCausalLM.from_pretrained is not None, 'AutoModelForCausalLM has from_pretrained method' - model = transformers.AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, - **model_config) - else: - config = transformers.AutoConfig.from_pretrained(pretrained_model_name, **model_config) - assert transformers.AutoModelForCausalLM.from_config is not None, 'AutoModelForCausalLM has from_config method' - model = transformers.AutoModelForCausalLM.from_config(config) - - if gradient_checkpointing: - model.gradient_checkpointing_enable() # type: ignore - - # setup the tokenizer - if tokenizer_name: - tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name) - else: - tokenizer = None - - return HuggingFaceModel(model=model, - tokenizer=tokenizer, - metrics=[LanguageCrossEntropy(), LanguagePerplexity()], - use_logits=True) diff --git a/composer/models/mmdetection.py b/composer/models/mmdetection.py deleted file mode 100644 index 2e53aac543..0000000000 --- a/composer/models/mmdetection.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A wrapper class that converts mmdet detection models to composer models""" - -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING, Any, List, Optional - -import numpy as np -import torch -from torchmetrics import Metric -from torchmetrics.collections import MetricCollection - -from composer.models import ComposerModel - -if TYPE_CHECKING: - import mmdet - -__all__ = ['MMDetModel'] - - -class MMDetModel(ComposerModel): - """A wrapper class that adapts mmdetection detectors to composer models. - - Args: - model (mmdet.models.detectors.BaseDetector): An MMdetection Detector. - metrics (list[Metric], optional): list of torchmetrics to apply to the output of `eval_forward`. Default: ``None``. - - .. warning:: This wrapper is designed to work with mmdet datasets. - - Example: - - .. 
code-block:: python - - from mmdet.models import build_model - from mmcv import ConfigDict - from composer.models import MMDetModel - - yolox_s_config = dict( - type='YOLOX', - input_size=(640, 640), - random_size_range=(15, 25), - random_size_interval=10, - backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5), - neck=dict(type='YOLOXPAFPN', in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1), - bbox_head=dict(type='YOLOXHead', num_classes=num_classes, in_channels=128, feat_channels=128), - train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), - test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65))) - yolox = build_model(ConfigDict(yolox_s_config)) - yolox.init_weights() - model = MMDetModel(yolox) - """ - - def __init__( - self, - model: mmdet.models.detectors.BaseDetector, # type: ignore - metrics: Optional[List[Metric]] = None) -> None: - warnings.warn(DeprecationWarning('MMDetModel is deprecated and will be removed in v0.18')) - super().__init__() - self.model = model - - self.train_metrics = None - self.val_metrics = None - - if metrics: - metric_collection = MetricCollection(metrics) - self.train_metrics = metric_collection.clone(prefix='train_') - self.val_metrics = metric_collection.clone(prefix='val_') - - def forward(self, batch): - # this will return a dictionary of losses in train mode and model outputs in test mode. - return self.model(**batch) - - def loss(self, outputs, batch, **kwargs): - return outputs - - def eval_forward(self, batch, outputs: Optional[Any] = None): - """ - Args: - batch (dict): a eval batch of the format: - - - ``img`` (List[torch.Tensor]): list of image torch.Tensors of shape (batch, c, h , w). - - - ``img_metas`` (List[Dict]): (1, batch_size) list of ``image_meta`` dicts. - Returns: model predictions: A batch_size length list of dictionaries containg detection boxes in (x,y, x2, y2) format, class labels, and class probabilities. 
- """ - device = batch['img'][0].device - batch.pop('gt_labels') - batch.pop('gt_bboxes') - results = self.model(return_loss=False, rescale=True, **batch) # models behave differently in eval mode - - # outputs are a list of bbox results (x, y, x2, y2, score) - # pack mmdet bounding boxes and labels into the format for torchmetrics MAP expects - preds = [] - for bbox_result in results: - boxes_scores = np.vstack(bbox_result) - boxes, scores = torch.from_numpy(boxes_scores[..., :-1]).to(device), torch.from_numpy( - boxes_scores[..., -1]).to(device) - labels = [np.full(result.shape[0], i, dtype=np.int32) for i, result in enumerate(bbox_result)] - pred = { - 'labels': torch.from_numpy(np.concatenate(labels)).to(device).long(), - 'boxes': boxes.float(), - 'scores': scores.float() - } - preds.append(pred) - return preds - - def get_metrics(self, is_train: bool = False): - if is_train: - metrics = self.train_metrics - else: - metrics = self.val_metrics - return metrics if metrics else {} - - def update_metric(self, batch: Any, outputs: Any, metric: Metric): - targets_box = batch.pop('gt_bboxes')[0] - targets_cls = batch.pop('gt_labels')[0] - targets = [] - for i in range(len(targets_box)): - t = {'boxes': targets_box[i], 'labels': targets_cls[i]} - targets.append(t) - metric.update(outputs, targets) diff --git a/composer/models/resnet/README.md b/composer/models/resnet/README.md deleted file mode 100644 index 430dd303b4..0000000000 --- a/composer/models/resnet/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# 🏙️ ResNet -[\[How to Use\]](#how-to-use) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`Vision` / `Image Classification` - -The ResNet model family is a set of convolutional neural networks that can be used as a basis for a variety of vision tasks. Our implementation is a simple wrapper on top of the [torchvision ResNet implementation](https://pytorch.org/vision/stable/models.html). - -## How to Use - -```python -from composer.models import composer_resnet - -model = composer_resnet( - model_name="resnet50", - num_classes=1000, - weights=None -) -``` - -## Architecture - -The basic architecture defined in the original papers is as follows: - -- The first layer is a 7x7 Convolution with stride 2 and 64 filters. -- Subsequent layers follow 4 stages with {64, 128, 256, 512} input channels with a varying number of residual blocks at each stage that depends on the family member. At the end of every stage, the resolution is reduced by half using a convolution with stride 2. -- The final section consists of a global average pooling followed by a linear + softmax layer that outputs values for the specified number of classes. - -The below table from [He et al.](https://arxiv.org/abs/1512.03385) details some of the building blocks for ResNets of different sizes. - -![resnet.png](https://storage.googleapis.com/docs.mosaicml.com/images/models/resnet.png) - -## Family Members - -ResNet family members are identified by their number of layers. Parameter count, accuracy, and training time are provided below. 
- -| Model Family Members | Parameter Count | Our Accuracy | Training Time on 8xA100s | -|----------------------|-----------------|--------------|--------------------------| -| ResNet-18 | 11.5M | TBA | TBA | -| ResNet-34 | 21.8M | TBA | TBA | -| ResNet-50 | 25.6M | 76.5% | 3.83 hrs | -| ResNet-101 | 44.5M | 78.1% | 5.50 hrs | -| ResNet-152 | 60.2M | TBA | TBA | - - -> ❗ **Note**: Please see the [CIFAR ResNet model card](https://docs.mosaicml.com/projects/composer/en/stable/model_cards/cifar_resnet.html#architecture) for the differences between CIFAR and ImageNet ResNets. - -## Default Training Hyperparameters - -- Optimizer: Decoupled SGDW - - Learning rate: 2.048 - Momentum: 0.875 - Weight_decay: 5.0e-4 -- LR schedulers: - - Cosine decay with warmup for 8 epochs -- Batch size: 2048 -- Number of epochs: 90ep - -## Attribution - -Paper: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun - -Code and hyperparameters: [DeepLearningExamples Github repository](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5) by Nvidia - -## API Reference - -```{eval-rst} -.. autofunction:: composer.models.resnet.model.composer_resnet - :noindex: -``` diff --git a/composer/models/resnet/__init__.py b/composer/models/resnet/__init__.py deleted file mode 100644 index e00a37035b..0000000000 --- a/composer/models/resnet/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The ResNet model family is a set of convolutional neural networks described in `Deep Residual Learning for Image -Recognition `_ (He et al, 2015). ResNets can be used as the base for a variety of -vision tasks. ImageNet ResNets are a subset of the ResNet family which were designed specifically for classification on -the ImageNet dataset. - -See the :doc:`Model Card ` for more details. 
-""" -from composer.models.resnet.model import composer_resnet - -__all__ = ['composer_resnet'] - -_metadata = { - 'resnet18': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet18', - '_quality': 'TBD', - '_metric': 'Top-1 Accuracy', - '_ttt': 'TBD', - '_hparams': 'resnet18.yaml' - }, - 'resnet34': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet34', - '_quality': 'TBD', - '_metric': 'Top-1 Accuracy', - '_ttt': 'TBD', - '_hparams': 'resnet34.yaml' - }, - 'resnet50': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet50', - '_quality': '76.51', - '_metric': 'Top-1 Accuracy', - '_ttt': '3h 33m', - '_hparams': 'resnet50.yaml' - }, - 'resnet101': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet101', - '_quality': '78.10', - '_metric': 'Top-1 Accuracy', - '_ttt': '8h 15m', - '_hparams': 'resnet101.yaml', - }, - 'resnet152': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet152', - '_quality': 'TBD', - '_metric': 'Top-1 Accuracy', - '_ttt': 'TBD', - '_hparams': 'resnet152.yaml' - } -} diff --git a/composer/models/resnet/model.py b/composer/models/resnet/model.py deleted file mode 100644 index 5b023fabcf..0000000000 --- a/composer/models/resnet/model.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A :class:`.ComposerClassifier` wrapper around the torchvision implementations of the ResNet model family.""" - -import logging -import warnings -from typing import List, Optional - -from torchmetrics import MetricCollection -from torchmetrics.classification import MulticlassAccuracy -from torchvision.models import resnet - -from composer.loss import loss_registry -from composer.metrics import CrossEntropy -from composer.models.initializers import Initializer -from composer.models.tasks import ComposerClassifier - -__all__ = ['composer_resnet'] - -log = logging.getLogger(__name__) - -valid_model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] - - -def composer_resnet(model_name: str, - num_classes: int = 1000, - weights: Optional[str] = None, - groups: int = 1, - width_per_group: int = 64, - initializers: Optional[List[Initializer]] = None, - loss_name: str = 'soft_cross_entropy') -> ComposerClassifier: - """Helper function to create a :class:`.ComposerClassifier` with a torchvision ResNet model. - - From `Deep Residual Learning for Image Recognition `_ (He et al, 2015). - - Args: - model_name (str): Name of the ResNet model instance. Either [``"resnet18"``, ``"resnet34"``, ``"resnet50"``, ``"resnet101"``, - ``"resnet152"``]. - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``. - weights (str, optional): If provided, pretrained weights can be specified, such as with ``IMAGENET1K_V2``. Default: ``None``. - groups (int, optional): Number of filter groups for the 3x3 convolution layer in bottleneck blocks. Default: ``1``. - width_per_group (int, optional): Initial width for each convolution group. Width doubles after each stage. - Default: ``64``. - initializers (List[Initializer], optional): Initializers for the model. ``None`` for no initialization. - Default: ``None``. - loss_name (str, optional): Loss function to use. E.g. 'soft_cross_entropy' or - 'binary_cross_entropy_with_logits'. Loss function must be in - :mod:`~composer.loss.loss`. Default: ``'soft_cross_entropy'``". 
- Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a torchvision ResNet model. - - Example: - - .. testcode:: - - from composer.models import composer_resnet - - model = composer_resnet(model_name='resnet18') # creates a torchvision resnet18 for image classification - """ - warnings.warn(DeprecationWarning('composer_resnet is deprecated and will be removed in v0.18')) - - valid_model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] - if model_name not in valid_model_names: - raise ValueError(f'model_name must be one of {valid_model_names} instead of {model_name}.') - - if loss_name not in loss_registry.keys(): - raise ValueError(f'Unrecognized loss function: {loss_name}. Please ensure the ' - 'specified loss function is present in composer.loss.loss.py') - - if loss_name == 'binary_cross_entropy_with_logits' and (initializers is None or - Initializer.LINEAR_LOG_CONSTANT_BIAS not in initializers): - log.warning('UserWarning: Using `binary_cross_entropy_loss_with_logits` ' - 'without using `initializers.linear_log_constant_bias` can degrade ' - 'performance. ' - 'Please ensure you are using `initializers. ' - 'linear_log_constant_bias`.') - - if initializers is None: - initializers = [] - - # Instantiate model - model_fn = getattr(resnet, model_name) - model = model_fn(weights=weights, num_classes=num_classes, groups=groups, width_per_group=width_per_group) - - # Grab loss function from loss registry - loss_fn = loss_registry[loss_name] - - # Create metrics for train and validation - train_metrics = MulticlassAccuracy(num_classes=num_classes, average='micro') - val_metrics = MetricCollection([CrossEntropy(), MulticlassAccuracy(num_classes=num_classes, average='micro')]) - - # Apply Initializers to model - for initializer in initializers: - initializer = Initializer(initializer) - model.apply(initializer.get_initializer()) - - composer_model = ComposerClassifier(model, train_metrics=train_metrics, val_metrics=val_metrics, loss_fn=loss_fn) - return composer_model diff --git a/composer/models/resnet_cifar/README.md b/composer/models/resnet_cifar/README.md deleted file mode 100644 index 5a32ae03b8..0000000000 --- a/composer/models/resnet_cifar/README.md +++ /dev/null @@ -1,73 +0,0 @@ -# CIFAR ResNet -[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`Vision` / `Image Classification` - -The ResNet model family is a set of convolutional neural networks that can be used as the basis for a variety of vision tasks. CIFAR ResNet models are a subset of this family designed specifically for the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) and [CIFAR-100](https://www.cs.toronto.edu/~kriz/cifar.html) datasets. - -## Example - -```python -from composer.models import composer_resnet_cifar - -model = composer_resnet_cifar(model_name='resnet_56', num_classes=10) -``` - -## Architecture - -Residual Networks are feedforward convolutional networks with “residual” connections between non-consecutive layers. - -The model architecture is defined by the original paper: - -- The network inputs are of dimension 32×32x3. -- The first layer is 3×3 convolutions -- The subsequent layers are a stack of 6n layers with 3×3 convolutions on the feature maps of sizes {32,16,8}, with 2n layers for each feature map size. 
The number of filters are {16,32,64} for the respective feature map sizes. Subsampling is performed by convolutions with a stride of 2 -- The network ends with a global average pooling, a linear layer with the output dimension equal to the number of classes, and softmax function. - -There are a total 6n+2 stacked weighted layers. Each family member is specified by the number of layers, for example n=9 corresponds to ResNet56 - -The biggest differences between CIFAR ResNet models and ImageNet ResNet models are: - -- CIFAR ResNet models use fewer filters for each convolution. -- The ImageNet ResNets contain four stages, while the CIFAR ResNets contain three stages. In addition, CIFAR ResNets uniformly distribute blocks across each stage while ImageNet ResNets have a specific number of blocks for each stage. - -## Family Members - -| Model Family Members | Parameter Count | Our Accuracy | Training Time on 1x3080 | -|----------------------|-----------------|--------------|-------------------------| -| ResNet20 | 0.27M | TBA | TBA | -| ResNet32 | 0.46M | TBA | TBA | -| ResNet44 | 0.66M | TBA | TBA | -| ResNet56 | 0.85M | 93.1% | 35 min | -| ResNet110 | 1.7M | TBA | TBA | -## Default Training Hyperparameters - -```yaml -optimizer: - sgd: - learning_rate: 1.2 - momentum: 0.9 - weight_decay: 1e-4 -schedulers: - - multistep_with_warmup: - t_warmup: "5ep" - milestones: - - "80ep" - - "120ep" - gamma: 0.1 -train_batch_size: 1024 -max_duration: 160ep -``` - -## Attribution - -Paper: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. - -Note that this paper set the standard for ResNet style architectures for both CIFAR-10/100 and ImageNet - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.resnet_cifar.model.composer_resnet_cifar - :noindex: -``` diff --git a/composer/models/resnet_cifar/__init__.py b/composer/models/resnet_cifar/__init__.py deleted file mode 100644 index 2ea6ac226c..0000000000 --- a/composer/models/resnet_cifar/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A ResNet model family adapted for CIFAR10 image sizes. - -See the :doc:`Model Card ` for more details. 
-""" - -from composer.models.resnet_cifar.model import composer_resnet_cifar as composer_resnet_cifar - -__all__ = ['composer_resnet_cifar'] -_metadata = { - 'resnet9': { - '_task': 'Image Classification', - '_dataset': 'CIFAR10', - '_name': 'ResNet9', - '_quality': 'tbd', - '_metric': 'Top-1 Accuracy', - '_ttt': 'tbd', - '_hparams': 'resnet9_cifar10.yaml' - }, - 'resnet20': { - '_task': 'Image Classification', - '_dataset': 'CIFAR10', - '_name': 'ResNet20', - '_quality': 'tbd', - '_metric': 'Top-1 Accuracy', - '_ttt': 'tbd', - '_hparams': 'resnet20_cifar10.yaml' - }, - 'resnet56': { - '_task': 'Image Classification', - '_dataset': 'CIFAR10', - '_name': 'ResNet56', - '_quality': '93.1', - '_metric': 'Top-1 Accuracy', - '_ttt': '35m', - '_hparams': 'resnet56_cifar10.yaml' - } -} diff --git a/composer/models/resnet_cifar/model.py b/composer/models/resnet_cifar/model.py deleted file mode 100644 index 5bb8660b56..0000000000 --- a/composer/models/resnet_cifar/model.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""ResNet models for CIFAR extending :class:`.ComposerClassifier`.""" - -import warnings -from typing import List, Optional - -from composer.models.initializers import Initializer -from composer.models.resnet_cifar.resnets import ResNet9, ResNetCIFAR -from composer.models.tasks import ComposerClassifier - -__all__ = ['composer_resnet_cifar'] - - -def composer_resnet_cifar(model_name: str, - num_classes: int = 10, - initializers: Optional[List[Initializer]] = None) -> ComposerClassifier: - """Helper function to create a :class:`.ComposerClassifier` with a CIFAR ResNet models. - - From `Deep Residual Learning for Image Recognition `_ (He et al, 2015). - ResNet9 is based on the model from myrtle.ai `blog`_. - - Args: - model_name (str): ``"resnet_9"``, ``"resnet_20"``, or ``"resnet_56"``. - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``10``. - initializers (List[Initializer], optional): Initializers for the model. ``None`` for no initialization. - Default: ``None``. - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a CIFAR ResNet model. - - Example: - - .. testcode:: - - from composer.models import composer_resnet_cifar - - model = composer_resnet_cifar(model_name="resnet_56") # creates a resnet56 for cifar image classification - - .. _blog: https://myrtle.ai/learn/how-to-train-your-resnet-4-architecture/ - """ - warnings.warn(DeprecationWarning('composer_resnet_cifar is deprecated and will be removed in v0.18')) - if initializers is None: - initializers = [] - - if model_name == 'resnet_9': - model = ResNet9(num_classes) # current initializers don't work with this architecture. - else: - model = ResNetCIFAR.get_model_from_name(model_name, initializers, num_classes) - - composer_model = ComposerClassifier(module=model, num_classes=num_classes) - return composer_model diff --git a/composer/models/resnet_cifar/resnets.py b/composer/models/resnet_cifar/resnets.py deleted file mode 100644 index b4f1576b46..0000000000 --- a/composer/models/resnet_cifar/resnets.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The CIFAR ResNet torch module. - -See the :doc:`Model Card ` for more details. 
-""" - -# Code below adapted from https://github.com/facebookresearch/open_lth -# and https://github.com/pytorch/vision - -from typing import List, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torchvision.models.resnet import BasicBlock - -from composer.models import Initializer - -__all__ = ['ResNetCIFAR', 'ResNet9'] - - -class ResNetCIFAR(nn.Module): - """A residual neural network as originally designed for CIFAR-10.""" - - class Block(nn.Module): - """A ResNet block.""" - - def __init__(self, f_in: int, f_out: int, downsample: bool = False): - super(ResNetCIFAR.Block, self).__init__() - - stride = 2 if downsample else 1 - self.conv1 = nn.Conv2d(f_in, f_out, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(f_out) - self.conv2 = nn.Conv2d(f_out, f_out, kernel_size=3, stride=1, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(f_out) - self.relu = nn.ReLU(inplace=True) - - # No parameters for shortcut connections. - if downsample or f_in != f_out: - self.shortcut = nn.Sequential( - nn.Conv2d(f_in, f_out, kernel_size=1, stride=2, bias=False), - nn.BatchNorm2d(f_out), - ) - else: - self.shortcut = nn.Sequential() - - def forward(self, x: torch.Tensor): - out = self.relu(self.bn1(self.conv1(x))) - out = self.bn2(self.conv2(out)) - out += self.shortcut(x) - return self.relu(out) - - def __init__(self, plan: List[Tuple[int, int]], initializers: List[Initializer], outputs: int = 10): - super(ResNetCIFAR, self).__init__() - outputs = outputs or 10 - - self.num_classes = outputs - - # Initial convolution. - current_filters = plan[0][0] - self.conv = nn.Conv2d(3, current_filters, kernel_size=3, stride=1, padding=1, bias=False) - self.bn = nn.BatchNorm2d(current_filters) - self.relu = nn.ReLU(inplace=True) - - # The subsequent blocks of the ResNet. - blocks = [] - for segment_index, (filters, num_blocks) in enumerate(plan): - for block_index in range(num_blocks): - downsample = segment_index > 0 and block_index == 0 - blocks.append(ResNetCIFAR.Block(current_filters, filters, downsample)) - current_filters = filters - - self.blocks = nn.Sequential(*blocks) - - # Final fc layer. Size = number of filters in last segment. - self.fc = nn.Linear(plan[-1][0], outputs) - self.criterion = nn.CrossEntropyLoss() - - for initializer in initializers: - initializer = Initializer(initializer) - self.apply(initializer.get_initializer()) - - def forward(self, x: torch.Tensor): - out = self.relu(self.bn(self.conv(x))) - out = self.blocks(out) - out = F.avg_pool2d(out, out.size()[3]) - out = out.view(out.size(0), -1) - out = self.fc(out) - return out - - @staticmethod - def is_valid_model_name(model_name: str): - valid_model_names = [f'resnet_{layers}' for layers in (20, 56)] - return (model_name in valid_model_names) - - @staticmethod - def get_model_from_name(model_name: str, initializers: List[Initializer], outputs: int = 10): - """The naming scheme for a ResNet is ``'resnet_D[_W]'``. - - D is the model depth (e.g. 
``'resnet_56'``) - """ - - if not ResNetCIFAR.is_valid_model_name(model_name): - raise ValueError('Invalid model name: {}'.format(model_name)) - - depth = int(model_name.split('_')[-1]) # for resnet56, depth 56, width 16 - if len(model_name.split('_')) == 2: - width = 16 - else: - width = int(model_name.split('_')[3]) - - if (depth - 2) % 3 != 0: - raise ValueError('Invalid ResNetCIFAR depth: {}'.format(depth)) - num_blocks = (depth - 2) // 6 - - model_arch = { - 56: [(width, num_blocks), (2 * width, num_blocks), (4 * width, num_blocks)], - 20: [(width, num_blocks), (2 * width, num_blocks), (4 * width, num_blocks)], - } - - return ResNetCIFAR(model_arch[depth], initializers, outputs) - - -# adapted from https://raw.githubusercontent.com/matthias-wright/cifar10-resnet/master/model.py -# under the MIT license -class ResNet9(nn.Module): - """A 9-layer residual network, excluding BatchNorms and activation functions. - - Based on the myrtle.ai `blog`_ and Deep Residual Learning for Image Recognition (`He et al, 2015`_). - - Args: - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``10``. - - .. _blog: https://myrtle.ai/learn/how-to-train-your-resnet-4-architecture/ - .. _He et al, 2015: https://arxiv.org/abs/1512.03385 - """ - - def __init__(self, num_classes: int = 10): - super().__init__() - - self.body = nn.Sequential( - nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(num_features=64, momentum=0.9), - nn.ReLU(inplace=True), - nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(num_features=128, momentum=0.9), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, stride=2), - BasicBlock(inplanes=128, planes=128, stride=1), - nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(num_features=256, momentum=0.9), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, stride=2), - nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(num_features=256, momentum=0.9), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, stride=2), - BasicBlock(inplanes=256, planes=256, stride=1), - ) - - self.fc = nn.Linear(in_features=256, out_features=num_classes, bias=True) - - def forward(self, x): - out = self.body(x) - out = F.avg_pool2d(out, out.size()[3]) - out = out.view(out.size(0), -1) - out = self.fc(out) - return out diff --git a/composer/models/timm/__init__.py b/composer/models/timm/__init__.py deleted file mode 100644 index b7960b426a..0000000000 --- a/composer/models/timm/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A wrapper around `timm.create_model() `_ -used to create :class:`.ComposerClassifier`.""" - -from composer.models.timm.model import composer_timm as composer_timm - -__all__ = ['composer_timm'] diff --git a/composer/models/timm/model.py b/composer/models/timm/model.py deleted file mode 100644 index df0ffbca91..0000000000 --- a/composer/models/timm/model.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A wrapper around `timm.create_model() `_ -used to create :class:`.ComposerClassifier`.""" - -import warnings -from typing import Optional - -from composer.models.tasks import ComposerClassifier -from composer.utils.import_helpers import 
MissingConditionalImportError - -__all__ = ['composer_timm'] - - -def composer_timm(model_name: str, - pretrained: bool = False, - num_classes: int = 1000, - drop_rate: float = 0.0, - drop_path_rate: Optional[float] = None, - drop_block_rate: Optional[float] = None, - global_pool: Optional[str] = None, - bn_momentum: Optional[float] = None, - bn_eps: Optional[float] = None) -> ComposerClassifier: - """A wrapper around `timm.create_model() `_ used to create :class:`.ComposerClassifier`. - - Args: - model_name (str): timm model name e.g: ``"resnet50"``. List of models can be found at - `PyTorch Image Models `_. - pretrained (bool, optional): Imagenet pretrained. Default: ``False``. - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``. - drop_rate (float, optional): Dropout rate. Default: ``0.0``. - drop_path_rate (float, optional): Drop path rate (model default if ``None``). Default: ``None``. - drop_block_rate (float, optional): Drop block rate (model default if ``None``). Default: ``None``. - global_pool (str, optional): Global pool type, one of (``"fast"``, ``"avg"``, ``"max"``, ``"avgmax"``, ``"avgmaxc"``). Model default if ``None``. Default: ``None``. - bn_momentum (float, optional): BatchNorm momentum override (model default if ``None``). Default: ``None``. - bn_eps (float, optional): BatchNorm epsilon override (model default if ``None``). Default: ``None``. - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with the specified TIMM model. - - Resnet18 Example: - - .. testcode:: - - from composer.models import composer_timm - - model = composer_timm(model_name='resnet18') # creates a timm resnet18 - """ - warnings.warn(DeprecationWarning('composer_timm is deprecated and will be removed in v0.18')) - try: - import timm - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='timm', conda_package='timm>=0.5.4', - conda_channel=None) from e - model = timm.create_model( # type: ignore (third-party) - model_name=model_name, - pretrained=pretrained, - num_classes=num_classes, - drop_rate=drop_rate, - drop_path_rate=drop_path_rate, - drop_block_rate=drop_block_rate, - global_pool=global_pool, - bn_momentum=bn_momentum, - bn_eps=bn_eps) - - composer_model = ComposerClassifier(module=model) - return composer_model diff --git a/composer/models/unet/README.md b/composer/models/unet/README.md deleted file mode 100644 index 530832051b..0000000000 --- a/composer/models/unet/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# UNet -[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`Vision` / `Segmentation` - -Unet is an architecture used for image segmentation. - -## Example - - - -```python -from composer.models import UNet - -model = UNet() -``` - -## Architecture - -The figure below ([source](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet)) shows a 3D version of the UNet architecture. Quoting the [Nvidia Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet), "U-Net is composed of a contractive and an expanding path, that aims at building a bottleneck in its centremost part through a combination of convolution, instance norm and leaky relu operations. After this bottleneck, the image is reconstructed through a combination of convolutions and upsampling. 
Skip connections are added with the goal of helping the backward flow of gradients in order to improve training." - -![unet3d.png](https://storage.googleapis.com/docs.mosaicml.com/images/models/unet3d.png) - - -There are 3 main differences between our implementation and the original NVDA DALI implementation. - -The first two refer to removing the NVDA DALI pipeline and replacing all transforms with torch implementations. We are omitting the Zoom transform and use a kernel size of 3 for the Gaussian Blur transform. - -While NVDA DLE examples reports the training accuracy using an average of 5 folds, we are using only 1 fold in the interest of faster iteration time, so all of our results are reported using fold 0 and 200 epochs. - - -## Default Training Hyperparameters - -Below are the hyperparameters we used to train UNet on the [BraTS](http://braintumorsegmentation.org) image segmentation dataset. - -```yaml -optimizer: - radam: - lr: 0.001 - betas: [0.9, 0.999] - eps: 0.00000001 - weight_decay: 0.0001 -schedulers: - - constant: {} -train_batch_size: 64 -max_duration: 200ep -``` - - -## Attribution - -The UNet model has been introduced in "U-Net: Convolutional Networks for Biomedical Image Segmentation" by Olaf Ronneberger, Philipp Fischer, Thomas Brox in [https://arxiv.org/abs/1505.04597](https://arxiv.org/abs/1505.04597). - -We are using the NVDA DLE examples version in -[https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet). - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.unet.UNet - :noindex: -``` diff --git a/composer/models/unet/__init__.py b/composer/models/unet/__init__.py deleted file mode 100644 index 6f26bd4625..0000000000 --- a/composer/models/unet/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The Unet architecture used in image segmentation. The example we are using is for BRATS medical brain tumor dataset. - -See the :doc:`Model Card ` for more details. 
-""" - -from composer.models.unet.unet import UNet as UNet - -__all__ = ['UNet'] - -_task = 'Image Segmentation' -_dataset = 'BRATS' -_name = 'UNet' -_quality = '69.1' -_metric = 'Dice' -_ttt = '21m' -_hparams = 'unet.yaml' diff --git a/composer/models/unet/_layers.py b/composer/models/unet/_layers.py deleted file mode 100644 index 6fae767bf5..0000000000 --- a/composer/models/unet/_layers.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -## Code adapted from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Segmentation/nnUNet/ - -import numpy as np -import torch -import torch.nn as nn - -normalizations = { - 'instancenorm3d': nn.InstanceNorm3d, - 'instancenorm2d': nn.InstanceNorm2d, - 'batchnorm3d': nn.BatchNorm3d, - 'batchnorm2d': nn.BatchNorm2d, -} - -convolutions = { - 'Conv2d': nn.Conv2d, - 'Conv3d': nn.Conv3d, - 'ConvTranspose2d': nn.ConvTranspose2d, - 'ConvTranspose3d': nn.ConvTranspose3d, -} - - -def get_norm(name, out_channels): - if 'groupnorm' in name: - return nn.GroupNorm(32, out_channels, affine=True) - return normalizations[name](out_channels, affine=True) - - -def get_conv(in_channels, out_channels, kernel_size, stride, dim, bias=False): - conv = convolutions[f'Conv{dim}d'] - padding = get_padding(kernel_size, stride) - return conv(in_channels, out_channels, kernel_size, stride, padding, bias=bias) - - -def get_transp_conv(in_channels, out_channels, kernel_size, stride, dim): - conv = convolutions[f'ConvTranspose{dim}d'] - padding = get_padding(kernel_size, stride) - output_padding = get_output_padding(kernel_size, stride, padding) - return conv(in_channels, out_channels, kernel_size, stride, padding, output_padding, bias=True) - - -def get_padding(kernel_size, stride): - #kernel_size_np = np.cast(np.ndarray, np.atleast_1d(kernel_size)) - #stride_np = np.cast(np.ndarray, np.atleast_1d(stride)) - kernel_size_np = np.atleast_1d(kernel_size) - stride_np = np.atleast_1d(stride) - padding_np = (kernel_size_np - stride_np + 1) / 2 # type: ignore - padding = tuple(int(p) for p in padding_np) # type: ignore - return padding if len(padding) > 1 else padding[0] - - -def get_output_padding(kernel_size, stride, padding): - kernel_size_np = np.atleast_1d(kernel_size) - stride_np = np.atleast_1d(stride) - padding_np = np.atleast_1d(padding) - out_padding_np = 2 * padding_np + stride_np - kernel_size_np - out_padding = tuple(int(p) for p in out_padding_np) - return out_padding if len(out_padding) > 1 else out_padding[0] - - -class ConvLayer(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs): - super(ConvLayer, self).__init__() - self.conv = get_conv(in_channels, out_channels, kernel_size, stride, kwargs['dim']) - self.norm = get_norm(kwargs['norm'], out_channels) - self.lrelu = nn.LeakyReLU(negative_slope=kwargs['negative_slope'], inplace=True) - - def forward(self, data): - out = self.conv(data) - out = self.norm(out) - out = self.lrelu(out) - return out - - -class ConvBlock(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs): - super(ConvBlock, self).__init__() - self.conv1 = ConvLayer(in_channels, out_channels, kernel_size, stride, **kwargs) - self.conv2 = ConvLayer(out_channels, out_channels, kernel_size, 1, **kwargs) - - def forward(self, input_data): - out = self.conv1(input_data) - out = self.conv2(out) - return out - - -class ResidBlock(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, 
stride, **kwargs): - super(ResidBlock, self).__init__() - self.conv1 = ConvLayer(in_channels, out_channels, kernel_size, stride, **kwargs) - self.conv2 = get_conv(out_channels, out_channels, kernel_size, 1, kwargs['dim']) - self.norm = get_norm(kwargs['norm'], out_channels) - self.lrelu = nn.LeakyReLU(negative_slope=kwargs['negative_slope'], inplace=True) - self.downsample = None - if max(stride) > 1 or in_channels != out_channels: # type: ignore - self.downsample = get_conv(in_channels, out_channels, kernel_size, stride, kwargs['dim']) - self.norm_res = get_norm(kwargs['norm'], out_channels) - - def forward(self, input_data): - residual = input_data - out = self.conv1(input_data) - out = self.conv2(out) - out = self.norm(out) - if self.downsample is not None: - residual = self.downsample(residual) - residual = self.norm_res(residual) - out = self.lrelu(out + residual) - return out - - -class UpsampleBlock(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs): - super(UpsampleBlock, self).__init__() - self.transp_conv = get_transp_conv(in_channels, out_channels, stride, stride, kwargs['dim']) - self.conv_block = ConvBlock(2 * out_channels, out_channels, kernel_size, 1, **kwargs) - - def forward(self, input_data, skip_data): - out = self.transp_conv(input_data) - out = torch.cat((out, skip_data), dim=1) - out = self.conv_block(out) - return out - - -class OutputBlock(nn.Module): - - def __init__(self, in_channels, out_channels, dim): - super(OutputBlock, self).__init__() - self.conv = get_conv(in_channels, out_channels, kernel_size=1, stride=1, dim=dim, bias=True) - nn.init.constant_(self.conv.bias, 0) - - def forward(self, input_data): - return self.conv(input_data) diff --git a/composer/models/unet/model.py b/composer/models/unet/model.py deleted file mode 100644 index 08c49ff57c..0000000000 --- a/composer/models/unet/model.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The Unet architecture used in image segmentation. The example we are using is for BRATS medical brain tumor dataset. - -See the :doc:`Model Card ` for more details. -""" - -import warnings - -import torch.nn as nn - -from composer.models.unet._layers import ConvBlock, OutputBlock, ResidBlock, UpsampleBlock - -__all__ = ['UNet'] - - -class UNet(nn.Module): - """Unet Architecture adapted from NVidia `Deep Learning Examples`_. - - .. _Deep Learning Examples: https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Segmentation/nnUNet/ - - Args: - in_channels (int): Number of input channels. - n_class (int): Number of output layers. - kernels (list): Conv layer kernel sizes. - strides (list): Conv layer strides. - normalization_layer (str): Normalization layer type, one of (``"batch"``, ``"instance"``). - negative_slope (float): Leaky relu negative slope. - residual (bool): Use residual connections. - dimension (int): Filter dimensions. 
- """ - - def __init__( - self, - in_channels, - n_class, - kernels, - strides, - normalization_layer, - negative_slope, - residual, - dimension, - ): - warnings.warn(DeprecationWarning('UNet is deprecated and will be removed in v0.18')) - super(UNet, self).__init__() - self.dim = dimension - self.n_class = n_class - self.residual = residual - self.negative_slope = negative_slope - self.norm = normalization_layer + f'norm{dimension}d' - self.filters = [min(2**(5 + i), 320 if dimension == 3 else 512) for i in range(len(strides))] - - down_block = ResidBlock if self.residual else ConvBlock - self.input_block = self.get_conv_block( - conv_block=down_block, - in_channels=in_channels, - out_channels=self.filters[0], - kernel_size=kernels[0], - stride=strides[0], - ) - self.downsamples = self.get_module_list( - conv_block=down_block, - in_channels=self.filters[:-1], - out_channels=self.filters[1:], - kernels=kernels[1:-1], - strides=strides[1:-1], - ) - self.bottleneck = self.get_conv_block( - conv_block=down_block, - in_channels=self.filters[-2], - out_channels=self.filters[-1], - kernel_size=kernels[-1], - stride=strides[-1], - ) - self.upsamples = self.get_module_list( - conv_block=UpsampleBlock, - in_channels=self.filters[1:][::-1], - out_channels=self.filters[:-1][::-1], - kernels=kernels[1:][::-1], - strides=strides[1:][::-1], - ) - self.output_block = self.get_output_block(decoder_level=0) - self.apply(self.initialize_weights) - self.n_layers = len(self.upsamples) - 1 - - def forward(self, input_data): - out = self.input_block(input_data) - encoder_outputs = [out] - for downsample in self.downsamples: - out = downsample(out) - encoder_outputs.append(out) - out = self.bottleneck(out) - for idx, upsample in enumerate(self.upsamples): - out = upsample(out, encoder_outputs[self.n_layers - idx]) - out = self.output_block(out) - return out - - def get_conv_block(self, conv_block, in_channels, out_channels, kernel_size, stride): - return conv_block( - dim=self.dim, - stride=stride, - norm=self.norm, - kernel_size=kernel_size, - in_channels=in_channels, - out_channels=out_channels, - negative_slope=self.negative_slope, - ) - - def get_output_block(self, decoder_level): - return OutputBlock(in_channels=self.filters[decoder_level], out_channels=self.n_class, dim=self.dim) - - def get_module_list(self, in_channels, out_channels, kernels, strides, conv_block): - layers = [] - for in_channel, out_channel, kernel, stride in zip(in_channels, out_channels, kernels, strides): - conv_layer = self.get_conv_block(conv_block, in_channel, out_channel, kernel, stride) - layers.append(conv_layer) - return nn.ModuleList(layers) - - def initialize_weights(self, module): - name = module.__class__.__name__.lower() - if name in ['conv2d']: - nn.init.kaiming_normal_(module.weight, a=self.negative_slope) diff --git a/composer/models/unet/unet.py b/composer/models/unet/unet.py deleted file mode 100644 index dde555bb4f..0000000000 --- a/composer/models/unet/unet.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A U-Net model extending :class:`.ComposerModel`.""" - -import logging -import warnings -from typing import Any, Dict, Optional, Sequence, Union - -import torch -import torch.nn as nn -from torchmetrics import Metric - -from composer.metrics.metrics import Dice -from composer.models.base import ComposerModel -from composer.models.unet.model import UNet as UNetModel -from composer.utils.import_helpers import MissingConditionalImportError - 
-log = logging.getLogger(__name__) - -__all__ = ['UNet'] - - -class UNet(ComposerModel): - """A U-Net model extending :class:`.ComposerModel`. - - See U-Net: Convolutional Networks for Biomedical Image Segmentation (`Ronneberger et al, 2015`_) - on the U-Net architecture. - - Args: - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``3``. - - .. _Ronneberger et al, 2015: https://arxiv.org/abs/1505.04597 - """ - - def __init__(self, num_classes: int = 3) -> None: - warnings.warn(DeprecationWarning('UNet is deprecated and will be removed in v0.18')) - - super().__init__() - try: - from monai.losses import DiceLoss - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='unet', - conda_package='monai', - conda_channel='conda-forge') from e - - self.module = self.build_nnunet() - - self.dice = Dice(num_classes=num_classes) - self.dloss = DiceLoss(include_background=False, softmax=True, to_onehot_y=True, batch=True) - self.closs = nn.CrossEntropyLoss() - - def loss(self, outputs: Any, batch: Any, *args, **kwargs) -> Union[torch.Tensor, Sequence[torch.Tensor]]: - _, y = batch - y = y.squeeze(1) # type: ignore - loss = self.dloss(outputs, y) - loss += self.closs(outputs, y[:, 0].long()) - return loss - - @staticmethod - def metric_mean(name, outputs): - return torch.stack([out[name] for out in outputs]).mean(dim=0) - - def get_metrics(self, is_train: bool = False) -> Dict[str, Metric]: - return {'Dice': self.dice} - - def forward(self, batch: Any) -> torch.Tensor: - x, _ = batch - x = x.squeeze(1) # type: ignore - logits = self.module(x) - return logits - - def inference2d(self, image): - """Runs inference on a 3D image, by passing each depth slice through the model.""" - batch_modulo = image.shape[2] % 64 - if batch_modulo != 0: - batch_pad = 64 - batch_modulo - image = nn.ConstantPad3d((0, 0, 0, 0, batch_pad, 0), 0)(image) - - image = torch.transpose(image.squeeze(0), 0, 1) - preds_shape = (image.shape[0], 4, *image.shape[2:]) - preds = torch.zeros(preds_shape, dtype=image.dtype, device=image.device) - for start in range(0, image.shape[0] - 64 + 1, 64): - end = start + 64 - with torch.no_grad(): - pred = self.module(image[start:end]) - preds[start:end] = pred.data - if batch_modulo != 0: - preds = preds[batch_pad:] # type: ignore - return torch.transpose(preds, 0, 1).unsqueeze(0) - - def eval_forward(self, batch: Any, outputs: Optional[Any] = None): - assert self.training is False, 'For validation, model must be in eval mode' - image, _ = batch - pred = self.inference2d(image) - return pred - - def build_nnunet(self) -> torch.nn.Module: - kernels = [[3, 3]] * 6 - strides = [[1, 1]] + [[2, 2]] * 5 - model = UNetModel(in_channels=4, - n_class=4, - kernels=kernels, - strides=strides, - dimension=2, - residual=True, - normalization_layer='batch', - negative_slope=0.01) - - return model diff --git a/composer/models/vit_small_patch16/__init__.py b/composer/models/vit_small_patch16/__init__.py deleted file mode 100644 index 9992807ade..0000000000 --- a/composer/models/vit_small_patch16/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""ViT Small Patch 16 for image classification.""" - -from composer.models.vit_small_patch16.model import vit_small_patch16 as vit_small_patch16 - -__all__ = ['vit_small_patch16'] - -_task = 'Image Classification' -_dataset = 'ImageNet' -_name = 'ViT-Small-Patch16' -_quality = '74.52' -_metric = 'Top-1 Accuracy' -_ttt = '1d 
59m' -_hparams = 'vit_small_patch16.yaml' diff --git a/composer/models/vit_small_patch16/model.py b/composer/models/vit_small_patch16/model.py deleted file mode 100644 index dacb9db56a..0000000000 --- a/composer/models/vit_small_patch16/model.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""Implements ViT-S/16 as a :class:`.ComposerClassifier`.""" - -import warnings - -from composer.models.tasks import ComposerClassifier - -__all__ = ['vit_small_patch16'] - - -def vit_small_patch16(num_classes: int = 1000, - image_size: int = 224, - channels: int = 3, - dropout: float = 0.0, - embedding_dropout: float = 0.0): - """Helper function to create a :class:`.ComposerClassifier` using a ViT-S/16 model. - - See `Training data-efficient image transformers & distillation through attention `_ - (Touvron et al, 2021) for details on ViT-S/16. - - Args: - num_classes (int, optional): number of classes for the model. Default: ``1000``. - image_size (int, optional): input image size. If you have rectangular images, make sure your image - size is the maximum of the width and height. Default: ``224``. - channels (int, optional): number of image channels. Default: ``3``. - dropout (float, optional): 0.0 - 1.0 dropout rate. Default: ``0``. - embedding_dropout (float, optional): 0.0 - 1.0 embedding dropout rate. Default: ``0``. - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a ViT-S/16 model. - """ - warnings.warn(DeprecationWarning('vit_small_patch16 is deprecated and will be removed in v0.18')) - - from vit_pytorch import ViT - model = ViT( - image_size=image_size, - channels=channels, - num_classes=num_classes, - dim=384, # embed dim/width - patch_size=16, - depth=12, # layers - heads=6, - mlp_dim=1536, - dropout=dropout, - emb_dropout=embedding_dropout) - - composer_model = ComposerClassifier(module=model) - return composer_model diff --git a/composer/utils/collect_env.py b/composer/utils/collect_env.py index 2926c54a6f..02e74af8f9 100644 --- a/composer/utils/collect_env.py +++ b/composer/utils/collect_env.py @@ -378,7 +378,6 @@ def print_env(file: Optional[TextIO] = None) -> None: [pip3] torch-optimizer==0.1.0 [pip3] torchmetrics==0.7.3 [pip3] torchvision==0.10.1+cu111 - [pip3] vit-pytorch==0.27.0 [conda] Could not collect diff --git a/docs/source/composer_model.rst b/docs/source/composer_model.rst index 3f4c32dab8..bd80be1d10 100644 --- a/docs/source/composer_model.rst +++ b/docs/source/composer_model.rst @@ -75,8 +75,6 @@ We also provide several common classes for various tasks, specifically: - :class:`.ComposerClassifier` - classification tasks with a cross entropy loss and accuracy metric. -- :func:`.composer_timm` - creates classification models from the popular `TIMM`_ - library. - :class:`.HuggingFaceModel` - :class:`.ComposerModel` wrapper for a 🤗 `Transformers`_ model. .. note:: @@ -195,18 +193,6 @@ Integrations ------------ - -TIMM -~~~~ - -Integrate with your favorite `TIMM`_ models with our :func:`.composer_timm` function. - -.. code:: python - - from composer.models import composer_timm - - timm_model = composer_timm(model_name='resnet50', pretrained=True) - BERT Example with 🤗 Transformers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -265,5 +251,4 @@ and make it compatible with our trainer. .. |loss| replace:: :meth:`~.ComposerModel.loss` .. _MMDetection: https://mmdetection.readthedocs.io/en/latest/ .. _Transformers: https://huggingface.co/docs/transformers/index -.. _TIMM: https://timm.fast.ai/ .. 
_torchvision: https://pytorch.org/vision/stable/models.html diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 2b640283b3..89d068efe2 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -48,7 +48,6 @@ from composer.core import Timestamp as Timestamp from composer.core import TimeUnit as TimeUnit from composer.core import types as types -from composer.datasets.synthetic import SyntheticBatchPairDataset from composer.devices import DeviceCPU from composer.loggers import InMemoryLogger as InMemoryLogger from composer.loggers import Logger as Logger @@ -87,7 +86,7 @@ sys.path.insert(0, _repo_root) from tests.common import SimpleModel -from tests.common.datasets import RandomTextClassificationDataset +from tests.common.datasets import RandomClassificationDataset, RandomTextClassificationDataset # Disable mosaicml logger os.environ['MOSAICML_PLATFORM'] = 'False' @@ -112,11 +111,10 @@ scheduler = CosineAnnealingLR(optimizer, T_max=1) -dataset = SyntheticBatchPairDataset( - total_dataset_size=100, - data_shape=data_shape, +dataset = RandomClassificationDataset( + shape=data_shape, + size=100, num_classes=num_classes, - num_unique_samples_to_create=10, ) train_dataset = dataset diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index d55745608f..b2cebc5281 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -20,8 +20,6 @@ the following installation targets are available: and building documentation. * ``pip install 'mosaicml[deepspeed]'``: Installs Composer with support for :mod:`deepspeed`. * ``pip install 'mosaicml[nlp]'``: Installs Composer with support for NLP models and algorithms. -* ``pip install 'mosaicml[unet]'``: Installs Composer with support for :doc:`Unet `. -* ``pip install 'mosaicml[timm]'``: Installs Composer with support for :mod:`timm`. * ``pip install 'mosaicml[wandb]'``: Installs Composer with support for :mod:`wandb`. * ``pip install 'mosaicml[comet_ml]'``: Installs Composer with support for :mod:`comet_ml`. * ``pip install 'mosaicml[tensorboard]'``: Installs Composer with support for :mod:`tensorboard`. @@ -29,7 +27,6 @@ the following installation targets are available: * ``pip install 'mosaicml[mlflow]'``: Installs Composer with support for :mod:`mlflow`. * ``pip install 'mosaicml[oci]'``: Installs Composer with support for :mod:`oci`. * ``pip install 'mosaicml[onnx]'``: Installs Composer with support for :mod:`onnx`. -* ``pip install 'mosaicml[vit]'``: Installs Composer with support for :mod:`vit`. * ``pip install 'mosaicml[coco]'``: Installs Composer with support for :mod:`coco`. * ``pip install 'mosaicml[libcloud]'``: Installs Composer with support for :mod:`libcloud`. * ``pip install 'mosaicml[all]'``: Installs all optional dependencies. 
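
With the `timm` and `vit` install targets gone, a timm backbone has to be installed and wrapped by hand rather than created through a Composer helper. Below is a minimal sketch, assuming `timm` is installed separately and that the default classification loss and accuracy metrics of `ComposerClassifier` are acceptable; the `'resnet50'` model name and class count are illustrative placeholders.

```python
import timm

from composer.models.tasks import ComposerClassifier

# Build a timm backbone and hand it to ComposerClassifier, which supplies the
# default classification loss and accuracy metrics that the deprecated
# composer_timm() helper previously wired up automatically.
backbone = timm.create_model('resnet50', pretrained=False, num_classes=1000)
composer_model = ComposerClassifier(module=backbone, num_classes=1000)
```

The wrapped model can then be passed to `Trainer` like any other `ComposerModel`.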
diff --git a/docs/source/getting_started/quick_start.rst b/docs/source/getting_started/quick_start.rst index c3c7d6f7ed..f7613384ba 100644 --- a/docs/source/getting_started/quick_start.rst +++ b/docs/source/getting_started/quick_start.rst @@ -61,7 +61,7 @@ Besides easily running our built-in algorithms, Composer also features: * An interface to flexibly add algorithms to the training loop * An engine that manages the ordering of algorithms for composition * A trainer to handle boilerplate around numerics, distributed training, and others -* Integration with popular model libraries such as TIMM and HuggingFace Transformers +* Integration with popular model libraries such as HuggingFace Transformers Next steps ---------- diff --git a/docs/source/index.rst b/docs/source/index.rst index ce95ba6e1b..425dcad93c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -47,7 +47,6 @@ Composer is part of the broader Machine Learning community, and we welcome any c examples/getting_started.ipynb examples/functional_api.ipynb - examples/medical_image_segmentation.ipynb examples/custom_speedup_methods.ipynb examples/finetune_huggingface.ipynb examples/pretrain_finetune_huggingface.ipynb @@ -136,19 +135,6 @@ Composer is part of the broader Machine Learning community, and we welcome any c method_cards/swa.md method_cards/weight_standardization.md -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Model Library - - model_cards/BERT.md - model_cards/cifar_resnet.md - model_cards/deeplabv3.md - model_cards/efficientnet.md - model_cards/GPT2.md - model_cards/resnet.md - model_cards/unet.md - .. toctree:: :hidden: :caption: API Reference diff --git a/docs/source/method_cards/decoupled_weight_decay.md b/docs/source/method_cards/decoupled_weight_decay.md index 2d9f78f94f..71e0f4312f 100644 --- a/docs/source/method_cards/decoupled_weight_decay.md +++ b/docs/source/method_cards/decoupled_weight_decay.md @@ -16,9 +16,7 @@ L2 regularization is typically considered equivalent to weight decay, but this e - -```bash -# Single GPU/CPU depending on torch.cuda.is_available() -python train_resnet_imagenet1k.py /path/to/imagenet - -# Log experiments to Weights and Biases -python train_resnet_imagenet1k.py /path/to/imagenet --wandb_logger --wandb_entity my_username --wandb_project my_project --wandb_run_name my_run_name - -# Single/Multi GPU training (infers the number of GPUs available) -composer train_resnet_imagenet1k.py /path/to/imagenet - -# Manually specify number of GPUs to use: -composer -n $N_GPUS train_resnet_imagenet1k.py /path/to/imagenet - -# Mild ResNet recipe for fastest training to ~76.5% accuracy: -composer train_resnet_imagenet1k.py /path/to/imagenet --recipe_name mild --train_crop_size 176 --eval_crop_size 224 --max_duration 36ep --loss_name binary_cross_entropy - -# Medium ResNet recipe highest accuracy with similar training time as baseline: -composer train_resnet_imagenet1k.py /path/to/imagenet --recipe_name medium --train_crop_size 176 --eval_crop_size 224 --max_duration 135ep --loss_name binary_cross_entropy - -# Spicy ResNet recipe for our most accurate ResNet over a long training schedule: -composer train_resnet_imagenet1k.py /path/to/imagenet --recipe_name spicy --train_crop_size 176 --eval_crop_size 224 --max_duration 270ep --loss_name binary_cross_entropy -``` diff --git a/examples/imagenet/train_resnet_imagenet1k.py b/examples/imagenet/train_resnet_imagenet1k.py deleted file mode 100644 index d6f1dee008..0000000000 --- a/examples/imagenet/train_resnet_imagenet1k.py +++ /dev/null @@ 
-1,298 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""Example script to train a ResNet model on ImageNet.""" - -import argparse -import logging -import os - -import torch -from torch.utils.data import DataLoader -from torchmetrics import MetricCollection -from torchmetrics.classification import MulticlassAccuracy -from torchvision import transforms -from torchvision.datasets import ImageFolder -from torchvision.models import resnet - -from composer import DataSpec, Time, Trainer -from composer.algorithms import (EMA, SAM, BlurPool, ChannelsLast, ColOut, LabelSmoothing, MixUp, ProgressiveResizing, - RandAugment, StochasticDepth) -from composer.callbacks import CheckpointSaver, LRMonitor, SpeedMonitor -from composer.datasets.utils import NormalizationFn, pil_image_collate -from composer.loggers import WandBLogger -from composer.loss import binary_cross_entropy_with_logits, soft_cross_entropy -from composer.metrics import CrossEntropy -from composer.models.tasks import ComposerClassifier -from composer.optim import CosineAnnealingWithWarmupScheduler, DecoupledSGDW -from composer.utils import dist - -logging.basicConfig() -logging.getLogger().setLevel(logging.INFO) - -parser = argparse.ArgumentParser() - -# Dataloader arguments -parser.add_argument('data_dir', help='Path to the directory containing the ImageNet-1k dataset', type=str) -parser.add_argument('--train_crop_size', help='Training image crop size', type=int, default=224) -parser.add_argument('--eval_resize_size', help='Evaluation image resize size', type=int, default=256) -parser.add_argument('--eval_crop_size', help='Evaluation image crop size', type=int, default=224) -parser.add_argument('--train_batch_size', help='Train dataloader per-device batch size', type=int, default=2048) -parser.add_argument('--eval_batch_size', help='Validation dataloader per-device batch size', type=int, default=2048) - -# Model arguments -parser.add_argument('--model_name', - help='Name of the resnet model to train', - default='resnet50', - choices=['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']) -parser.add_argument('--loss_name', - help='Name of the loss function to use for training', - default='cross_entropy', - choices=['cross_entropy', 'binary_cross_entropy']) - -# Optimizer arguments -parser.add_argument('--learning_rate', help='Optimizer learning rate', type=float, default=2.048) -parser.add_argument('--momentum', help='Optimizer momentum', type=float, default=0.875) -parser.add_argument('--weight_decay', help='Optimizer weight decay', type=float, default=5.0e-4) - -# LR scheduler arguments -parser.add_argument('--t_warmup', - help='Duration of learning rate warmup specified as a Time string', - type=Time.from_timestring, - default='8ep') -parser.add_argument('--t_max', - help='Duration to cosine decay the learning rate specified as a Time string', - type=Time.from_timestring, - default='1dur') - -# Save checkpoint arguments -parser.add_argument('--save_checkpoint_dir', - help='Directory in which to save model checkpoints', - type=str, - default='checkpoints/{run_name}') -parser.add_argument('--checkpoint_interval', help='Frequency to save checkpoints', type=str, default='1ep') - -# Load checkpoint arguments, assumes resuming the previous training run instead of fine-tuning -parser.add_argument('--load_checkpoint_path', help='Path to the checkpoint to load', type=str) - -# Recipes -parser.add_argument('--recipe_name', - help='Either "mild", "medium" or "spicy" in order of increasing 
training time and accuracy', - type=str, - choices=['mild', 'medium', 'spicy']) - -# Logger parameters: progress bar logging is used by default -# Only has Weights and Biases option to reduce the number of arguments. Other loggers can be substituted in the script -parser.add_argument('--wandb_logger', help='Whether or not to log results to Weights and Biases', action='store_true') -parser.add_argument('--wandb_entity', help='WandB entity name', type=str) -parser.add_argument('--wandb_project', help='WandB project name', type=str) -parser.add_argument('--wandb_run_name', help='WandB run name', type=str) - -# Trainer arguments -parser.add_argument('--run_name', help='Name of the training run used for checkpointing and other logging', type=str) -parser.add_argument('--seed', help='Random seed', type=int, default=17) -parser.add_argument('--max_duration', - help='Duration to train specified as a Time string', - type=Time.from_timestring, - default='90ep') -parser.add_argument('--eval_interval', - help='How frequently to run evaluation on the validation set specified as a Time string', - type=Time.from_timestring, - default='1ep') - -args = parser.parse_args() - - -def _main(): - - # Divide batch sizes by number of devices if running multi-gpu training - if dist.get_world_size(): - args.train_batch_size = args.train_batch_size // dist.get_world_size() - args.eval_batch_size = args.eval_batch_size // dist.get_world_size() - - # Scale by 255 since the collate `pil_image_collate` results in images in range 0-255 - # If using ToTensor() and the default collate, remove the scaling by 255 - IMAGENET_CHANNEL_MEAN = (0.485 * 255, 0.456 * 255, 0.406 * 255) - IMAGENET_CHANNEL_STD = (0.229 * 255, 0.224 * 255, 0.225 * 255) - - # Train dataset - logging.info('Building train dataloader') - train_transforms = transforms.Compose([ - transforms.RandomResizedCrop(args.train_crop_size, scale=(0.08, 1.0), ratio=(0.75, 4.0 / 3.0)), - transforms.RandomHorizontalFlip(), - ]) - train_dataset = ImageFolder(os.path.join(args.data_dir, 'train'), train_transforms) - # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware setup - train_sampler = dist.get_sampler(train_dataset, drop_last=True, shuffle=True) - train_dataloader = DataLoader( - train_dataset, - batch_size=args.train_batch_size, - num_workers=8, - pin_memory=True, - drop_last=True, - sampler=train_sampler, - collate_fn=pil_image_collate, - persistent_workers=True, # Reduce overhead of creating new workers at the expense of using slightly more RAM - ) - # DataSpec allows for on-gpu transformations, marginally relieving dataloader bottleneck - train_dataspec = DataSpec(dataloader=train_dataloader, - device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, std=IMAGENET_CHANNEL_STD)) - logging.info('Built train dataloader\n') - - # Validation dataset - logging.info('Building evaluation dataloader') - eval_transforms = transforms.Compose([ - transforms.Resize(args.eval_resize_size), - transforms.CenterCrop(args.eval_crop_size), - ]) - eval_dataset = ImageFolder(os.path.join(args.data_dir, 'val'), eval_transforms) - # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware setup, - eval_sampler = dist.get_sampler(eval_dataset, drop_last=False, shuffle=False) - eval_dataloader = DataLoader( - eval_dataset, - batch_size=args.eval_batch_size, - num_workers=8, - pin_memory=True, - drop_last=False, - sampler=eval_sampler, - collate_fn=pil_image_collate, - persistent_workers=True, # Reduce overhead of creating new 
workers at the expense of using slightly more RAM - ) - eval_dataspec = DataSpec(dataloader=eval_dataloader, - device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, std=IMAGENET_CHANNEL_STD)) - logging.info('Built evaluation dataloader\n') - - # Instantiate torchvision ResNet model - logging.info('Building Composer model') - model_fn = getattr(resnet, args.model_name) - model = model_fn(num_classes=1000, groups=1, width_per_group=64) - - # Specify model initialization - def weight_init(w: torch.nn.Module): - if isinstance(w, torch.nn.Linear) or isinstance(w, torch.nn.Conv2d): - torch.nn.init.kaiming_normal_(w.weight) - if isinstance(w, torch.nn.BatchNorm2d): - w.weight.data = torch.rand(w.weight.data.shape) - w.bias.data = torch.zeros_like(w.bias.data) - # When using binary cross entropy, set the classification layer bias to -log(num_classes) - # to ensure the initial probabilities are approximately 1 / num_classes - if args.loss_name == 'binary_cross_entropy' and isinstance(w, torch.nn.Linear): - w.bias.data = torch.ones(w.bias.shape) * -torch.log(torch.tensor(w.bias.shape[0])) - - model.apply(weight_init) - - # Performance metrics to log other than training loss - train_metrics = MulticlassAccuracy(num_classes=1000, average='micro') - val_metrics = MetricCollection([CrossEntropy(), MulticlassAccuracy(num_classes=1000, average='micro')]) - - # Cross entropy loss that can handle both index and one-hot targets - - if args.loss_name == 'binary_cross_entropy': - loss_fn = binary_cross_entropy_with_logits - else: - loss_fn = soft_cross_entropy - - # Wrapper function to convert a classification PyTorch model into a Composer model - composer_model = ComposerClassifier(model, train_metrics=train_metrics, val_metrics=val_metrics, loss_fn=loss_fn) - logging.info('Built Composer model\n') - - # Optimizer - logging.info('Building optimizer and learning rate scheduler') - optimizer = DecoupledSGDW(composer_model.parameters(), - lr=args.learning_rate, - momentum=args.momentum, - weight_decay=args.weight_decay) - - # Learning rate scheduler: LR warmup for 8 epochs, then cosine decay for the rest of training - lr_scheduler = CosineAnnealingWithWarmupScheduler(t_warmup=args.t_warmup, t_max=args.t_max) - logging.info('Built optimizer and learning rate scheduler\n') - - # Callbacks for logging - logging.info('Building SpeedMonitor, LRMonitor, and CheckpointSaver callbacks') - speed_monitor = SpeedMonitor(window_size=50) # Measures throughput as samples/sec and tracks total training time - lr_monitor = LRMonitor() # Logs the learning rate - - # Callback for checkpointing - checkpoint_saver = CheckpointSaver(folder=args.save_checkpoint_dir, save_interval=args.checkpoint_interval) - logging.info('Built SpeedMonitor, LRMonitor, and CheckpointSaver callbacks\n') - - # Recipes for training ResNet architectures on ImageNet in order of increasing training time and accuracy - # To learn about individual methods, check out "Methods Overview" in our documentation: https://docs.mosaicml.com/ - logging.info('Building algorithm recipes') - if args.recipe_name == 'mild': - algorithms = [ - BlurPool(), - ChannelsLast(), - EMA(half_life='100ba', update_interval='20ba'), - ProgressiveResizing(initial_scale=0.5, delay_fraction=0.4, finetune_fraction=0.2), - LabelSmoothing(smoothing=0.08), - ] - elif args.recipe_name == 'medium': - algorithms = [ - BlurPool(), - ChannelsLast(), - EMA(half_life='100ba', update_interval='20ba'), - ProgressiveResizing(initial_scale=0.5, delay_fraction=0.4, finetune_fraction=0.2), - 
LabelSmoothing(smoothing=0.1), - MixUp(alpha=0.2), - SAM(rho=0.5, interval=10), - ] - elif args.recipe_name == 'spicy': - algorithms = [ - BlurPool(), - ChannelsLast(), - EMA(half_life='100ba', update_interval='20ba'), - ProgressiveResizing(initial_scale=0.6, delay_fraction=0.2, finetune_fraction=0.2), - LabelSmoothing(smoothing=0.13), - MixUp(alpha=0.25), - SAM(rho=0.5, interval=5), - ColOut(p_col=0.05, p_row=0.05), - RandAugment(depth=1, severity=9), - StochasticDepth(target_layer_name='ResNetBottleneck', - stochastic_method='sample', - drop_distribution='linear', - drop_rate=0.1) - ] - else: - algorithms = None - logging.info('Built algorithm recipes\n') - - logger = None - if args.wandb_logger: - if args.wandb_entity is None: - raise ValueError('Please specify --wandb_entity argument') - if args.wandb_project is None: - raise ValueError('Please specify --wandb_project argument') - if args.wandb_run_name is None: - raise ValueError('Please specify --wandb_run_name argument') - logger = WandBLogger(entity=args.wandb_entity, project=args.wandb_project, name=args.wandb_run_name) - - # Create the Trainer! - logging.info('Building Trainer') - device = 'gpu' if torch.cuda.is_available() else 'cpu' - precision = 'amp' if device == 'gpu' else 'fp32' # Mixed precision for fast training when using a GPU - trainer = Trainer(run_name=args.run_name, - model=composer_model, - train_dataloader=train_dataspec, - eval_dataloader=eval_dataspec, - eval_interval=args.eval_interval, - optimizers=optimizer, - schedulers=lr_scheduler, - algorithms=algorithms, - loggers=logger, - max_duration=args.max_duration, - callbacks=[speed_monitor, lr_monitor, checkpoint_saver], - load_path=args.load_checkpoint_path, - device=device, - precision=precision, - device_train_microbatch_size='auto', - seed=args.seed) - logging.info('Built Trainer\n') - - # Start training! - logging.info('Train!') - trainer.fit() - - -if __name__ == '__main__': - _main() diff --git a/examples/medical_image_segmentation.ipynb b/examples/medical_image_segmentation.ipynb deleted file mode 100644 index d13f88fbea..0000000000 --- a/examples/medical_image_segmentation.ipynb +++ /dev/null @@ -1,725 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 🩺 Image Segmentation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this notebook you will use Composer and PyTorch to segment pneumothorax (air around or outside of the lungs) from chest radiographic images. This dataset was originally released for a [kaggle competition][kaggle] by the [Society for Informatics in Medicine][siim] (SIIM).\n", - "\n", - "**Disclaimer: This example represents a minimal working baseline. In order to get competitive results this notebook must run for a long time.**\n", - "\n", - "### Recommended Background\n", - "\n", - "This tutorial goes through the process of starting a project from scratch with Composer. It assumes you are fairly familiar with how such a process might look if working with PyTorch. 
In addition, it assumes some familiarity with computer vision models and methods.\n", - "\n", - "To better understand the Composer part, make sure you're comfortable with the material in our [Getting Started][getting_started] tutorial.\n", - "\n", - "### Tutorial Goals and Concepts Covered\n", - "\n", - "The goal of this tutorial is to provide an executable example of a computer vision project in Composer from the ground up.\n", - "\n", - "We will cover:\n", - "\n", - "- installing relevant packages\n", - "- downloading the SIIM dataset from kaggle\n", - "- cleaning and resampling the dataset\n", - "- splitting data for validation\n", - "- visualizing model inputs\n", - "- training a baseline model with Composer\n", - "- using Composer methods\n", - "- next steps\n", - "\n", - "Let's get started!\n", - "\n", - "[kaggle]: https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/overview\n", - "[siim]: https://siim.org/\n", - "[getting_started]: https://docs.mosaicml.com/projects/composer/en/stable/examples/getting_started.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup\n", - "\n", - "Let's get started and configure our environment.\n", - "\n", - "### Install Dependencies\n", - "\n", - "If you haven't already, let's install the following dependencies, which are needed for this example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install kaggle pydicom git+https://github.com/qubvel/segmentation_models.pytorch opencv-python-headless jupyterlab-widgets\n", - "\n", - "%pip install mosaicml\n", - "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# %pip install git+https://github.com/mosaicml/composer.git" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Kaggle Authentication\n", - "\n", - "To access the data you need a Kaggle Account\n", - "- accept competition terms https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/data\n", - "- download `kaggle.json` from https://www.kaggle.com/yourusername/account by clicking \"Create new API token\"\n", - "- make the `kaggle.json` file available to this notebook using the following code cells." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from ipywidgets import FileUpload\n", - "from IPython.display import display\n", - "uploader = FileUpload(accept='.json', multiple=True)\n", - "display(uploader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "kaggle_folder = os.path.join(os.path.expanduser(\"~\"), \".kaggle\")\n", - "os.makedirs(kaggle_folder, exist_ok=True)\n", - "kaggle_config_file = os.path.join(kaggle_folder, \"kaggle.json\")\n", - "with open(kaggle_config_file, 'wb+') as output_file: \n", - " for uploaded_filename in uploader.value:\n", - " content = uploader.value[uploaded_filename]['content'] \n", - " output_file.write(content) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Download and unzip the data \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kaggle datasets download -d seesee/siim-train-test\n", - "!unzip -q siim-train-test.zip -d .\n", - "!ls" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Flatten Image Directories\n", - "The original dataset is oddly nested. We flatten it out so the images are easier to access in our pytorch dataset.\n", - "\n", - "`/siim/dicom-images-train/id/id/id.dcm` to `/siim/dicom-images-train/id.dcm`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from tqdm.auto import tqdm\n", - "\n", - "train_images = list(Path('siim/dicom-images-train').glob('*/*/*.dcm'))\n", - "for image in tqdm(train_images):\n", - " image.replace(f'siim/dicom-images-train/{image.parts[-1]}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Project setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import itertools\n", - "from ipywidgets import interact, fixed, IntSlider\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import torch\n", - "from torch import nn\n", - "import matplotlib.pyplot as plt\n", - "import cv2\n", - "\n", - "# model\n", - "import segmentation_models_pytorch as smp\n", - "\n", - "# data\n", - "from torch.utils.data import DataLoader, Dataset\n", - "from torchvision.utils import draw_segmentation_masks, make_grid\n", - "from pydicom.filereader import dcmread\n", - "from sklearn.model_selection import StratifiedKFold\n", - "\n", - "# transforms\n", - "from albumentations import ShiftScaleRotate, Resize, Compose\n", - "\n", - "from torchmetrics import Metric\n", - "from torchmetrics.collections import MetricCollection\n", - "\n", - "# composer\n", - "from composer import Trainer\n", - "from composer.models import ComposerModel\n", - "from composer.optim import DecoupledAdamW\n", - "from composer.metrics.metrics import Dice" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Utils\n", - "\n", - "Here we define some utility functions to help with logging, decoding/encoding targets, and visualization." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LossMetric(Metric):\n", - " \"\"\"Turns any torch.nn Loss Module into distributed torchmetrics Metric.\"\"\"\n", - "\n", - " def __init__(self, loss, dist_sync_on_step=False):\n", - " super().__init__(dist_sync_on_step=dist_sync_on_step)\n", - " self.loss = loss\n", - " self.add_state(\"sum_loss\", default=torch.tensor(0.), dist_reduce_fx=\"sum\")\n", - " self.add_state(\"total_batches\", default=torch.tensor(0), dist_reduce_fx=\"sum\")\n", - "\n", - " def update(self, preds, target):\n", - " \"\"\"Update the state with new predictions and targets.\n", - " \"\"\"\n", - " # Loss calculated over samples/batch, accumulate loss over all batches\n", - " self.sum_loss += self.loss(preds, target)\n", - " self.total_batches += 1\n", - "\n", - " def compute(self):\n", - " \"\"\"Aggregate state over all processes and compute the metric.\n", - " \"\"\"\n", - " # Return average loss over entire validation dataset\n", - " return self.sum_loss / self.total_batches\n", - "\n", - "def rle2mask(rle, height=1024, width=1024, fill_value=1):\n", - " mask = np.zeros((height, width), np.float32)\n", - " mask = mask.reshape(-1)\n", - " rle = np.array([int(s) for s in rle.strip().split(' ')])\n", - " rle = rle.reshape(-1, 2)\n", - " start = 0\n", - " for index, length in rle:\n", - " start = start+index\n", - " end = start+length\n", - " mask[start: end] = fill_value\n", - " start = end\n", - " mask = mask.reshape(width, height).T\n", - " return mask\n", - "\n", - "def mask2rle(mask):\n", - " mask = mask.T.flatten()\n", - " start = np.where(mask[1:] > mask[:-1])[0]+1\n", - " end = np.where(mask[:-1] > mask[1:])[0]+1\n", - " length = end-start\n", - " rle = []\n", - " for i in range(len(length)):\n", - " if i == 0:\n", - " rle.extend([start[0], length[0]])\n", - " else:\n", - " rle.extend([start[i]-end[i-1], length[i]])\n", - " rle = ' '.join([str(r) for r in rle])\n", - " return rle" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preprocessing and Data Science" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SIIM Dataset\n", - "\n", - "The SIIM dataset consists of:\n", - "- `dicom-images-train` - 12954 labeled images in [DICOM][dicom] format.\n", - "- `dicom-images-test` - 3205 unlabeled DICOM images for testing\n", - "\n", - "- `train-rle.csv` comes with a label file `train-rle.csv` mapping `ImageId` to `EncodedPixels`.\n", - "\n", - " - `ImageId`s map to image paths for [DICOM][dicom_format] format images. \n", - "\n", - " - `EncodedPixels` are [run length encoded][masks] segmentation masks representing areas where pneumothorax has been labeled by an expert. 
A label of `\"-1\"` indicates the image was examined and no pneumothorax was found.\n", - "\n", - "[dicom]: https://pydicom.github.io/pydicom/stable/auto_examples/input_output/plot_read_dicom\n", - "[dicom_format]: https://pydicom.github.io/pydicom/stable/auto_examples/input_output/plot_read_dicom.html#sphx-glr-auto-examples-input-output-plot-read-dicom-py\n", - "[masks]: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/mask.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!ls siim" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "labels_df = pd.read_csv('siim/train-rle.csv')\n", - "labels_df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Clean Data\n", - "Of the ~13,000 images, only 3600 have masks. We will throw out some of the negative samples to better balance our dataset and speed up training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "labels_df[labels_df[\" EncodedPixels\"] != \"-1\"].shape, labels_df[labels_df[\" EncodedPixels\"] == \"-1\"].shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def balance_labels(labels_df, extra_samples_without_mask=1500, random_state=1337):\n", - " \"\"\"\n", - " Drop duplicates and mark samples with masks.\n", - " Sample 3576+extra_samples_without_mask unmasked samples to balance dataset.\n", - " \"\"\"\n", - " df = labels_df.drop_duplicates('ImageId')\n", - " df_with_mask = df[df[\" EncodedPixels\"] != \"-1\"].copy(deep=True)\n", - " df_with_mask['has_mask'] = 1\n", - " df_without_mask = df[df[\" EncodedPixels\"] == \"-1\"].copy(deep=True)\n", - " df_without_mask['has_mask'] = 0\n", - " df_without_mask_sampled = df_without_mask.sample(len(df_with_mask)+extra_samples_without_mask, random_state=random_state)\n", - " df = pd.concat([df_with_mask, df_without_mask_sampled])\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = balance_labels(labels_df)\n", - "df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Cross Validation Splits\n", - "Once cleaned and balanced, we're left with only 6838 images. This will leave us with rather small training and validation sets once we split the data. To mitigate the chances of us validating on a poorly sampled (not representative of our unlabeled test data) validation set, we use [StratifiedKFold][kfold] to create 5 different 80%-20%, `train` `eval` splits. 
\n", - "\n", - "**Note**: For datasets of this size, it's good practice to train and evaluate on each split, but due to runtime constraints in this notebook we will only train on the first split which contains 5470 training and 1368 eval samples.\n", - "\n", - "[kfold]: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)\n", - "train_idx, eval_idx = list(kfold.split(df[\"ImageId\"], df[\"has_mask\"]))[0]\n", - "train_df, eval_df = df.iloc[train_idx], df.iloc[eval_idx]\n", - "train_df.shape, eval_df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PyTorch\n", - "\n", - "### PyTorch Dataset\n", - "`SIIMDataset` is a standard PyTorch dataset that reads images and decodes labels from the siim label csv. DICOM images are loaded as grayscale numpy arrays, converted to rgb, and scaled. Labels are converted from rle strings to binary segmentation masks. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class SIIMDataset(Dataset):\n", - " def __init__(self, \n", - " labels_df,\n", - " transforms=None,\n", - " image_dir=Path('siim/dicom-images-train')):\n", - " self.labels_df = labels_df\n", - " self.image_dir = image_dir\n", - " self.transforms = transforms\n", - "\n", - " def __getitem__(self, idx):\n", - " row = self.labels_df.iloc[idx]\n", - " image_id = row.ImageId\n", - " image_path = self.image_dir / f'{image_id}.dcm'\n", - " image = dcmread(image_path).pixel_array # load dicom image\n", - " image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) # convert rgb so we can keep imagenet first layer weights\n", - " image = (image / 255.).astype('float32') # scale (0.- 1.)\n", - "\n", - " rle = row[' EncodedPixels']\n", - " if rle != '-1':\n", - " mask = rle2mask(rle, 1024, 1024).astype('float32')\n", - " else:\n", - " mask = np.zeros([1024, 1024]).astype('float32')\n", - "\n", - " if self.transforms:\n", - " augmented = self.transforms(image=image, mask=mask)\n", - " image = augmented['image']\n", - " mask = augmented['mask']\n", - "\n", - " return (\n", - " torch.from_numpy(image).permute(2, 0, 1),\n", - " torch.from_numpy(mask).unsqueeze(0)\n", - " )\n", - "\n", - " def __len__(self):\n", - " return len(self.labels_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Transforms\n", - "We use the [albumentations](https://albumentations.ai/docs/getting_started/mask_augmentation/) library to resize and randomly scale/rotate our training images. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_size = 512\n", - "\n", - "train_transforms = Compose(\n", - " [\n", - " Resize(image_size, image_size),\n", - " ShiftScaleRotate(\n", - " shift_limit=0,\n", - " scale_limit=0.1,\n", - " rotate_limit=10, # rotate\n", - " p=0.5,\n", - " border_mode=cv2.BORDER_CONSTANT\n", - " )\n", - " ]\n", - ")\n", - "\n", - "eval_transforms = Compose([Resize(image_size, image_size)])\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DataLoaders" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "train_batch_size = 32\n", - "val_batch_size = 32\n", - "\n", - "train_dataloader = DataLoader(SIIMDataset(train_df, transforms=train_transforms),\n", - " batch_size=train_batch_size, shuffle=True, num_workers=2)\n", - "\n", - "eval_dataloader = DataLoader(SIIMDataset(eval_df, transforms=eval_transforms),\n", - " batch_size=val_batch_size, shuffle=False, num_workers=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Visualize batch\n", - "Areas of pneumothorax are highlighted in red; drag the slider to iterate through batches." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@interact(data_loader=fixed(train_dataloader), batch=IntSlider(min=0, max=len(train_dataloader)-1, step=1, value=0))\n", - "def show_batch(data_loader, batch):\n", - " plt.rcParams['figure.figsize'] = [20, 15]\n", - "\n", - " images, masks = list(itertools.islice(data_loader, batch, batch+1))[0]\n", - " masks_list = []\n", - " for image, mask in zip(images, masks):\n", - " masked = draw_segmentation_masks((image * 255).byte(),\n", - " mask.bool(), alpha=0.5, colors='red')\n", - " masks_list.append(masked)\n", - "\n", - " grid = make_grid(masks_list, nrow=6)\n", - " plt.imshow(grid.permute(1, 2, 0));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Composer\n", - "\n", - "### Model\n", - "\n", - "Here we define a Composer model that wraps the smp [segmentation models pytorch][pytorch_seg] package. This lets us quickly create many different segmentation models made from common pre-trained PyTorch encoders. \n", - "\n", - "- We set defaults to create a [Unet][unet] from an ImageNet pre-trained ResNet-34 with 3 input channels for our RGB (converted) inputs and 1 output channel. 
\n", - "- We set the default loss to `nn.BCEWithLogitsLoss()` to classify each pixel of the output.\n", - "\n", - "[pytorch_seg]: https://github.com/qubvel/segmentation_models.pytorch\n", - "[unet]: https://arxiv.org/abs/1505.04597" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class SMPUNet(ComposerModel):\n", - " def __init__(self,\n", - " encoder_name='resnet34',\n", - " encoder_weights='imagenet',\n", - " in_channels=3, classes=1,\n", - " loss=nn.BCEWithLogitsLoss()):\n", - " super().__init__()\n", - " self.model = smp.Unet(\n", - " encoder_name=encoder_name,\n", - " encoder_weights=encoder_weights, # use `imagenet` pre-trained weights for encoder initialization\n", - " in_channels=in_channels, # model input channels (1 for gray-scale images, 3 for RGB, etc.)\n", - " classes=classes # model output channels (number of classes in your dataset)\n", - " ) \n", - "\n", - " self.criterion = loss\n", - " self.train_loss = LossMetric(loss)\n", - " self.val_loss = LossMetric(loss)\n", - " self.val_dice = Dice(num_classes=classes)\n", - "\n", - " def forward(self, batch):\n", - " images, targets = batch\n", - " return self.model(images)\n", - "\n", - " def loss(self, outputs, batch):\n", - " _, targets = batch\n", - " return self.criterion(outputs, targets)\n", - "\n", - " def get_metrics(self, is_train: bool = False):\n", - " if is_train:\n", - " return {'BCEWithLogitsLoss': self.train_loss}\n", - " else:\n", - " return {'BCEWithLogitsLoss': self.val_loss, 'Dice': self.dice}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = SMPUNet() # define unet model\n", - "optimizer = DecoupledAdamW(model.parameters(), lr=1e-3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Trainer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer = Trainer(\n", - " model=model,\n", - " train_dataloader=train_dataloader,\n", - " eval_dataloader=eval_dataloader,\n", - " max_duration='2ep',\n", - " optimizers=optimizer,\n", - " device='gpu',\n", - " precision='amp',\n", - " seed=1337\n", - ")\n", - "trainer.fit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Algorithms\n", - "\n", - "Composer allows us to quickly experiment with algorithms that can speed up or improve the quality of our model. This is how we can add `CutOut` and `LabelSmoothing`\n", - "\n", - "Additionally, the Composer trainer has builtin support for automatic mixed precision training and gradient accumulation to help train quickly and simulate larger batch sizes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from composer.algorithms import CutOut, LabelSmoothing\n", - "\n", - "model = SMPUNet() # define unet model\n", - "optimizer = DecoupledAdamW(model.parameters(), lr=1e-3)\n", - "\n", - "algorithms = [CutOut(length=0.5), LabelSmoothing(smoothing=0.1)]\n", - "\n", - "trainer = Trainer(\n", - " model=model,\n", - " train_dataloader=train_dataloader,\n", - " eval_dataloader=eval_dataloader,\n", - " max_duration='2ep',\n", - " optimizers=optimizer,\n", - " algorithms=algorithms,\n", - " device='gpu',\n", - " precision='amp',\n", - " seed=1337\n", - ")\n", - "trainer.fit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## What next?\n", - "\n", - "You've now seen a from-scratch demonstration of using Composer in a computer vision project. But don't stop here! If you're interested, we recommend that you continue to experiment with:\n", - "\n", - "- training longer\n", - "- different loss functions, architectures, transformations, and\n", - "- different combinations of composer methods!\n", - "\n", - "In addition, please continue to explore our tutorials! Here are a couple suggestions:\n", - "\n", - "* Continue to explore more advanced applications of Composer like [fine-tuning a transformer for sentiment classification][huggingface_tutorial].\n", - "\n", - "* Learn about callbacks and how to apply [early stopping][early_stopping_tutorial].\n", - "\n", - "* See how dataloading bottlenecks in computer vision can be addressed using [FFCV][ffcv].\n", - "\n", - "[image_segmentation_tutorial]: https://docs.mosaicml.com/projects/composer/en/stable/examples/medical_image_segmentation.html\n", - "[huggingface_tutorial]: https://docs.mosaicml.com/projects/composer/en/stable/examples/huggingface_models.html\n", - "[early_stopping_tutorial]: https://docs.mosaicml.com/projects/composer/en/stable/examples/early_stopping.html\n", - "[ffcv]: https://docs.mosaicml.com/projects/composer/en/stable/examples/ffcv_dataloaders.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Come get involved with MosaicML!\n", - "\n", - "We'd love for you to get involved with the MosaicML community in any of these ways:\n", - "\n", - "### [Star Composer on GitHub](https://github.com/mosaicml/composer)\n", - "\n", - "Help make others aware of our work by [starring Composer on GitHub](https://github.com/mosaicml/composer).\n", - "\n", - "### [Join the MosaicML Slack](https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg)\n", - "\n", - "Head on over to the [MosaicML slack](https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg) to join other ML efficiency enthusiasts. Come for the paper discussions, stay for the memes!\n", - "\n", - "### Contribute to Composer\n", - "\n", - "Is there a bug you noticed or a feature you'd like? File an [issue](https://github.com/mosaicml/composer/issues) or make a [pull request](https://github.com/mosaicml/composer/pulls)!" 
- ] - } - ], - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/examples/profiler_demo.py b/examples/profiler_demo.py index f06fa17f06..d46c89e559 100644 --- a/examples/profiler_demo.py +++ b/examples/profiler_demo.py @@ -8,11 +8,13 @@ # [imports-start] import torch +import torch.nn as nn +import torch.nn.functional as F from torch.utils.data import DataLoader from torchvision import datasets, transforms from composer import Trainer -from composer.models import mnist_model +from composer.models.tasks import ComposerClassifier from composer.profiler import JSONTraceHandler, cyclic_schedule from composer.profiler.profiler import Profiler @@ -35,10 +37,39 @@ persistent_workers=True, num_workers=8, ) + # [dataloader-end] + # Instantiate Model -model = mnist_model(num_classes=10) +class Model(nn.Module): + """Toy convolutional neural network architecture in pytorch for MNIST.""" + + def __init__(self, num_classes: int = 10): + super().__init__() + + self.num_classes = num_classes + + self.conv1 = nn.Conv2d(1, 16, (3, 3), padding=0) + self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=0) + self.bn = nn.BatchNorm2d(32) + self.fc1 = nn.Linear(32 * 16, 32) + self.fc2 = nn.Linear(32, num_classes) + + def forward(self, x): + out = self.conv1(x) + out = F.relu(out) + out = self.conv2(out) + out = self.bn(out) + out = F.relu(out) + out = F.adaptive_avg_pool2d(out, (4, 4)) + out = torch.flatten(out, 1, -1) + out = self.fc1(out) + out = F.relu(out) + return self.fc2(out) + + +model = ComposerClassifier(module=Model(num_classes=10)) # [trainer-start] # Instantiate the trainer diff --git a/examples/segmentation/README.md b/examples/segmentation/README.md deleted file mode 100644 index 8eaa391184..0000000000 --- a/examples/segmentation/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# Semantic Segmentation Example - -This example illustrates how to train a semantic segmentation model in composer. - -## Installation - -First, install [Composer](https://github.com/mosaicml/composer) with `pip install mosaicml`. Additionally, our models are pulled from [MMsegmentation](https://github.com/open-mmlab/mmsegmentation), so follow the [MMcv install instructions](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) (which is dependent on your CUDA and PyTorch versions), then install MMsegmentation with `pip install mmsegmentation`. - -Alternatively, we have publicly available Docker images to reproduce our results. Use `mosaicml/pytorch_vision:1.12.1_cu116-python3.9-ubuntu20.04` for running on GPUs or `mosaicml/pytorch_vision:1.12.1_cpu-python3.9-ubuntu20.04` for running on CPUs. - -## DeepLabv3+ on ADE20k - -The `train_deeplabv3_ade20k.py` script trains a DeepLabv3+ model with either a ResNet-50 or ResNet-101 backbone on the ADE20k semantic segmentation benchmark. To download ADE20k locally (~1 GB), specify the `--download` option when running the script, then the dataset will be downloaded data directory path i.e. the first argument. - -We designed the script to be hackable, so try our recipes on your own models and datsets! 
-### Example configurations - - - -```bash -# Downloads ADE20k and does single GPU/CPU training depending on torch.cuda.is_available(): -python train_deeplabv3_ade20k.py /path/to/ade20k --download - -# Log experiments to Weights and Biases: -python train_deeplabv3_ade20k.py /path/to/ade20k --wandb_logger --wandb_entity my_username --wandb_project my_project --run_name my_run_name - -# Single/Multi GPU training (infers the number of GPUs available): -composer train_deeplabv3_ade20k.py /path/to/ade20k - -# Manually specify number of GPUs to use: -composer -n $N_GPUS train_deeplabv3_ade20k.py /path/to/ade20k - -# Mild DeepLabv3+ recipe for fastest training to 45.6 mIoU: -composer train_deeplabv3_ade20k.py /path/to/ade20k/ --recipe_name mild --max_duration 25ep - -# Medium DeepLabv3+ recipe for highest mIoU (49.15) with similar training time as baseline: -composer train_deeplabv3_ade20k.py /path/to/ade20k/ --recipe_name medium --max_duration 90ep - -# Hot DeepLabv3+ recipe for highest mIoU (49.83) with a long training schedule: -composer train_deeplabv3_ade20k.py /path/to/ade20k --recipe_name hot --max_duration 256ep -``` diff --git a/examples/segmentation/train_deeplabv3_ade20k.py b/examples/segmentation/train_deeplabv3_ade20k.py deleted file mode 100644 index 90d93aa037..0000000000 --- a/examples/segmentation/train_deeplabv3_ade20k.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""Example script to train a DeepLabv3+ model on ADE20k for semantic segmentation.""" - -import argparse -import logging -import os - -import torch -import torchvision -from torch.utils.data import DataLoader -from torchmetrics import MetricCollection -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode - -from composer import DataSpec, Time, Trainer -from composer.algorithms import EMA, SAM, ChannelsLast, MixUp -from composer.callbacks import CheckpointSaver, ImageVisualizer, LRMonitor, SpeedMonitor -from composer.datasets.ade20k import (ADE20k, PadToSize, PhotometricDistoration, RandomCropPair, RandomHFlipPair, - RandomResizePair) -from composer.datasets.utils import NormalizationFn, pil_image_collate -from composer.loggers import WandBLogger -from composer.loss import DiceLoss, soft_cross_entropy -from composer.metrics import CrossEntropy, MIoU -from composer.models import ComposerClassifier -from composer.models.deeplabv3.model import deeplabv3 -from composer.optim import CosineAnnealingScheduler, DecoupledSGDW -from composer.utils import dist - -logging.basicConfig() -logging.getLogger().setLevel(logging.INFO) - -parser = argparse.ArgumentParser() - -# Dataloader command-line arguments -parser.add_argument('data_dir', help='Path to the directory containing the ImageNet-1k dataset', type=str) -parser.add_argument('--download', - help='Use to download ADE20k from the internet and put it in the `data_dir`', - action='store_true') -parser.add_argument('--train_resize_size', help='Training image resize size', type=int, default=512) -parser.add_argument('--eval_resize_size', help='Evaluation image resize size', type=int, default=512) -parser.add_argument('--train_batch_size', help='Train dataloader per-device batch size', type=int, default=128) -parser.add_argument('--eval_batch_size', help='Validation dataloader per-device batch size', type=int, default=128) - -# Model command-line arguments -parser.add_argument('--backbone_arch', - help='Architecture to use for the backbone.', - 
default='resnet101', - choices=['resnet50', 'resnet101']) -parser.add_argument('--sync_bn', - help='Use sync BatchNorm. Recommended if the per device microbatch size is below 16', - action='store_true') -parser.add_argument('--cross_entropy_weight', help='Weight to scale the cross entropy loss', type=float, default=0.375) -parser.add_argument('--dice_weight', help='Weight to scale the dice loss', type=float, default=1.125) - -# Optimizer command-line arguments -parser.add_argument('--learning_rate', help='Optimizer learning rate', type=float, default=0.08) -parser.add_argument('--momentum', help='Optimizer momentum', type=float, default=0.9) -parser.add_argument('--weight_decay', help='Optimizer weight decay', type=float, default=5.0e-5) - -# Save checkpoint command-line arguments -parser.add_argument('--save_checkpoint_dir', - help='Directory in which to save model checkpoints', - type=str, - default='checkpoints/{run_name}') -parser.add_argument('--checkpoint_interval', - help='Frequency to save checkpoints', - type=Time.from_timestring, - default='1ep') - -# Load checkpoint command-line arguments, assumes resuming from a previous training run (as opposed to fine-tuning) -parser.add_argument('--load_checkpoint_path', help='Path to the checkpoint to load', type=str) - -# Recipes command-line argument -parser.add_argument('--recipe_name', - help='Algorithmic recipes to be applied to the trainer', - choices=['mild', 'medium', 'hot']) - -# Logger command-line arguments -# Note: Only Weights and Biases to minimize arguments. Other loggers can be used by adjusting the script -parser.add_argument('--wandb_logger', help='Whether or not to log results to Weights and Biases', action='store_true') -parser.add_argument('--wandb_entity', help='WandB entity name', type=str) -parser.add_argument('--wandb_project', help='WandB project name', type=str) - -parser.add_argument('--image_viz', help='Whether or not to log images using ImageVisualizer', action='store_true') - -# Trainer arguments -parser.add_argument('--device_train_microbatch_size', - help='Size of train microbatch size if running on GPU', - default='auto') -parser.add_argument('--run_name', help='Name of the training run used for checkpointing and logging', type=str) -parser.add_argument('--seed', help='Random seed', type=int, default=17) -parser.add_argument('--max_duration', - help='Duration to train specified as a Time string', - type=Time.from_timestring, - default='128ep') - -args = parser.parse_args() - -IMAGENET_CHANNEL_MEAN = (int(0.485 * 255), int(0.456 * 255), int(0.406 * 255)) -IMAGENET_CHANNEL_STD = (int(0.229 * 255), int(0.224 * 255), int(0.225 * 255)) - -ADE20K_URL = 'http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip' -ADE20K_FILE = 'ADEChallengeData2016.zip' - - -def _main(): - # Divide batch size by number of devices - if dist.get_world_size() > 1: - args.train_batch_size = args.train_batch_size // dist.get_world_size() - args.eval_batch_size = args.eval_batch_size // dist.get_world_size() - - # Train dataset code - logging.info('Building train dataloader') - - if args.download: - torchvision.datasets.utils.download_and_extract_archive(url=ADE20K_URL, - download_root=args.data_dir, - filename=ADE20K_FILE, - remove_finished=True) - # Adjust the data_dir to include the extracted directory - args.data_dir = os.path.join(args.data_dir, 'ADEChallengeData2016') - - # Training transforms applied to both the image and target - train_both_transforms = torch.nn.Sequential( - RandomResizePair( - min_scale=0.5, - 
max_scale=2.0, - base_size=(args.train_resize_size, args.train_resize_size), - ), - RandomCropPair( - crop_size=(args.train_resize_size, args.train_resize_size), - class_max_percent=0.75, - num_retry=10, - ), - RandomHFlipPair(), - ) - - # Training transforms applied to the image only - train_image_transforms = torch.nn.Sequential( - PhotometricDistoration( - brightness=32. / 255, - contrast=0.5, - saturation=0.5, - hue=18. / 255, - ), - PadToSize( - size=(args.train_resize_size, args.train_resize_size), - fill=IMAGENET_CHANNEL_MEAN, - ), - ) - - # Training transforms applied to the target only - train_target_transforms = PadToSize(size=(args.train_resize_size, args.train_resize_size), fill=0) - - # Create ADE20k train dataset - train_dataset = ADE20k( - datadir=args.data_dir, - split='training', - image_transforms=train_image_transforms, - target_transforms=train_target_transforms, - both_transforms=train_both_transforms, - ) - - # Create ADE20k train dataloader - - train_sampler = None - if dist.get_world_size(): - # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware setup - train_sampler = dist.get_sampler(train_dataset, drop_last=True, shuffle=True) - - train_dataloader = DataLoader( - train_dataset, - batch_size=args.train_batch_size, - num_workers=8, - pin_memory=True, - drop_last=True, # Prevents using a smaller batch at the end of an epoch - sampler=train_sampler, - collate_fn=pil_image_collate, - persistent_workers=True, - ) - - # DataSpec enables image normalization to be performed on-GPU, marginally relieving dataloader bottleneck - train_dataspec = DataSpec(dataloader=train_dataloader, - device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, - std=IMAGENET_CHANNEL_STD, - ignore_background=True)) - logging.info('Built train dataloader\n') - - # Validation dataset code - logging.info('Building evaluation dataloader') - - # Validation image and target transformations - image_transforms = transforms.Resize(size=(args.eval_resize_size, args.eval_resize_size), - interpolation=InterpolationMode.BILINEAR) - target_transforms = transforms.Resize(size=(args.eval_resize_size, args.eval_resize_size), - interpolation=InterpolationMode.NEAREST) - - # Create ADE20k validation dataset - val_dataset = ADE20k(datadir=args.data_dir, - split='validation', - both_transforms=None, - image_transforms=image_transforms, - target_transforms=target_transforms) - - #Create ADE20k validation dataloader - - val_sampler = None - if dist.get_world_size(): - # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware - val_sampler = dist.get_sampler(val_dataset, drop_last=False, shuffle=False) - - val_dataloader = DataLoader( - val_dataset, - batch_size=args.eval_batch_size, - num_workers=8, - pin_memory=True, - drop_last=False, - sampler=val_sampler, - collate_fn=pil_image_collate, - persistent_workers=True, - ) - - # DataSpec enables image normalization to be performed on-GPU, marginally relieving dataloader bottleneck - val_dataspec = DataSpec(dataloader=val_dataloader, - device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, - std=IMAGENET_CHANNEL_STD, - ignore_background=True)) - logging.info('Built validation dataset\n') - - logging.info('Building Composer DeepLabv3+ model') - - # Create a DeepLabv3+ model - model = deeplabv3( - num_classes=150, - backbone_arch=args.backbone_arch, - backbone_weights='IMAGENET1K_V2', - sync_bn=args.sync_bn, - use_plus=True, - ) - - # Initialize the classifier head only since the backbone uses pre-trained 
weights - def weight_init(module: torch.nn.Module): - if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): - torch.nn.init.kaiming_normal_(module.weight) - if isinstance(module, torch.nn.BatchNorm2d): - torch.nn.init.ones_(module.weight) - torch.nn.init.zeros_(module.bias) - - model.classifier.apply(weight_init) # type: ignore Does not recognize classifier as a torch.nn.Module - - # Loss function to use during training - # This ignores index -1 since the NormalizationFn transformation sets the background class to -1 - dice_loss_fn = DiceLoss(softmax=True, batch=True, ignore_absent_classes=True) - - def combo_loss(output, target): - loss = {} - loss['cross_entropy'] = soft_cross_entropy(output, target, ignore_index=-1) - loss['dice'] = dice_loss_fn(output, target) - loss['total'] = args.cross_entropy_weight * loss['cross_entropy'] + args.dice_weight * loss['dice'] - return loss - - # Training and Validation metrics to log throughout training - train_metrics = MetricCollection([CrossEntropy(ignore_index=-1), MIoU(num_classes=150, ignore_index=-1)]) - val_metrics = MetricCollection([CrossEntropy(ignore_index=-1), MIoU(num_classes=150, ignore_index=-1)]) - - # Create a ComposerClassifier using the model, loss function, and metrics - composer_model = ComposerClassifier(module=model, - train_metrics=train_metrics, - val_metrics=val_metrics, - loss_fn=combo_loss) - - logging.info('Built Composer DeepLabv3+ model\n') - - logging.info('Building optimizer and learning rate scheduler') - # Optimizer - optimizer = DecoupledSGDW(composer_model.parameters(), - lr=args.learning_rate, - momentum=args.momentum, - weight_decay=args.weight_decay) - - # Only use a LR schedule if no recipe is specified or if the hot recipe was specified - lr_scheduler = None - if args.recipe_name is None or args.recipe_name == 'hot': - lr_scheduler = CosineAnnealingScheduler() - - logging.info('Built optimizer and learning rate scheduler') - - logging.info('Building callbacks: SpeedMonitor, LRMonitor, and CheckpointSaver') - speed_monitor = SpeedMonitor(window_size=50) # Measures throughput as samples/sec and tracks total training time - lr_monitor = LRMonitor() # Logs the learning rate - - # Callback for checkpointing - checkpoint_saver = CheckpointSaver(folder=args.save_checkpoint_dir, save_interval=args.checkpoint_interval) - logging.info('Built callbacks: SpeedMonitor, LRMonitor, and CheckpointSaver\n') - - # Recipes for training DeepLabv3+ on ImageNet in order of increasing training time and accuracy - # To learn about individual methods, check out "Methods Overview" in our documentation: https://docs.mosaicml.com/ - logging.info('Building algorithm recipes') - if args.recipe_name == 'mild': - algorithms = [ - ChannelsLast(), - EMA(half_life='1000ba', update_interval='10ba'), - ] - elif args.recipe_name == 'medium': - algorithms = [ - ChannelsLast(), - EMA(half_life='1000ba', update_interval='10ba'), - SAM(rho=0.3, interval=2), - MixUp(alpha=0.2), - ] - elif args.recipe_name == 'hot': - algorithms = [ - ChannelsLast(), - EMA(half_life='2000ba', update_interval='1ba'), - SAM(rho=0.3, interval=1), - MixUp(alpha=0.5), - ] - else: - algorithms = None - logging.info('Built algorithm recipes\n') - - # Weight and Biases logger if specified in commandline - logger = None - if args.wandb_logger: - logging.info('Building Weights and Biases logger') - if args.wandb_entity is None: - raise ValueError('Please specify --wandb_entity argument') - if args.wandb_project is None: - raise ValueError('Please specify --wandb_project 
argument') - logger = WandBLogger(entity=args.wandb_entity, project=args.wandb_project) - logging.info('Built Weights and Biases logger') - - callbacks = [speed_monitor, lr_monitor, checkpoint_saver] - if args.image_viz: - callbacks.append(ImageVisualizer(mode='segmentation')) - # Create the Trainer! - logging.info('Building Trainer') - device = 'gpu' if torch.cuda.is_available() else 'cpu' - precision = 'amp' if device == 'gpu' else 'fp32' # Mixed precision for fast training when using a GPU - device_train_microbatch_size = 'auto' if device == 'gpu' else args.device_train_microbatch_size # If on GPU, use 'auto' gradient accumulation - trainer = Trainer(run_name=args.run_name, - model=composer_model, - train_dataloader=train_dataspec, - eval_dataloader=val_dataspec, - eval_interval='1ep', - optimizers=optimizer, - schedulers=lr_scheduler, - algorithms=algorithms, - loggers=logger, - max_duration=args.max_duration, - callbacks=callbacks, - load_path=args.load_checkpoint_path, - device=device, - precision=precision, - device_train_microbatch_size=device_train_microbatch_size, - seed=args.seed) - logging.info('Built Trainer\n') - - # Start training! - logging.info('Train!') - trainer.fit() - - -if __name__ == '__main__': - _main() diff --git a/pyproject.toml b/pyproject.toml index f4155e23ae..a4800ea34b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,18 +82,15 @@ reportUnusedCoroutine = "error" # Pytest [tool.pytest.ini_options] # By default, do not run gpu, vision, docs, notebook, or daily tests -addopts = "--codeblocks --strict-markers -m 'not gpu and not vision and not doctest and not daily and not remote'" +addopts = "--codeblocks --strict-markers -m 'not gpu and not doctest and not daily and not remote'" markers = [ - # !!!!!!!!!!!IMPORTANT!!!!!!!!!: when updating the markers, also make sure to update meta.yaml # Tests that require a world_size of two should be annotated with `@pytest.mark.world_size(2)`. 
# If not specified, the test will be assumed to have a world-size of one, which is # equivalent to `@pytest.mark.world_size(1)` "world_size(val)", # Tests that require a gpu should be annotated with `@pytest.mark.gpu` "gpu", - # Whether the test should run in a container based on the vision dockerimage, which contains ffcv and opencv - "vision", # Tests which are run as part of the documentation build "doctest", # Should be run during daily regression diff --git a/setup.py b/setup.py index 3555668370..c87feaf05e 100644 --- a/setup.py +++ b/setup.py @@ -142,10 +142,6 @@ def package_files(prefix: str, directory: str, extension: str): 'setuptools<=59.5.0', ] -extra_deps['health_checker'] = { - 'pynvml>=11.5.0,<12', -} - extra_deps['system_metrics_monitor'] = { 'pynvml>=11.5.0,<12', } @@ -171,19 +167,6 @@ def package_files(prefix: str, directory: str, extension: str): 'tensorboard>=2.9.1,<3.0.0', ] -extra_deps['unet'] = [ - 'monai>=0.9.1,<1.4', - 'scikit-learn>=1.0.1,<2', -] - -extra_deps['vit'] = [ - 'vit_pytorch==1.6.1', -] - -extra_deps['timm'] = [ - 'timm>=0.5.4,<0.6', -] - extra_deps['coco'] = [ 'pycocotools>=2.0.4,<3', ] diff --git a/tests/algorithms/algorithm_settings.py b/tests/algorithms/algorithm_settings.py index 940ca040f2..91ecf2dac2 100644 --- a/tests/algorithms/algorithm_settings.py +++ b/tests/algorithms/algorithm_settings.py @@ -21,12 +21,11 @@ LabelSmoothing, LayerFreezing, LowPrecisionGroupNorm, LowPrecisionLayerNorm, MixUp, NoOpModel, ProgressiveResizing, RandAugment, SelectiveBackprop, SeqLengthWarmup, SqueezeExcite, StochasticDepth, WeightStandardization) -from composer.models import composer_resnet from composer.models.base import ComposerModel from composer.utils import dist from tests.common import get_module_subclasses from tests.common.datasets import RandomImageDataset, SimpleDataset, dummy_bert_lm_dataloader, dummy_gpt_lm_dataloader -from tests.common.models import (SimpleConvModel, SimpleModelWithDropout, configure_tiny_bert_hf_model, +from tests.common.models import (SimpleConvModel, SimpleModelWithDropout, composer_resnet, configure_tiny_bert_hf_model, configure_tiny_gpt2_hf_model) simple_bert_settings = { diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py index ddb05a0c3c..fd3fad0628 100644 --- a/tests/algorithms/test_required_on_load.py +++ b/tests/algorithms/test_required_on_load.py @@ -14,8 +14,8 @@ from composer import Trainer, algorithms from composer.callbacks import CheckpointSaver from composer.core import Algorithm, Event, Time, TimeUnit # type: ignore imports used in `eval(representation)` -from composer.models import ComposerClassifier, ComposerModel, composer_resnet -from tests.common import ConvModel, SimpleConvModel +from composer.models import ComposerClassifier, ComposerModel +from tests.common import ConvModel, SimpleConvModel, composer_resnet def initialize_algorithm(algo_cls: Type): diff --git a/tests/algorithms/test_stochastic_depth.py b/tests/algorithms/test_stochastic_depth.py index 23c21bd816..2ec267756a 100644 --- a/tests/algorithms/test_stochastic_depth.py +++ b/tests/algorithms/test_stochastic_depth.py @@ -14,8 +14,8 @@ from composer.algorithms.stochastic_depth.stochastic_layers import make_resnet_bottleneck_stochastic from composer.core import Event, State from composer.core.time import TimeUnit -from composer.models import composer_resnet from composer.utils import module_surgery +from tests.common import composer_resnet @pytest.fixture() diff --git 
a/tests/callbacks/callback_settings.py b/tests/callbacks/callback_settings.py index 26a1eeb3df..492b5988be 100644 --- a/tests/callbacks/callback_settings.py +++ b/tests/callbacks/callback_settings.py @@ -11,9 +11,9 @@ import composer.loggers import composer.profiler from composer import Callback -from composer.callbacks import (EarlyStopper, ExportForInferenceCallback, FreeOutputs, Generate, HealthChecker, - ImageVisualizer, MemoryMonitor, MemorySnapshot, MLPerfCallback, SpeedMonitor, - SystemMetricsMonitor, ThresholdStopper) +from composer.callbacks import (EarlyStopper, ExportForInferenceCallback, FreeOutputs, Generate, ImageVisualizer, + MemoryMonitor, MemorySnapshot, MLPerfCallback, SpeedMonitor, SystemMetricsMonitor, + ThresholdStopper) from composer.loggers import (CometMLLogger, ConsoleLogger, LoggerDestination, MLFlowLogger, ProgressBarLogger, RemoteUploaderDownloader, TensorboardLogger, WandBLogger) from composer.models.base import ComposerModel @@ -149,7 +149,6 @@ ImageVisualizer: [pytest.mark.skipif(not _WANDB_INSTALLED, reason='Wandb is optional')], MLFlowLogger: [pytest.mark.skipif(not _MLFLOW_INSTALLED, reason='mlflow is optional'),], SystemMetricsMonitor: [pytest.mark.skipif(not _PYNMVL_INSTALLED, reason='pynmvl is optional'),], - HealthChecker: [pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*')], } diff --git a/tests/callbacks/test_health_checker.py b/tests/callbacks/test_health_checker.py deleted file mode 100644 index 5638699ca9..0000000000 --- a/tests/callbacks/test_health_checker.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import datetime -from unittest.mock import MagicMock, patch - -import pytest - -from composer import Timestamp -from composer.callbacks import HealthChecker -from composer.callbacks.health_checker import GPUUtilization -from composer.utils import dist -from tests.common import world_size - -pynvml = pytest.importorskip('pynvml') -pytest.importorskip('slack_sdk') - - -class MockUtil: - - def __init__(self, util): - self.gpu = util - - -@pytest.mark.gpu -@world_size(1, 2) -@pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*') -def test_gpu_utilization(world_size): - assert HealthChecker._is_available() - - gpu_utilization_values = [ - MockUtil(100), - MockUtil(10), - MockUtil(100), - MockUtil(100), - MockUtil(100), - MockUtil(100), - ] - - with patch.multiple(pynvml, - nvmlDeviceGetUtilizationRates=MagicMock(side_effect=gpu_utilization_values), - nvmlDeviceGetCount=MagicMock(return_value=world_size)): - - gpu_utilization = GPUUtilization() - gpu_utilization.sample() - gpu_utilization.sample() - gpu_utilization.sample() - _, alert = gpu_utilization.check() - - should_alert = dist.get_local_rank() == 0 and world_size > 1 - assert alert == should_alert - - -@pytest.mark.gpu -@world_size(1, 2) -@pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*') -def test_health_checker(world_size): - - state = MagicMock() - state.run_name = 'pytest-mock-run-kwei73' - logger = MagicMock() - - health_checker = HealthChecker( - sample_freq=1, - window_size=3, - wait=0, - ) - - gpu_utilization_values = [ - MockUtil(100), - MockUtil(10), - MockUtil(100), - MockUtil(100), - MockUtil(100), - MockUtil(100), - ] - - with patch.multiple(pynvml, - nvmlDeviceGetUtilizationRates=MagicMock(side_effect=gpu_utilization_values), - nvmlDeviceGetCount=MagicMock(return_value=world_size)): - - # collect data and checker - for seconds in [1, 2, 3]: - state.timestamp 
= Timestamp(total_wct=datetime.timedelta(seconds=seconds)) - health_checker.after_train_batch(state, logger) - - should_alert = dist.get_local_rank() == 0 and world_size > 1 - assert health_checker.metrics[0].alerted == should_alert - - -@pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*') -def test_health_checker_sampling(): - timestamp = Timestamp(total_wct=datetime.timedelta(seconds=0)) - - health_checker = HealthChecker( - sample_freq=1, - window_size=5, - wait=10, - ) - - config = [ - (5, False), # before wait - (11, True), - (11.5, False), # below sample frequency - (12, True), - (20, True), - (11, False), # no time travel - ] - - for seconds, is_sample in config: - timestamp = Timestamp(total_wct=datetime.timedelta(seconds=seconds)) - assert health_checker._sample(timestamp) == is_sample diff --git a/tests/callbacks/test_inference.py b/tests/callbacks/test_inference.py index 960aec9a04..bef07c081c 100644 --- a/tests/callbacks/test_inference.py +++ b/tests/callbacks/test_inference.py @@ -13,9 +13,9 @@ from torch.utils.data import DataLoader from composer.callbacks import ExportForInferenceCallback, export_for_inference -from composer.models import composer_resnet from composer.trainer import Trainer from tests.common.datasets import RandomImageDataset +from tests.common.models import composer_resnet @pytest.mark.parametrize( diff --git a/tests/common/__init__.py b/tests/common/__init__.py index be2a508860..bcc9903e61 100644 --- a/tests/common/__init__.py +++ b/tests/common/__init__.py @@ -12,7 +12,7 @@ from tests.common.markers import device, world_size from tests.common.models import (ConvModel, EmbeddedWeightTiedModel, EmptyModel, SimpleConvModel, SimpleModel, SimpleModelWithDropout, SimpleTransformerClassifier, SimpleTransformerMaskedLM, - SimpleWeightTiedModel, ZeroModel) + SimpleWeightTiedModel, ZeroModel, composer_resnet) from tests.common.state import assert_state_equivalent @@ -46,4 +46,5 @@ def get_module_subclasses(module: types.ModuleType, cls: Type) -> List[Type]: 'ParityDataset', 'SimpleDataset', 'InfiniteClassificationDataset', + 'composer_resnet', ] diff --git a/tests/common/models.py b/tests/common/models.py index a0b66d8929..d8bf2994d4 100644 --- a/tests/common/models.py +++ b/tests/common/models.py @@ -4,15 +4,18 @@ """Contains commonly used models that are shared across the test suite.""" import copy from functools import partial -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import pytest import torch from torchmetrics import Metric, MetricCollection +from torchmetrics.classification import MulticlassAccuracy +from torchvision.models import resnet +from composer.loss import loss_registry from composer.metrics import CrossEntropy, MIoU from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy -from composer.models import ComposerClassifier, HuggingFaceModel +from composer.models import ComposerClassifier, HuggingFaceModel, Initializer if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -440,6 +443,64 @@ def forward(self, batch: Tuple[torch.Tensor, Any]) -> torch.Tensor: return outputs +def composer_resnet( + model_name: str, + num_classes: int = 1000, + weights: Optional[str] = None, + groups: int = 1, + width_per_group: int = 64, + initializers: Optional[List[Initializer]] = None, + loss_name: str = 'soft_cross_entropy', +) -> ComposerClassifier: + """Helper function 
to create a :class:`.ComposerClassifier` with a torchvision ResNet model. + From `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`_ (He et al, 2015). + Args: + model_name (str): Name of the ResNet model instance. One of [``"resnet18"``, ``"resnet34"``, ``"resnet50"``, ``"resnet101"``, + ``"resnet152"``]. + num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``. + weights (str, optional): If provided, pretrained weights can be specified, such as with ``IMAGENET1K_V2``. Default: ``None``. + groups (int, optional): Number of filter groups for the 3x3 convolution layer in bottleneck blocks. Default: ``1``. + width_per_group (int, optional): Initial width for each convolution group. Width doubles after each stage. + Default: ``64``. + initializers (List[Initializer], optional): Initializers for the model. ``None`` for no initialization. + Default: ``None``. + loss_name (str, optional): Loss function to use. E.g. 'soft_cross_entropy' or + 'binary_cross_entropy_with_logits'. Loss function must be in + :mod:`~composer.loss.loss`. Default: ``'soft_cross_entropy'``. + Returns: + ComposerModel: instance of :class:`.ComposerClassifier` with a torchvision ResNet model. + """ + valid_model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] + if model_name not in valid_model_names: + raise ValueError(f'model_name must be one of {valid_model_names} instead of {model_name}.') + + if loss_name not in loss_registry.keys(): + raise ValueError(f'Unrecognized loss function: {loss_name}. Please ensure the ' + 'specified loss function is present in composer.loss.loss.py') + + if initializers is None: + initializers = [] + + # Instantiate model + model_fn = getattr(resnet, model_name) + model = model_fn(weights=weights, num_classes=num_classes, groups=groups, width_per_group=width_per_group) + + # Grab loss function from loss registry + loss_fn = loss_registry[loss_name] + + # Create metrics for train and validation + train_metrics = MulticlassAccuracy(num_classes=num_classes, average='micro') + val_metrics = MetricCollection([CrossEntropy(), MulticlassAccuracy(num_classes=num_classes, average='micro')]) + + # Apply Initializers to model + for initializer in initializers: + initializer = Initializer(initializer) + model.apply(initializer.get_initializer()) + + composer_model = ComposerClassifier(model, train_metrics=train_metrics, val_metrics=val_metrics, loss_fn=loss_fn) + return composer_model + + # Note: These methods are an alternative to the tiny_bert fixtures in fixtures.py. # Fixtures cannot be used natively as parametrized inputs, which we require when # we wish to run a test across multiple models, one of which is a HuggingFace model. 
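The new `composer_resnet` helper gives the test suite a local replacement for the removed `composer.models` ResNet factories. A minimal usage sketch follows; it is not part of the patch, and the dataset size, batch size, and one-batch duration are illustrative choices that mirror how `tests/test_precision.py` (later in this diff) wires the helper into a `Trainer`.

```python
# Hypothetical smoke test using the new test-suite helper; values are illustrative.
from torch.utils.data import DataLoader

from composer.trainer import Trainer
from tests.common import RandomImageDataset, composer_resnet


def test_resnet18_smoke():
    # torchvision resnet18 with the defaults: no pretrained weights, soft_cross_entropy loss
    model = composer_resnet('resnet18')
    trainer = Trainer(
        model=model,
        train_dataloader=DataLoader(RandomImageDataset(size=16), batch_size=8),
        max_duration='1ba',  # a single batch is enough for a smoke test
    )
    trainer.fit()
```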
diff --git a/tests/datasets/test_add_dataset_transform.py b/tests/datasets/test_add_dataset_transform.py deleted file mode 100644 index d7a545a33b..0000000000 --- a/tests/datasets/test_add_dataset_transform.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from torchvision import transforms - -from composer.datasets.synthetic import SyntheticPILDataset -from composer.datasets.utils import add_vision_dataset_transform - -image_size = 32 - - -def generate_synthetic_dataset(data_transforms): - return SyntheticPILDataset(total_dataset_size=1000, - data_shape=[image_size, image_size], - num_classes=2, - transform=data_transforms) - - -def generate_default_transforms(): - return transforms.Compose([transforms.RandomCrop(32), transforms.ToTensor(), transforms.RandomRotation(5)]) - - -def generate_composition_no_tensor(): - return transforms.Compose( - [transforms.RandomCrop(32), - transforms.RandomHorizontalFlip(), - transforms.RandomRotation(5)]) - - -@pytest.mark.parametrize('is_tensor_transform,index', [(False, 1), (True, 2)]) -def test_pre_post_to_tensor_compose(is_tensor_transform, index): - dataset = generate_synthetic_dataset(generate_default_transforms()) - add_vision_dataset_transform(dataset, transforms.RandomAutocontrast(), is_tensor_transform=is_tensor_transform) - assert dataset.transform is not None - assert type(dataset.transform.transforms[index]) == transforms.RandomAutocontrast - - -@pytest.mark.parametrize('is_tensor_transform,index', [(False, 0), (True, 1)]) -def test_pre_post_to_tensor(is_tensor_transform, index): - dataset = generate_synthetic_dataset(transforms.ToTensor()) - add_vision_dataset_transform(dataset, transforms.RandomAutocontrast(), is_tensor_transform=is_tensor_transform) - assert dataset.transform is not None - assert type(dataset.transform.transforms[index]) == transforms.RandomAutocontrast - - -@pytest.mark.parametrize('data_transforms', [(generate_composition_no_tensor()), (transforms.RandomHorizontalFlip())]) -def test_default_to_append(data_transforms): - dataset = generate_synthetic_dataset(data_transforms) - add_vision_dataset_transform(dataset, transforms.RandomAutocontrast()) - assert dataset.transform is not None - assert type(dataset.transform.transforms[-1]) == transforms.RandomAutocontrast - - -def test_add_to_none_transform(): - dataset = generate_synthetic_dataset(None) - add_vision_dataset_transform(dataset, transforms.RandomAutocontrast()) - assert type(dataset.transform) == transforms.RandomAutocontrast diff --git a/tests/datasets/test_cifar.py b/tests/datasets/test_cifar.py deleted file mode 100644 index 6eac6e2ebf..0000000000 --- a/tests/datasets/test_cifar.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from composer.datasets import build_cifar10_dataloader, build_synthetic_cifar10_dataloader - - -@pytest.mark.skip # Download is flaky and test is not critical -@pytest.mark.parametrize('is_train', [False, True]) -@pytest.mark.parametrize('synthetic', [pytest.param(False, marks=pytest.mark.daily), True]) -def test_cifar10_shape_length(is_train, synthetic): - batch_size = 1 - - if synthetic: - dataspec = build_synthetic_cifar10_dataloader(global_batch_size=batch_size, is_train=is_train) - else: - dataspec = build_cifar10_dataloader(datadir='/tmp', global_batch_size=batch_size, is_train=is_train) - - samples = list(dataspec.dataloader) - if is_train: - assert len(samples) 
== 50000 // batch_size - else: - assert len(samples) == 10000 // batch_size - - assert samples[0][0].shape == (1, 3, 32, 32) diff --git a/tests/datasets/test_dataset_utils.py b/tests/datasets/test_dataset_utils.py deleted file mode 100644 index 720edce59b..0000000000 --- a/tests/datasets/test_dataset_utils.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -from typing import List, Tuple - -import numpy as np -import pytest -import torch -from PIL import Image - -from composer.datasets.utils import pil_image_collate - - -@pytest.fixture -def num_samples(): - return 4 - - -@pytest.fixture -def image_size(): - return (16, 16) - - -@pytest.fixture -def pil_image_list(num_samples: int, image_size: Tuple[int, int]): - return [Image.new(mode='RGB', size=image_size, color=(i, i, i)) for i in range(num_samples)] - - -@pytest.fixture -def pil_target_list(num_samples: int, image_size: Tuple[int, int]): - return [Image.new(mode='L', size=image_size, color=i) for i in range(num_samples)] - - -@pytest.fixture -def correct_image_tensor(num_samples: int, image_size: Tuple[int, int]): - return torch.arange(num_samples).expand(3, *image_size, -1).permute(3, 0, 1, 2) - - -@pytest.fixture -def scalar_target_list(num_samples: int): - return np.arange(num_samples) - - -def test_scalar_target_collate(pil_image_list: List[Image.Image], scalar_target_list: np.ndarray, - correct_image_tensor: torch.Tensor): - batch = [(img, target) for img, target in zip(pil_image_list, scalar_target_list)] - image_tensor, target_tensor = pil_image_collate(batch=batch) - - correct_target_tensor = torch.arange(correct_image_tensor.shape[0]) - - assert torch.all(image_tensor == correct_image_tensor) and torch.all(target_tensor == correct_target_tensor) - - -def test_image_target_collate(pil_image_list: List[Image.Image], pil_target_list: List[Image.Image], - correct_image_tensor): - batch = [(img, target) for img, target in zip(pil_image_list, pil_target_list)] - image_tensor, target_tensor = pil_image_collate( - batch=batch) # type: ignore "Image" is incompatible with "ndarray[Unknown, Unknown]" - - assert torch.all(image_tensor == correct_image_tensor) and torch.all(target_tensor == correct_image_tensor[:, 0]) diff --git a/tests/datasets/test_ffcv_utils.py b/tests/datasets/test_ffcv_utils.py deleted file mode 100644 index 3614d73387..0000000000 --- a/tests/datasets/test_ffcv_utils.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import os -import pathlib - -import pytest - -from composer.datasets.ffcv_utils import write_ffcv_dataset -from composer.datasets.synthetic import SyntheticDataLabelType, SyntheticPILDataset - - -@pytest.mark.vision -def test_write_ffcv_dataset(tmp_path: pathlib.Path): - dataset = SyntheticPILDataset(total_dataset_size=1, - num_classes=1, - data_shape=[1, 1, 3], - label_type=SyntheticDataLabelType.CLASSIFICATION_INT, - num_unique_samples_to_create=1) - output_file = str(tmp_path / 'ffcv') - write_ffcv_dataset(dataset, write_path=output_file, num_workers=1) - assert os.path.exists(output_file) diff --git a/tests/datasets/test_mnist.py b/tests/datasets/test_mnist.py deleted file mode 100644 index 7342184d03..0000000000 --- a/tests/datasets/test_mnist.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from composer.datasets import build_mnist_dataloader, build_synthetic_mnist_dataloader - 
- -@pytest.mark.parametrize('is_train', [False, True]) -@pytest.mark.parametrize('synthetic', [pytest.param(False, marks=pytest.mark.daily), True]) -def test_mnist_shape_length(is_train, synthetic): - batch_size = 1 - - if synthetic: - loader = build_synthetic_mnist_dataloader(global_batch_size=batch_size, is_train=is_train) - else: - loader = build_mnist_dataloader(datadir='/tmp', global_batch_size=batch_size, is_train=is_train) - - samples = list(loader) - if is_train: - assert len(samples) == 60000 // batch_size - else: - assert len(samples) == 10000 // batch_size - - assert samples[0][0].shape == (1, 1, 28, 28) diff --git a/tests/datasets/test_segmentation_transforms.py b/tests/datasets/test_segmentation_transforms.py deleted file mode 100644 index 2e4af40126..0000000000 --- a/tests/datasets/test_segmentation_transforms.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import numpy as np -import pytest -from PIL import Image - -from composer.datasets.ade20k import (PadToSize, PhotometricDistoration, RandomCropPair, RandomHFlipPair, - RandomResizePair) - - -@pytest.fixture -def size(): - return 16, 16 - - -@pytest.fixture -def sample_pair(size): - img = Image.new(mode='RGB', size=size) - target = Image.new(mode='L', size=size) - return img, target - - -def test_random_resize(sample_pair, size): - random_resize_transform = RandomResizePair(min_scale=0.5, max_scale=2.0, base_size=size) - - # Test that the resized image remains within bounds for 10 iterations - for _ in range(10): - resized_img, resized_target = random_resize_transform(sample_pair) - assert resized_img.size == resized_target.size - assert resized_img.size[0] >= size[0] // 2 and resized_img.size[0] <= size[0] * 2 - assert resized_img.size[1] >= size[1] // 2 and resized_img.size[1] <= size[1] * 2 - - -@pytest.mark.parametrize('crop_size', [(8, 8), (32, 32)]) -def test_random_crop(sample_pair, crop_size): - random_crop_transform = RandomCropPair(crop_size) - image, target = random_crop_transform(sample_pair) - assert image.size == target.size - final_size = min(crop_size[0], sample_pair[0].height), min(crop_size[1], sample_pair[0].width) - assert final_size == image.size - - -def test_random_hflip(sample_pair): - old_image, old_target = np.array(sample_pair[0]), np.array(sample_pair[1]) - - # Always flip - always_hflip_transform = RandomHFlipPair(probability=1.0) - new_image, new_target = always_hflip_transform(sample_pair) - new_image, new_target = np.array(new_image), np.array(new_target) - assert np.allclose(new_image, old_image[:, ::-1]) and np.allclose(new_target, old_target[:, ::-1]) - - # Never flip - always_hflip_transform = RandomHFlipPair(probability=0.0) - new_image, new_target = always_hflip_transform(sample_pair) - new_image, new_target = np.array(new_image), np.array(new_target) - assert np.allclose(new_image, old_image) and np.allclose(new_target, old_target) - - -@pytest.mark.parametrize('pad_size', [(32, 32), (8, 8)]) -def test_pad_transform(sample_pair, pad_size): - image = sample_pair[0] - pad_transform = PadToSize(size=pad_size, fill=255) - padded_image = pad_transform(image) - final_size = max(pad_size[1], image.width), max(pad_size[0], image.height) - # Check for correct size and number of padding elements - assert padded_image.size == final_size - - # Check appropriate amount of padding is used - padded_image = np.array(padded_image) - initial_area = image.width * image.height - final_area = final_size[0] * final_size[1] - n_channels 
= padded_image.shape[2] - pad_volume = n_channels * (final_area - initial_area) - assert pad_volume == (padded_image == 255).sum() - - -def test_photometric_distortion(sample_pair): - old_image = sample_pair[0] - # Test no transform case - photometric_transform = PhotometricDistoration(brightness=1.0, contrast=1.0, saturation=1.0, hue=0) - new_image = photometric_transform(old_image) - old_image, new_image = np.array(old_image), np.array(new_image) - assert np.allclose(old_image, new_image) diff --git a/tests/datasets/test_synthetic_data.py b/tests/datasets/test_synthetic_data.py deleted file mode 100644 index 6f62aebb9d..0000000000 --- a/tests/datasets/test_synthetic_data.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -import pytest -import torch - -from composer.datasets.synthetic import (SyntheticBatchPairDataset, SyntheticDataLabelType, SyntheticDataType, - SyntheticPILDataset) - - -@pytest.mark.parametrize('data_type', [ - SyntheticDataType.GAUSSIAN, - SyntheticDataType.SEPARABLE, -]) -@pytest.mark.parametrize('label_type', [ - SyntheticDataLabelType.CLASSIFICATION_ONE_HOT, - SyntheticDataLabelType.CLASSIFICATION_INT, -]) -def test_synthetic_batch_pair_creation(data_type: SyntheticDataType, label_type: SyntheticDataLabelType): - if data_type == SyntheticDataType.SEPARABLE: - if label_type != SyntheticDataLabelType.CLASSIFICATION_INT: - pytest.skip('Separable data requires classification int labels') - num_classes = 2 - label_shape = None - else: - num_classes = 10 - label_shape = (1, 10, 12) - - if data_type == SyntheticDataType.GAUSSIAN and label_type == SyntheticDataLabelType.CLASSIFICATION_INT: - pytest.xfail('classification_int is not currently supported with gaussian data') - - dataset_size = 1000 - data_shape = (3, 32, 32) - num_samples_to_create = 10 - dataset = SyntheticBatchPairDataset(total_dataset_size=dataset_size, - data_shape=data_shape, - num_unique_samples_to_create=num_samples_to_create, - data_type=data_type, - label_type=label_type, - num_classes=num_classes, - label_shape=label_shape) - assert len(dataset) == dataset_size - - # verify datapoints are correct - x, y = dataset[0] - assert x.size() == data_shape - if label_type == SyntheticDataLabelType.CLASSIFICATION_INT: - assert isinstance(y.item(), int) - elif label_type == SyntheticDataLabelType.CLASSIFICATION_ONE_HOT: - assert y.size() == (num_classes,) - assert torch.min(y) == 0 - assert torch.max(y) == 1 - - # check that points were allocated in memory after the first call to __getitem__ - assert dataset.input_data is not None - assert dataset.input_target is not None - # check that the correct number of points were allocated in memory - assert dataset.input_data.size()[0] == num_samples_to_create - assert dataset.input_target.size()[0] == num_samples_to_create - - # verify that you can getch points outside the num_samples_to_create range - # (still within the total dataset size range) - x, y = dataset[num_samples_to_create + 1] - assert x is not None - assert y is not None - - -@pytest.mark.parametrize('label_type', [ - SyntheticDataLabelType.CLASSIFICATION_ONE_HOT, - SyntheticDataLabelType.CLASSIFICATION_INT, -]) -@pytest.mark.parametrize('num_classes', [None, 0]) -def test_synthetic_classification_param_validation(label_type: SyntheticDataLabelType, num_classes: Optional[int]): - with pytest.raises(ValueError): - SyntheticBatchPairDataset(total_dataset_size=10, - data_shape=(2, 2), - label_type=label_type, 
- num_classes=num_classes) - - -@pytest.mark.parametrize('data_type', [ - SyntheticDataType.GAUSSIAN, - SyntheticDataType.SEPARABLE, -]) -@pytest.mark.parametrize('label_type', [ - SyntheticDataLabelType.CLASSIFICATION_ONE_HOT, - SyntheticDataLabelType.CLASSIFICATION_INT, -]) -def test_synthetic_image_data_creation(data_type: SyntheticDataType, label_type: SyntheticDataLabelType): - if data_type == SyntheticDataType.SEPARABLE: - if label_type != SyntheticDataLabelType.CLASSIFICATION_INT: - pytest.skip('Seperable data requires classification int labels') - num_classes = 2 - label_shape = None - else: - num_classes = 10 - label_shape = (1, 10, 12) - - if data_type == SyntheticDataType.GAUSSIAN and label_type == SyntheticDataLabelType.CLASSIFICATION_INT: - pytest.xfail('classification_int is not currently supported with gaussian data') - - dataset_size = 1000 - data_shape = (32, 32) - num_samples_to_create = 100 - dataset = SyntheticPILDataset(total_dataset_size=dataset_size, - data_shape=data_shape, - num_unique_samples_to_create=num_samples_to_create, - data_type=data_type, - label_type=label_type, - num_classes=num_classes, - label_shape=label_shape) - assert len(dataset) == dataset_size - - # verify datapoints are correct - x, y = dataset[0] - assert x.size == data_shape - if label_type == SyntheticDataLabelType.CLASSIFICATION_INT: - assert isinstance(y.item(), int) - elif label_type == SyntheticDataLabelType.CLASSIFICATION_ONE_HOT: - assert y.size() == (num_classes,) - assert torch.min(y) == 0 - assert torch.max(y) == 1 - - # check that points were allocated in memory after the first call to __getitem__ - assert dataset._dataset.input_data is not None - assert dataset._dataset.input_target is not None - # check that the correct number of points were allocated in memory - assert dataset._dataset.input_data.shape[0] == num_samples_to_create - assert dataset._dataset.input_target.shape[0] == num_samples_to_create - - # verify that you can getch points outside the num_samples_to_create range - # (still within the total dataset size range) - x, y = dataset[num_samples_to_create + 1] - assert x is not None - assert y is not None diff --git a/tests/models/test_bert.py b/tests/models/test_bert.py deleted file mode 100644 index bee5111e08..0000000000 --- a/tests/models/test_bert.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from torch.utils.data import DataLoader - -from composer.models.bert import create_bert_classification, create_bert_mlm -from composer.trainer import Trainer -from tests.common.datasets import RandomTextClassificationDataset, RandomTextLMDataset - - -def test_bert_mlm_hf_factory(tiny_bert_config, tiny_bert_tokenizer, monkeypatch): - transformers = pytest.importorskip('transformers') - monkeypatch.setattr('transformers.AutoConfig.from_pretrained', lambda x: tiny_bert_config) - bert_composer_model = create_bert_mlm(use_pretrained=False, - pretrained_model_name='dummy', - model_config=None, - tokenizer_name=None, - gradient_checkpointing=False) - - train_dataset = RandomTextLMDataset(size=8, - vocab_size=tiny_bert_tokenizer.vocab_size, - sequence_length=8, - use_keys=True) - collator = transformers.DataCollatorForLanguageModeling(tokenizer=tiny_bert_tokenizer, - mlm=True, - mlm_probability=0.15) - train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=collator) - - trainer = Trainer(model=bert_composer_model, train_dataloader=train_dataloader, max_duration='1ep') - 
trainer.fit() - - assert trainer.state.train_metrics is not None - assert trainer.state.train_metrics['LanguageCrossEntropy'].compute() > 0.0 - - -def test_bert_classification_hf_factory(tiny_bert_config, tiny_bert_tokenizer, monkeypatch): - pytest.importorskip('transformers') - - def config_patch(x, num_labels): - tiny_bert_config.num_labels = num_labels - return tiny_bert_config - - monkeypatch.setattr('transformers.AutoConfig.from_pretrained', config_patch) - bert_composer_model = create_bert_classification(use_pretrained=False, - pretrained_model_name='dummy', - model_config=None, - tokenizer_name=None, - gradient_checkpointing=False, - num_labels=3) - - train_dataset = RandomTextClassificationDataset(size=8, - vocab_size=tiny_bert_tokenizer.vocab_size, - sequence_length=8, - num_classes=3, - use_keys=True) - train_dataloader = DataLoader(train_dataset, batch_size=4) - - trainer = Trainer(model=bert_composer_model, train_dataloader=train_dataloader, max_duration='1ep') - trainer.fit() - - assert trainer.state.train_metrics is not None - assert trainer.state.train_metrics['MulticlassAccuracy'].compute() > 0.0 diff --git a/tests/models/test_efficientnet.py b/tests/models/test_efficientnet.py deleted file mode 100644 index a11dccc87b..0000000000 --- a/tests/models/test_efficientnet.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest -import torch - -from composer.models.efficientnetb0.efficientnets import EfficientNet - - -@pytest.mark.gpu -def test_efficientb0_activate_shape(): - # Running this test on cuda as convolutions are slow on CPU - random_input = torch.rand(2, 3, 224, 224).cuda() - - model = EfficientNet.get_model_from_name( - 'efficientnet-b0', - num_classes=1000, - drop_connect_rate=0.2, - ).cuda() - # Test Stem - out = model.conv_stem(random_input) - out = model.bn1(out) - out = model.act1(out) - assert out.shape == (2, 32, 112, 112) - - # Test each block, shapes found at Table 1 of EfficientNet paper - block_act_shape = [ - (2, 16, 112, 112), - (2, 24, 56, 56), - (2, 24, 56, 56), - (2, 40, 28, 28), - (2, 40, 28, 28), - (2, 80, 14, 14), - (2, 80, 14, 14), - (2, 80, 14, 14), - (2, 112, 14, 14), - (2, 112, 14, 14), - (2, 112, 14, 14), - (2, 192, 7, 7), - (2, 192, 7, 7), - (2, 192, 7, 7), - (2, 192, 7, 7), - (2, 320, 7, 7), - ] - for i, block in enumerate(model.blocks): - out = block(out) - assert out.shape == block_act_shape[i] - - out = model.conv_head(out) - assert out.shape == (2, 1280, 7, 7) diff --git a/tests/models/test_gpt2.py b/tests/models/test_gpt2.py deleted file mode 100644 index 1183353d1b..0000000000 --- a/tests/models/test_gpt2.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from torch.utils.data import DataLoader - -from composer.models.gpt2 import create_gpt2 -from composer.trainer import Trainer -from tests.common.datasets import RandomTextLMDataset - - -def test_gpt2_hf_factory(tiny_gpt2_config, tiny_gpt2_tokenizer, monkeypatch): - transformers = pytest.importorskip('transformers') - monkeypatch.setattr('transformers.AutoConfig.from_pretrained', lambda x: tiny_gpt2_config) - gpt2_composer_model = create_gpt2(use_pretrained=False, - pretrained_model_name='dummy', - model_config=None, - tokenizer_name=None, - gradient_checkpointing=False) - - train_dataset = RandomTextLMDataset(size=8, - vocab_size=tiny_gpt2_tokenizer.vocab_size, - sequence_length=8, - use_keys=True) - collator = 
transformers.DataCollatorForLanguageModeling(tokenizer=tiny_gpt2_tokenizer, mlm=False) - train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=collator) - - trainer = Trainer(model=gpt2_composer_model, train_dataloader=train_dataloader, max_duration='1ep') - trainer.fit() - - assert trainer.state.train_metrics is not None - assert trainer.state.train_metrics['LanguagePerplexity'].compute() > 0.0 diff --git a/tests/models/test_mmdet_model.py b/tests/models/test_mmdet_model.py deleted file mode 100644 index 8ed2246ead..0000000000 --- a/tests/models/test_mmdet_model.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import numpy as np -import pytest -import torch - - -@pytest.fixture -def mmdet_detection_batch(): - batch_size = 2 - num_labels_per_image = 20 - image_size = 224 - return { - 'img_metas': [{ - 'filename': '../../data/coco/train2017/fake_img.jpg', - 'ori_filename': 'fake_image.jpg', - 'img_shape': (image_size, image_size, 3), - 'ori_shape': (image_size, image_size, 3), - 'pad_shape': (image_size, image_size, 3), - 'scale_factor': np.array([1., 1., 1., 1.], dtype=np.float32) - }] * batch_size, - 'img': - torch.zeros(batch_size, 3, image_size, image_size, dtype=torch.float32), - 'gt_bboxes': [torch.zeros(num_labels_per_image, 4, dtype=torch.float32)] * batch_size, - 'gt_labels': [torch.zeros(num_labels_per_image, dtype=torch.int64)] * batch_size - } - - -@pytest.fixture -def mmdet_detection_eval_batch(): - # Eval settings for mmdetection datasets have an extra list around inputs. - batch_size = 2 - num_labels_per_image = 20 - image_size = 224 - return { - 'img_metas': [[{ - 'filename': '../../data/coco/train2017/fake_img.jpg', - 'ori_filename': 'fake_image.jpg', - 'img_shape': (image_size, image_size, 3), - 'ori_shape': (image_size, image_size, 3), - 'pad_shape': (image_size, image_size, 3), - 'scale_factor': np.array([1., 1., 1., 1.], dtype=np.float32), - }] * batch_size], - 'img': [torch.zeros(batch_size, 3, image_size, image_size, dtype=torch.float32)], - 'gt_bboxes': [[torch.zeros(num_labels_per_image, 4, dtype=torch.float32)] * batch_size], - 'gt_labels': [[torch.zeros(num_labels_per_image, dtype=torch.int64)] * batch_size] - } - - -@pytest.fixture -def yolox_config(): - # from https://github.com/open-mmlab/mmdetection/blob/master/configs/yolox/yolox_s_8x8_300e_coco.py - return dict( - type='YOLOX', - input_size=(640, 640), - random_size_range=(15, 25), - random_size_interval=10, - backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5), - neck={ - 'type': 'YOLOXPAFPN', - 'in_channels': [128, 256, 512], - 'out_channels': 128, - 'num_csp_blocks': 1, - }, - bbox_head=dict(type='YOLOXHead', num_classes=80, in_channels=128, feat_channels=128), - train_cfg=dict(assigner={ - 'type': 'SimOTAAssigner', - 'center_radius': 2.5 - }), - # In order to align the source code, the threshold of the val phase is - # 0.01, and the threshold of the test phase is 0.001. 
- test_cfg=dict(score_thr=0.01, nms={ - 'type': 'nms', - 'iou_threshold': 0.65 - })) - - -@pytest.fixture -def faster_rcnn_config(): - # modified from https://github.com/open-mmlab/mmdetection/blob/master/configs/_base_/models/faster_rcnn_r50_fpn.py - return dict( - type='FasterRCNN', - backbone=dict(type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=True, - style='pytorch'), - neck=dict(type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), - rpn_head=dict(type='RPNHead', - in_channels=256, - feat_channels=256, - anchor_generator=dict(type='AnchorGenerator', - scales=[8], - ratios=[0.5, 1.0, 2.0], - strides=[4, 8, 16, 32, 64]), - bbox_coder=dict(type='DeltaXYWHBBoxCoder', - target_means=[.0, .0, .0, .0], - target_stds=[1.0, 1.0, 1.0, 1.0]), - loss_cls={ - 'type': 'CrossEntropyLoss', - 'use_sigmoid': True, - 'loss_weight': 1.0, - }, - loss_bbox={ - 'type': 'L1Loss', - 'loss_weight': 1.0 - }), - roi_head=dict(type='StandardRoIHead', - bbox_roi_extractor={ - 'type': 'SingleRoIExtractor', - 'roi_layer': { - 'type': 'RoIAlign', - 'output_size': 7, - 'sampling_ratio': 0, - }, - 'out_channels': 256, - 'featmap_strides': [4, 8, 16, 32] - }, - bbox_head={ - 'type': 'Shared2FCBBoxHead', - 'in_channels': 256, - 'fc_out_channels': 1024, - 'roi_feat_size': 7, - 'num_classes': 80, - 'bbox_coder': { - 'type': 'DeltaXYWHBBoxCoder', - 'target_means': [0., 0., 0., 0.], - 'target_stds': [0.1, 0.1, 0.2, 0.2] - }, - 'reg_class_agnostic': False, - 'loss_cls': { - 'type': 'CrossEntropyLoss', - 'use_sigmoid': False, - 'loss_weight': 1.0, - }, - 'loss_bbox': { - 'type': 'L1Loss', - 'loss_weight': 1.0 - } - }), - # model training and testing settings - train_cfg=dict(rpn=dict(assigner={ - 'type': 'MaxIoUAssigner', - 'pos_iou_thr': 0.7, - 'neg_iou_thr': 0.3, - 'min_pos_iou': 0.3, - 'match_low_quality': True, - 'ignore_iof_thr': -1 - }, - sampler={ - 'type': 'RandomSampler', - 'num': 256, - 'pos_fraction': 0.5, - 'neg_pos_ub': -1, - 'add_gt_as_proposals': False - }, - allowed_border=-1, - pos_weight=-1, - debug=False), - rpn_proposal=dict(nms_pre=2000, - max_per_img=1000, - nms={ - 'type': 'nms', - 'iou_threshold': 0.7 - }, - min_bbox_size=0), - rcnn=dict(assigner={ - 'type': 'MaxIoUAssigner', - 'pos_iou_thr': 0.5, - 'neg_iou_thr': 0.5, - 'min_pos_iou': 0.5, - 'match_low_quality': False, - 'ignore_iof_thr': -1 - }, - sampler={ - 'type': 'RandomSampler', - 'num': 512, - 'pos_fraction': 0.25, - 'neg_pos_ub': -1, - 'add_gt_as_proposals': True - }, - pos_weight=-1, - debug=False)), - test_cfg=dict( - rpn=dict( - nms_pre=1000, - max_per_img=1000, - nms={ - 'type': 'nms', - 'iou_threshold': 0.7 - }, - min_bbox_size=0, - ), - rcnn={ - 'score_thr': 0.05, - 'nms': { - 'type': 'nms', - 'iou_threshold': 0.5 - }, - 'max_per_img': 100, - } - # soft-nms is also supported for rcnn testing - # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) - )) - - -def test_mmdet_model_forward_yolox(mmdet_detection_batch, yolox_config): - pytest.importorskip('mmdet') - - from mmcv import ConfigDict - from mmdet.models import build_detector - - from composer.models import MMDetModel - - config = ConfigDict(yolox_config) - # non pretrained model to avoid a slow test that downloads the weights. 
- model = build_detector(config) - model.init_weights() - model = MMDetModel(model=model) - out = model(mmdet_detection_batch) - assert list(out.keys()) == ['loss_cls', 'loss_bbox', 'loss_obj'] - - -def test_mmdet_model_eval_forward_yolox(mmdet_detection_eval_batch, yolox_config): - pytest.importorskip('mmdet') - - from mmcv import ConfigDict - from mmdet.models import build_detector - - from composer.models import MMDetModel - - config = ConfigDict(yolox_config) - # non pretrained model to avoid a slow test that downloads the weights. - model = build_detector(config) - model.init_weights() - model = MMDetModel(model=model) - out = model.eval_forward(mmdet_detection_eval_batch) - assert len(out) == mmdet_detection_eval_batch['img'][0].shape[0] # batch size - assert list(out[0].keys()) == ['labels', 'boxes', 'scores'] - - -def test_mmdet_model_forward_faster_rcnn(mmdet_detection_batch, faster_rcnn_config): - pytest.importorskip('mmdet') - - from mmcv import ConfigDict - from mmdet.models import build_detector - - from composer.models import MMDetModel - - config = ConfigDict(faster_rcnn_config) - - # non pretrained model to avoid a slow test that downloads the weights. - model = build_detector(config) - model.init_weights() - model = MMDetModel(model=model) - out = model(mmdet_detection_batch) - assert list(out.keys()) == ['loss_rpn_cls', 'loss_rpn_bbox', 'loss_cls', 'acc', 'loss_bbox'] diff --git a/tests/test_precision.py b/tests/test_precision.py index 46571529c6..2b85d3d7d2 100644 --- a/tests/test_precision.py +++ b/tests/test_precision.py @@ -9,8 +9,7 @@ from composer import Trainer from composer.core import Precision, get_precision_context -from composer.models import composer_resnet_cifar -from tests.common import RandomImageDataset +from tests.common import RandomImageDataset, composer_resnet try: import transformer_engine.pytorch as te @@ -22,7 +21,7 @@ def get_trainer(precision: Precision, precision_config: Optional[Dict[str, Any]] = None) -> Trainer: return Trainer( - model=composer_resnet_cifar('resnet_9'), + model=composer_resnet('resnet18'), train_dataloader=DataLoader( dataset=RandomImageDataset(size=1024), batch_size=512, @@ -78,7 +77,7 @@ def predict_and_measure_memory(precision) -> int: def test_train_precision_memory(precision: Precision): memory_fp32 = fit_and_measure_memory(Precision.FP32) memory_half = fit_and_measure_memory(precision) - assert memory_half < 0.7 * memory_fp32 + assert memory_half < 0.85 * memory_fp32 @pytest.mark.gpu diff --git a/tests/trainer/test_ddp.py b/tests/trainer/test_ddp.py index 41d240286e..d9733c4285 100644 --- a/tests/trainer/test_ddp.py +++ b/tests/trainer/test_ddp.py @@ -12,11 +12,10 @@ import composer.core.types as types from composer import Callback, Event from composer.core import State -from composer.datasets.synthetic import SyntheticBatchPairDataset from composer.loggers import Logger from composer.trainer.trainer import Trainer from composer.utils import dist -from tests.common import SimpleModel +from tests.common import RandomClassificationDataset, SimpleModel def get_file_path(*, is_train: bool, tmp_path: pathlib.Path) -> str: @@ -40,8 +39,8 @@ class TrackedDataset(types.Dataset): atomic file writes, it is slow and should not be used in any performance measurements. 
""" - def __init__(self, is_train: bool, synthetic_dataset: SyntheticBatchPairDataset, tmp_path: pathlib.Path): - self.dataset = synthetic_dataset + def __init__(self, is_train: bool, dataset, tmp_path: pathlib.Path): + self.dataset = dataset self.is_train = is_train self.tmp_path = tmp_path self.counter = 0 @@ -110,19 +109,11 @@ def test_ddp(device: str, world_size: int, deepspeed: bool, fsdp: bool, tmp_path and 2) each ddp process is indeed getting different data. """ - model = SimpleModel(num_classes=100) - train_batch_size = 10 train_subset_num_batches = 3 - synthetic_dataset = SyntheticBatchPairDataset( - num_unique_samples_to_create=train_batch_size * train_subset_num_batches, - total_dataset_size=10_000, - data_shape=(model.num_features, 5, 5), - num_classes=model.num_classes, - ) train_dataset = TrackedDataset( - synthetic_dataset=synthetic_dataset, + dataset=RandomClassificationDataset(size=train_batch_size * train_subset_num_batches,), is_train=True, tmp_path=tmp_path, ) @@ -144,14 +135,8 @@ def test_ddp(device: str, world_size: int, deepspeed: bool, fsdp: bool, tmp_path eval_batch_size = 10 eval_subset_num_batches = 3 - eval_dataset = SyntheticBatchPairDataset( - num_unique_samples_to_create=eval_batch_size * eval_subset_num_batches, - total_dataset_size=10_000, - data_shape=(model.num_features, 5, 5), - num_classes=model.num_classes, - ) eval_dataset = TrackedDataset( - synthetic_dataset=eval_dataset, + dataset=RandomClassificationDataset(size=eval_batch_size * eval_subset_num_batches,), is_train=False, tmp_path=tmp_path, ) @@ -179,17 +164,19 @@ def test_ddp(device: str, world_size: int, deepspeed: bool, fsdp: bool, tmp_path } max_epochs = 2 - trainer = Trainer(model=model, - train_dataloader=train_dataloader, - eval_dataloader=eval_dataloader, - device=device, - max_duration=f'{max_epochs}ep', - eval_interval='1ep', - eval_subset_num_batches=eval_subset_num_batches, - train_subset_num_batches=train_subset_num_batches, - deepspeed_config={} if deepspeed else None, - fsdp_config=fsdp_config, - callbacks=[CheckBatch0(tmp_path)]) + trainer = Trainer( + model=SimpleModel(num_classes=100), + train_dataloader=train_dataloader, + eval_dataloader=eval_dataloader, + device=device, + max_duration=f'{max_epochs}ep', + eval_interval='1ep', + eval_subset_num_batches=eval_subset_num_batches, + train_subset_num_batches=train_subset_num_batches, + deepspeed_config={} if deepspeed else None, + fsdp_config=fsdp_config, + callbacks=[CheckBatch0(tmp_path)], + ) trainer.fit() diff --git a/tests/utils/test_inference.py b/tests/utils/test_inference.py index a4e3bf90b1..789ad3c136 100644 --- a/tests/utils/test_inference.py +++ b/tests/utils/test_inference.py @@ -20,7 +20,6 @@ from composer.functional import apply_gated_linear_units from composer.loggers import InMemoryLogger, Logger from composer.loggers.logger_destination import LoggerDestination -from composer.models import composer_resnet from composer.trainer.dist_strategy import prepare_ddp_module from composer.trainer.trainer import Trainer from composer.utils import dist, export_with_logger, inference @@ -28,7 +27,7 @@ from tests.common import SimpleTransformerClassifier, device from tests.common.datasets import (RandomImageDataset, dummy_text_classification_dataloader, dummy_tiny_bert_lm_batch, dummy_transformer_classifier_batch) -from tests.common.models import configure_tiny_bert_hf_model +from tests.common.models import composer_resnet, configure_tiny_bert_hf_model class MockFileUploader(LoggerDestination):