diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 4e0e7f6bcb..02238a51fa 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -20,42 +20,42 @@ jobs: include: - name: cpu-3.10-2.0 container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not vision and not doctest + markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.10-2.1 container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not vision and not doctest + markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.10-2.1-composer container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not vision and not doctest + markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: cpu-doctest container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not vision and doctest + markers: not daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml - name: daily-cpu-3.10-2.0 container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not vision and not doctest + markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: daily-cpu-3.10-2.1 container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not vision and not doctest + markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: daily-cpu-3.10-2.1-composer container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not vision and not doctest + markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: daily-cpu-doctest container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not vision and doctest + markers: daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml name: ${{ matrix.name }} diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index e2c715710e..6eee54cb0b 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -15,17 +15,17 @@ jobs: include: - name: cpu-3.10-2.0 container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 - markers: not daily and not remote and not gpu and not vision and not doctest + markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.10-2.1 container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and not remote and not gpu and not vision and not doctest + markers: not daily and not remote and not gpu 
and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-doctest container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and not remote and not gpu and not vision and doctest + markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml name: ${{ matrix.name }} diff --git a/README.md b/README.md index 17a6e41cfd..8bdda2d3e0 100644 --- a/README.md +++ b/README.md @@ -135,26 +135,55 @@ Here is a code snippet demonstrating our Trainer on the MNIST dataset. ```python +import torch +import torch.nn as nn +import torch.nn.functional as F from torchvision import datasets, transforms from torch.utils.data import DataLoader from composer import Trainer -from composer.models import mnist_model +from composer.models import ComposerClassifier from composer.algorithms import LabelSmoothing, CutMix, ChannelsLast +class Model(nn.Module): + """Toy convolutional neural network architecture in pytorch for MNIST.""" + + def __init__(self, num_classes: int = 10): + super().__init__() + + self.num_classes = num_classes + + self.conv1 = nn.Conv2d(1, 16, (3, 3), padding=0) + self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=0) + self.bn = nn.BatchNorm2d(32) + self.fc1 = nn.Linear(32 * 16, 32) + self.fc2 = nn.Linear(32, num_classes) + + def forward(self, x): + out = self.conv1(x) + out = F.relu(out) + out = self.conv2(out) + out = self.bn(out) + out = F.relu(out) + out = F.adaptive_avg_pool2d(out, (4, 4)) + out = torch.flatten(out, 1, -1) + out = self.fc1(out) + out = F.relu(out) + return self.fc2(out) + transform = transforms.Compose([transforms.ToTensor()]) dataset = datasets.MNIST("data", train=True, download=True, transform=transform) train_dataloader = DataLoader(dataset, batch_size=128) trainer = Trainer( - model=mnist_model(num_classes=10), + model=ComposerClassifier(module=Model(), num_classes=10), train_dataloader=train_dataloader, max_duration="2ep", algorithms=[ LabelSmoothing(smoothing=0.1), CutMix(alpha=1.0), ChannelsLast(), - ] + ], ) trainer.fit() ``` diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md index 274c10ce9c..4943a9db58 100644 --- a/STYLE_GUIDE.md +++ b/STYLE_GUIDE.md @@ -227,22 +227,23 @@ All imports in composer should be absolute -- that is, they do not begin with a 1. If a dependency is not core to Composer (e.g. it is for a model, dataset, algorithm, or some callbacks): 1. It must be specified in a entry of the `extra_deps` dictionary of [setup.py](setup.py). This dictionary groups dependencies that can be conditionally installed. An entry named `foo` - can be installed with `pip install 'mosaicml[foo]'`. For example, running `pip install 'mosaicml[unet]'` - will install everything in `install_requires`, along with `monai` and `scikit-learn`. + can be installed with `pip install 'mosaicml[foo]'`. For example, running `pip install 'mosaicml[system_metrics_monitor]'` + will install everything in `install_requires`, along with `pynvml`. 1. It must also be specified in the `run_constrained` and the `test.requires` section. 1. The import must be conditionally imported in the code. 
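As a rough, hedged sketch of the first requirement, an `extra_deps` entry in [setup.py](setup.py) might look like the following (the group name, package, and version bound here are illustrative, not a copy of the real file); the conditional import itself is shown in the next example.

```python
# Hypothetical sketch of an ``extra_deps`` group in setup.py.
# The actual package list and version pins in the repository may differ.
extra_deps = {}

extra_deps['system_metrics_monitor'] = [
    'pynvml>=11.5.0,<12',  # illustrative version bound
]

# `pip install 'mosaicml[system_metrics_monitor]'` then installs everything in
# `install_requires` plus the packages listed in this group.
```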
For example: ```python + from composer import Callback + from composer.utils import MissingConditionalImportError - def unet(): + class SystemMetricsMonitor(Callback): try: - import monai + import pynvml except ImportError as e: - raise MissingConditionalImportError(extra_deps_group="system_metrics_monitor", - conda_package="monai", + raise MissingConditionalImportError(extra_deps_group="system_metrics_monitor", + conda_package="pynvml", conda_channel="conda-forge",) from e ``` diff --git a/composer/algorithms/blurpool/README.md b/composer/algorithms/blurpool/README.md index f99e1fb275..24b25d221a 100644 --- a/composer/algorithms/blurpool/README.md +++ b/composer/algorithms/blurpool/README.md @@ -56,9 +56,7 @@ def training_loop(model, train_loader): -```python -from composer.models import composer_deeplabv3 - -model = composer_deeplabv3(num_classes=150, - backbone_arch="resnet101", - backbone_weights="IMAGENET1K_V2", - sync_bn=False -) -``` - -## Architecture - -Based on [Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611) - -
-*Figure: DeepLabV3+ architecture diagram (alt text: deeplabv3plus)* -
- - -- **Backbone network**: converts the input image into a feature map. - * Usually ResNet-101 with the strided convolutions converted to dilated convolutions in stage 3 and 4. - * The 3x3 convolutions in stage 3 and 4 have dilation sizes of 2 and 4, respectively, to compensate for the decreased receptive field. - * The average pooling and classification layer are ignored. -- **Spatial Pyramid Pooling**: extracts multi-resolution features from the stage 4 backbone feature map. - * The backbone feature map is processed with four parallel convolution layers with dilations {1, 12, 24, 36} and kernel sizes {1x1, 3x3, 3x3, 3x3}. - * In parallel to the convolutions, global average pool the backbone feature map, then bilinearly upsample to be the same spatial dimension as the feature map. - * Concatenate the outputs from the convolutions and global average pool, then process with a 1x1 convolution. - * The 3x3 convolutions are implemented as depth-wise convolutions to reduce memory and computation cost. -- **Decoder**: converts the output of spatial pyramid pooling (SPP) to class predictions of the same spatial dimension as the input image. - * SPP output is bilinearly upsampled to be the same spatial dimension as the output from the first stage in the backbone network. - * A 1x1 convolution is applied to the first stage activations, then this is concatenated with the upsampled SPP output. - * The concatenation is processed by a 3x3 convolution with dropout followed by a classification layer. - * The predictions are bilinearly upsampled to be the same resolution as the input image. - -## Training Hyperparameters - -We tested two sets of hyperparameters for DeepLabv3+ trained on the ADE20k dataset. - -### Typical ADE20k Model Hyperparameters - -- Model: deeplabv3: - - Initializers: kaiming_normal, bn_ones - - Number of classes: 150 - - Backbone weights: IMAGENET1K_V1 - - Sync BatchNorm -- Optimizer: SGD - - Learning rate: 0.01 - - Momentum: 0.9 - - Weight decay: 5.0e-4 - - Dampening: 0 - - Nesterov: false -- LR schedulers: - - Polynomial: - - Alpha_f: 0.01 - - Power: 0.9 -- Number of epochs: 127 -- Batch size: 16 -- Precision: amp - -| Model | mIoU | Time-to-Train on 8xA100 | -| --- | --- | --- | -| ResNet101-DeepLabv3+ | 44.17 +/- 0.17 | 6.385 hr | - -### Composer ADE20k Model Hyperparameters - -- Model: deeplabv3: - - Initializers: kaiming_normal, bn_ones - - Number of classes: 150 - - Backbone Architecture: resnet101 - - Sync BatchNorm - - Backbone weights: IMAGENET1K_V2 -- Optimizer: Decoupled SGDW - - Learning rate: 0.01 - - Momentum: 0.9 - - Weight decay: 2.0e-5 - - Dampening: 0 - - Nesterov: false -- LR schedulers: - - Cosine decay, t_max: 1dur -- Number of epochs: 128 -- Batch size: 32 -- Precision: amp - -| Model | mIoU | Time-to-Train on 8xA100 | -| --- | --- | --- | -| ResNet101-DeepLabv3+ | 45.764 +/- 0.29 | 4.67 hr | - -Improvements: - -- New PyTorch pretrained weights -- Cosine decay -- Decoupled Weight Decay -- Increase batch size to 32 -- Decrease weight decay to 2e-5 - -## Attribution - -[Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611) by Liang-Chieh Chen, Yukun Zhu, George Papandreou, Florian Schroff, Hartwig Adam - -[OpenMMLab Semantic Segmentation Toolbox and Benchmark](https://github.com/open-mmlab/mmsegmentation) - -[How to Train State-Of-The-Art Models Using TorchVision’s Latest Primitives](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/) by Vasilis
Vryniotis - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.deeplabv3.composer_deeplabv3 - :noindex: -``` diff --git a/composer/models/deeplabv3/__init__.py b/composer/models/deeplabv3/__init__.py deleted file mode 100644 index e3473a3015..0000000000 --- a/composer/models/deeplabv3/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""DeepLabV3 for image segmentation.""" -from composer.models.deeplabv3.model import composer_deeplabv3 as composer_deeplabv3 - -__all__ = ['composer_deeplabv3'] diff --git a/composer/models/deeplabv3/model.py b/composer/models/deeplabv3/model.py deleted file mode 100644 index 876604d3c5..0000000000 --- a/composer/models/deeplabv3/model.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""DeepLabV3 model extending :class:`.ComposerClassifier`.""" - -import functools -import textwrap -import warnings -from typing import Dict, Optional, Sequence - -import torch -import torch.distributed as torch_dist -import torch.nn.functional as F -import torchvision -from packaging import version -from torchmetrics import MetricCollection -from torchvision.models import _utils, resnet - -from composer.loss import DiceLoss, soft_cross_entropy -from composer.metrics import CrossEntropy, MIoU -from composer.models.initializers import Initializer -from composer.models.tasks import ComposerClassifier -from composer.utils import dist - -__all__ = ['deeplabv3', 'composer_deeplabv3'] - -_IMAGENET1K_V1_URL = 'https://download.pytorch.org/models/resnet101-63fe2227.pth' -_IMAGENET1K_V2_URL = 'https://download.pytorch.org/models/resnet101-cd907fc2.pth' - - -class SimpleSegmentationModel(torch.nn.Module): - - def __init__(self, backbone, classifier): - warnings.warn(DeprecationWarning('SimpleSegmentationModel is deprecated and will be removed in v0.18')) - - super().__init__() - self.backbone = backbone - self.classifier = classifier - - def forward(self, x): - input_shape = x.shape[-2:] - features = self.backbone(x) - logits = self.classifier(tuple(features.values())) - logits = F.interpolate(logits, - size=input_shape, - mode='bilinear', - align_corners=False, - recompute_scale_factor=False) - return logits - - -def deeplabv3(num_classes: int, - backbone_arch: str = 'resnet101', - backbone_weights: Optional[str] = None, - sync_bn: bool = True, - use_plus: bool = True, - initializers: Sequence[Initializer] = ()): - """Helper function to build a mmsegmentation DeepLabV3 model. - - Args: - num_classes (int): Number of classes in the segmentation task. - backbone_arch (str, optional): The architecture to use for the backbone. Must be either - [``'resnet50'``, ``'resnet101'``]. Default: ``'resnet101'``. - backbone_weights (str, optional): If specified, the PyTorch pre-trained weights to load for the backbone. - Currently, only ['IMAGENET1K_V1', 'IMAGENET1K_V2'] are supported. Default: ``None``. - sync_bn (bool, optional): If ``True``, replace all BatchNorm layers with SyncBatchNorm layers. - Default: ``True``. - use_plus (bool, optional): If ``True``, use DeepLabv3+ head instead of DeepLabv3. Default: ``True``. - initializers (Sequence[Initializer], optional): Initializers for the model. ``()`` for no initialization. - Default: ``()``. - - Returns: - deeplabv3: A DeepLabV3 :class:`torch.nn.Module`. - - Example: - - .. 
code-block:: python - - from composer.models.deeplabv3.deeplabv3 import deeplabv3 - - pytorch_model = deeplabv3(num_classes=150, backbone_arch='resnet101', backbone_weights=None) - """ - warnings.warn(DeprecationWarning('deeplabv3 is deprecated and will be removed in v0.18')) - - # check that the specified architecture is in the resnet module - if not hasattr(resnet, backbone_arch): - raise ValueError(f'backbone_arch must be part of the torchvision resnet module, got value: {backbone_arch}') - - # change the model weight url if specified - if version.parse(torchvision.__version__) < version.parse('0.13.0'): - pretrained = False - if backbone_weights: - pretrained = True - if backbone_weights == 'IMAGENET1K_V1': - resnet.model_urls[backbone_arch] = _IMAGENET1K_V1_URL # pyright: ignore[reportGeneralTypeIssues] - elif backbone_weights == 'IMAGENET1K_V2': - resnet.model_urls[backbone_arch] = _IMAGENET1K_V2_URL # pyright: ignore[reportGeneralTypeIssues] - else: - ValueError( - textwrap.dedent(f"""\ - `backbone_weights` must be either "IMAGENET1K_V1" or "IMAGENET1K_V2" - if torchvision.__version__ < 0.13.0. `backbone_weights` was {backbone_weights}.""")) - backbone = getattr(resnet, backbone_arch)(pretrained=pretrained, - replace_stride_with_dilation=[False, True, True]) - else: - backbone = getattr(resnet, backbone_arch)(weights=backbone_weights, - replace_stride_with_dilation=[False, True, True]) - - # specify which layers to extract activations from - return_layers = {'layer1': 'layer1', 'layer4': 'layer4'} if use_plus else {'layer4': 'layer4'} - backbone = _utils.IntermediateLayerGetter(backbone, return_layers=return_layers) - - try: - from mmseg.models import ASPPHead, DepthwiseSeparableASPPHead - except ImportError as e: - raise ImportError( - textwrap.dedent("""\ - Either mmcv or mmsegmentation is not installed. To install mmcv, please run pip install mmcv-full==1.4.4 -f - https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html where {cu_version} and - {torch_version} refer to your CUDA and PyTorch versions, respectively. To install mmsegmentation, please - run pip install mmsegmentation==0.22.0 on command-line.""")) from e - - world_size = dist.get_world_size() - if sync_bn and world_size == 1: - warnings.warn('sync_bn was true, but only one process is present for training. 
sync_bn will be ignored.') - - norm_type = 'SyncBN' if sync_bn and world_size > 1 else 'BN' - norm_cfg = {'type': norm_type, 'requires_grad': True} - if use_plus: - # mmseg config: - # https://github.com/open-mmlab/mmsegmentation/blob/master/configs/_base_/models/deeplabv3plus_r50-d8.py - head = DepthwiseSeparableASPPHead(in_channels=2048, - in_index=-1, - channels=512, - dilations=(1, 12, 24, 36), - c1_in_channels=256, - c1_channels=48, - dropout_ratio=0.1, - num_classes=num_classes, - norm_cfg=norm_cfg, - align_corners=False) - else: - # mmseg config: - # https://github.com/open-mmlab/mmsegmentation/blob/master/configs/_base_/models/deeplabv3_r50-d8.py - head = ASPPHead(in_channels=2048, - in_index=-1, - channels=512, - dilations=(1, 12, 24, 36), - dropout_ratio=0.1, - num_classes=num_classes, - norm_cfg=norm_cfg, - align_corners=False) - - model = SimpleSegmentationModel(backbone, head) - - if initializers: - for initializer in initializers: - initializer_fn = Initializer(initializer).get_initializer() - - # Only apply initialization to classifier head if pre-trained weights are used - if backbone_weights is None: - model.apply(initializer_fn) - else: - model.classifier.apply(initializer_fn) - - if sync_bn and world_size > 1: - local_world_size = dist.get_local_world_size() - - # List of ranks for each node, assumes that each node has the same number of ranks - num_nodes = world_size // local_world_size - process_group = None - if num_nodes > 1: - ranks_per_node = [ - list(range(node * local_world_size, (node + 1) * local_world_size)) for node in range(num_nodes) - ] - process_groups = [torch_dist.new_group(ranks) for ranks in ranks_per_node] - process_group = process_groups[dist.get_node_rank()] - - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=process_group) - - return model - - -def composer_deeplabv3(num_classes: int, - backbone_arch: str = 'resnet101', - backbone_weights: Optional[str] = None, - sync_bn: bool = True, - use_plus: bool = True, - ignore_index: int = -1, - cross_entropy_weight: float = 1.0, - dice_weight: float = 0.0, - initializers: Sequence[Initializer] = ()): - """Helper function to create a :class:`.ComposerClassifier` with a DeepLabv3(+) model. Logs - Mean Intersection over Union (MIoU) and Cross Entropy during training and validation. - - From `Rethinking Atrous Convolution for Semantic Image Segmentation `_ - (Chen et al, 2017). - - Args: - num_classes (int): Number of classes in the segmentation task. - backbone_arch (str, optional): The architecture to use for the backbone. Must be either - [``'resnet50'``, ``'resnet101'``]. Default: ``'resnet101'``. - backbone_weights (str, optional): If specified, the PyTorch pre-trained weights to load for the backbone. - Currently, only ['IMAGENET1K_V1', 'IMAGENET1K_V2'] are supported. Default: ``None``. - sync_bn (bool, optional): If ``True``, replace all BatchNorm layers with SyncBatchNorm layers. - Default: ``True``. - use_plus (bool, optional): If ``True``, use DeepLabv3+ head instead of DeepLabv3. Default: ``True``. - ignore_index (int): Class label to ignore when calculating the loss and other metrics. Default: ``-1``. - cross_entropy_weight (float): Weight to scale the cross entropy loss. Default: ``1.0``. - dice_weight (float): Weight to scale the dice loss. Default: ``0.0``. - initializers (List[Initializer], optional): Initializers for the model. ``[]`` for no initialization. - Default: ``[]``. 
- - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a DeepLabv3(+) model. - - Example: - - .. code-block:: python - - from composer.models import composer_deeplabv3 - - model = composer_deeplabv3(num_classes=150, backbone_arch='resnet101', backbone_weights=None) - """ - warnings.warn(DeprecationWarning('composer_deeplabv3 is deprecated and will be removed in v0.18')) - - model = deeplabv3(backbone_arch=backbone_arch, - backbone_weights=backbone_weights, - use_plus=use_plus, - num_classes=num_classes, - sync_bn=sync_bn, - initializers=initializers) - - train_metrics = MetricCollection( - [CrossEntropy(ignore_index=ignore_index), - MIoU(num_classes, ignore_index=ignore_index)]) - val_metrics = MetricCollection( - [CrossEntropy(ignore_index=ignore_index), - MIoU(num_classes, ignore_index=ignore_index)]) - - ce_loss_fn = functools.partial(soft_cross_entropy, ignore_index=ignore_index) - dice_loss_fn = DiceLoss(softmax=True, batch=True, ignore_absent_classes=True) - - def _combo_loss(output, target) -> Dict[str, torch.Tensor]: - loss = {'total': torch.zeros(1, device=output.device, dtype=output.dtype)} - if cross_entropy_weight: - loss['cross_entropy'] = ce_loss_fn(output, target) - loss['total'] += loss['cross_entropy'] * cross_entropy_weight - if dice_weight: - loss['dice'] = dice_loss_fn(output, target) - loss['total'] += loss['dice'] * dice_weight - return loss - - composer_model = ComposerClassifier(module=model, - train_metrics=train_metrics, - val_metrics=val_metrics, - loss_fn=_combo_loss) - return composer_model diff --git a/composer/models/efficientnetb0/README.md b/composer/models/efficientnetb0/README.md deleted file mode 100644 index 9cb1096bc6..0000000000 --- a/composer/models/efficientnetb0/README.md +++ /dev/null @@ -1,78 +0,0 @@ -# EfficientNet -[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`Vision` /`Image Classification` - -The EfficientNet model family is a set of convolutional neural networks that can be used as the basis for a variety of vision tasks, but were initially designed for image classification. The model family was designed to reach the highest accuracy for a given computation budget during inference by simultaneously scaling model depth, model width, and image resolution according to an empirically determined scaling law. - -## Example - -```python -from composer.models import composer_efficientnetb0 - -model = composer_efficientnetb0(num_classes=1000, drop_connect_rate=0.2) -``` - -## Architecture - -The table below from Tan and Le specifies the EfficientNet baseline architecture broken up into separate stages. MBConv indicates a mobile inverted bottleneck with a specific expansion size and kernel size. Resolution is the expected input resolution of the current stage. Number of channels is the number of output channels of the current stage. Number of layers indicates the number of repeated blocks in each stage. Subsequent EfficientNet family members scale the resolution, number of channels, and number of layers according to the resolution, width, and depth scaling parameters defined by Tan and Le. - -![efficientnet_arch.png](https://storage.googleapis.com/docs.mosaicml.com/images/models/efficientnet_arch.png) - -## Family members - -Tan and Le included 8 members in their model family. 
The goal was for each family member to have approximately double the FLOPs of the previous family member. Currently, we only support EfficientNet-B0. - -| Model Family Member | Parameter Count | TPU Repo Accuracy* | Our Accuracy** | Training Time on 8x3080 | -|---------------------|-----------------|--------------------|----------------|-------------------------| -| EfficientNet-B0 | 5.3M | 77.1% | 77.22% | 23.3 hr | -| EfficientNet-B1 | 7.8M | 79.1% | TBA | TBA | -| EfficientNet-B2 | 9.2M | 80.1% | TBA | TBA | -| EfficientNet-B3 | 12M | 81.6% | TBA | TBA | -| EfficientNet-B4 | 19M | 82.9% | TBA | TBA | -| EfficientNet-B5 | 30M | 83.6% | TBA | TBA | -| EfficientNet-B6 | 43M | 84.0% | TBA | TBA | -| EfficientNet-B7 | 66M | 84.3% | TBA | TBA | - -*Includes label smoothing, sample-wise stochastic depth, and AutoAugment - -**Includes label smoothing and sample-wise stochastic depth - -## Default Training Hyperparameters - -We use the following default hyperparameters from the [Nvidia Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet): - -```yaml -optimizer: - rmsprop: - lr: 0.08 - momentum: 0.9 - alpha: 0.9 - eps: 0.01 - weight_decay: 1.0e-5 -schedulers: - - cosine_decay_with_warmup: - t_warmup: "16ep" -train_batch_size: 4096 -max_duration: 400ep -``` - -Our implementation differs from the [Nvidia Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet) in that we: - -- Apply weight decay to batch normalization trainable parameters -- Use `momentum = 0.1` and `eps = 1e-5` as batch normalization parameters - -## Attribution - -Paper: [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le - -Code: [gen-efficientnet-pytorch Github repository](https://github.com/rwightman/gen-efficientnet-pytorch) by Ross Wightman - -Hyperparameters: [DeepLearningExamples Github repository](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet) by Nvidia - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.efficientnetb0.composer_efficientnetb0 - :noindex: -``` diff --git a/composer/models/efficientnetb0/__init__.py b/composer/models/efficientnetb0/__init__.py deleted file mode 100644 index d1101f595c..0000000000 --- a/composer/models/efficientnetb0/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The EfficientNet model family is a set of convolutional neural networks that can be used as the basis for a variety -of vision tasks, but were initially designed for image classification. The model family was designed to reach the -highest accuracy for a given computation budget during inference by simultaneously scaling model depth, model width, and -image resolution according to an empirically determined scaling law. - -See the :doc:`Model Card ` for more details. 
-""" -from composer.models.efficientnetb0.model import composer_efficientnetb0 as composer_efficientnetb0 - -__all__ = ['composer_efficientnetb0'] - -_task = 'Image Classification' -_dataset = 'ImageNet' -_name = 'EfficientNet-B0' -_quality = '76.63' -_metric = 'Top-1 Accuracy' -_ttt = '21h 48m' -_hparams = 'efficientnetb0.yaml' diff --git a/composer/models/efficientnetb0/_layers.py b/composer/models/efficientnetb0/_layers.py deleted file mode 100644 index 1dbf62450d..0000000000 --- a/composer/models/efficientnetb0/_layers.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -from typing import Callable, Optional - -import torch -from torch import nn as nn - - -def round_channels( - channels: float, - width_multiplier: float, - divisor: int = 8, - min_value: Optional[int] = None, -) -> int: - """Round number of channels after scaling with width multiplier. - - This function ensures that channel integers halfway in-between divisors is rounded up. - - Args: - channels (float): Number to round. - width_multiplier (float): Amount to scale `channels`. - divisor (int): Number to make the output divisible by. - min_value (int, optional): Minimum value the output can be. If not specified, defaults - to the ``divisor``. - """ - if not width_multiplier: - return int(channels) - channels *= width_multiplier - - min_value = min_value or divisor - new_channels = max(min_value, int(channels + divisor / 2) // divisor * divisor) - if new_channels < 0.9 * channels: # increase channels if rounding decreases by >10% - new_channels += divisor - return new_channels - - -def calculate_same_padding(kernel_size, dilation, stride): - """Calculates the amount of padding to use to get the "SAME" functionality in Tensorflow.""" - return ((stride - 1) + dilation * (kernel_size - 1)) // 2 - - -def drop_connect(inputs: torch.Tensor, drop_connect_rate: float, training: bool): - """Randomly mask a set of samples. Provides similar regularization as stochastic depth. - - Args: - input (torch.Tensor): Input tensor to mask. - drop_connect_rate (float): Probability of droppping each sample. - training (bool): Whether or not the model is training - """ - if not training: - return inputs - - keep_prob = 1 - drop_connect_rate - rand_tensor = keep_prob + torch.rand( - [inputs.size()[0], 1, 1, 1], - dtype=inputs.dtype, - device=inputs.device, - ) - rand_tensor.floor_() # binarize - output = inputs.div(keep_prob) * rand_tensor - return output - - -class SqueezeExcite(nn.Module): - """Squeeze Excite Layer. - - Args: - in_channels (int): Number of channels in the input tensor. - latent_channels (int): Number of hidden channels. - act_layer (torch.nn.Module): Activation layer to use in block. - """ - - def __init__( - self, - in_channels: int, - latent_channels: int, - act_layer: Callable[..., nn.Module] = nn.ReLU, - ): - super().__init__() - - self.global_avg_pool = nn.AdaptiveAvgPool2d(1) - self.conv_reduce = nn.Conv2d(in_channels, latent_channels, kernel_size=1, bias=True) - self.act1 = act_layer(inplace=True) - self.conv_expand = nn.Conv2d(latent_channels, in_channels, kernel_size=1, bias=True) - self.gate_fn = torch.nn.Sigmoid() - - def forward(self, x: torch.Tensor): - out = self.global_avg_pool(x) - out = self.conv_reduce(out) - out = self.act1(out) - out = self.conv_expand(out) - out = x * self.gate_fn(out) - return out - - -class DepthwiseSeparableConv(nn.Module): - """Depthwise Separable Convolution layer. 
- - Args: - in_channels (int): Number of channels in the input tensor. - out_channels (int): Number of channels in the output tensor. - kernel_size (int): Size of the convolving kernel. - stride (int): Stride of the convolution. - se_ratio (float): How much to scale `in_channels` for the hidden layer - dimensionality of the squeeze-excite module. - drop_connect_rate (float): Probability of dropping a sample before the - identity connection, provides regularization similar to stochastic - depth. - act_layer (torch.nn.Module): Activation layer to use in block. - norm_kwargs (dict): Normalization layer's keyword arguments. - norm_layer (torch.nn.Module): Normalization layer to use in block. - """ - - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: int, - stride: int, - se_ratio: float, - drop_connect_rate: float, - act_layer: Callable[..., nn.Module], - norm_kwargs: dict, - norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d): - super().__init__() - self.drop_connect_rate = drop_connect_rate - self.has_residual = (in_channels == out_channels and stride == 1) - self.has_se = se_ratio > 0.0 - - padding = calculate_same_padding(kernel_size, dilation=1, stride=stride) - self.conv_depthwise = nn.Conv2d(in_channels=in_channels, - out_channels=in_channels, - groups=in_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=False) - self.bn1 = norm_layer(in_channels, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - if self.has_se: - latent_channels = max(1, int(in_channels * se_ratio)) - self.se = SqueezeExcite(in_channels, latent_channels, act_layer) - - self.conv_pointwise = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=1, - bias=False, - ) - self.bn2 = norm_layer(out_channels, **norm_kwargs) - self.act2 = act_layer(inplace=True) - - def forward(self, input: torch.Tensor): - residual = input - - out = self.conv_depthwise(input) - out = self.bn1(out) - out = self.act1(out) - - if self.has_se: - out = self.se(out) - - out = self.conv_pointwise(out) - out = self.bn2(out) - out = self.act2(out) - - if self.has_residual: - if self.drop_connect_rate > 0.0: - out = drop_connect(out, self.drop_connect_rate, self.training) - out += residual - return out - - -class MBConvBlock(nn.Module): - """Mobile Inverted Residual Bottleneck Block. - - This block is implemented as as defined in - `MobileNetV2: Inverted Residuals and Linear Bottlenecks `_ (Sandler et al, 2018). - - Args: - in_channels (int): Number of channels in the input tensor. - out_channels (int): Number of channels in the output tensor. - kernel_size (int): Size of the convolving kernel. - stride (int): Stride of the convolution. - expand_ratio (int): How much to expand the input channels for the - depthwise convolution. - se_ratio (float): How much to scale `in_channels` for the hidden layer - dimensionality of the squeeze-excite module. - drop_connect_rate (float): Probability of dropping a sample before the - identity connection, provides regularization similar to stochastic - depth. - act_layer (torch.nn.Module): Activation layer to use in block. - norm_kwargs (dict): Normalization layer's keyword arguments. - norm_layer (torch.nn.Module): Normalization layer to use in block. 
- """ - - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: int, - stride: int, - expand_ratio: int, - se_ratio: float, - drop_connect_rate: float, - act_layer: Callable[..., nn.Module], - norm_kwargs: dict, - norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d): - super().__init__() - self.drop_connect_rate = drop_connect_rate - self.has_residual = (in_channels == out_channels and stride == 1) - self.has_se = se_ratio > 0.0 - - mid_channels = round_channels(in_channels, expand_ratio) - - # Point-wise convolution expansion - self.conv1x1_expand = nn.Conv2d(in_channels, mid_channels, kernel_size=1, bias=False) - self.bn1 = norm_layer(mid_channels, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Depth-wise Convolution - padding = calculate_same_padding(kernel_size, dilation=1, stride=stride) - self.conv_depthwise = nn.Conv2d(in_channels=mid_channels, - out_channels=mid_channels, - groups=mid_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=False) - self.bn2 = norm_layer(mid_channels, **norm_kwargs) - self.act2 = act_layer(inplace=True) - - # Squeeze and Excitation layer, if specified - if self.has_se: - latent_channels = max(1, int(in_channels * se_ratio)) - self.se = SqueezeExcite(mid_channels, latent_channels, act_layer) - - # Point-wise convolution contraction - self.conv1x1_contract = nn.Conv2d(mid_channels, out_channels, kernel_size=1, bias=False) - self.bn3 = norm_layer(out_channels, **norm_kwargs) - - def forward(self, input: torch.Tensor): - residual = input - - out = self.conv1x1_expand(input) - out = self.bn1(out) - out = self.act1(out) - - out = self.conv_depthwise(out) - out = self.bn2(out) - out = self.act2(out) - - if self.has_se: - out = self.se(out) - - out = self.conv1x1_contract(out) - out = self.bn3(out) - - if self.has_residual: - if self.drop_connect_rate: - out = drop_connect(out, self.drop_connect_rate, self.training) - out += residual - return out diff --git a/composer/models/efficientnetb0/efficientnets.py b/composer/models/efficientnetb0/efficientnets.py deleted file mode 100644 index 7c544a5143..0000000000 --- a/composer/models/efficientnetb0/efficientnets.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""EfficientNet model. - -Adapted from `(Generic) EfficientNets for PyTorch. `_. -""" - -import math -import re -import warnings -from typing import Callable, Optional - -import torch -import torch.nn as nn - -from composer.models.efficientnetb0._layers import (DepthwiseSeparableConv, MBConvBlock, calculate_same_padding, - round_channels) - -__all__ = ['EfficientNet'] - - -class EfficientNet(nn.Module): - """EfficientNet model based on (`Tan et al, 2019 `_). - - Args: - num_classes (int): Size of the EfficientNet output, typically viewed - as the number of classes in a classification task. - width_multiplier (float, optional): How much to scale the EfficientNet-B0 channel - dimension throughout the model. Default: ``1.0``. - depth_multiplier (float, optional): How much to scale the EFficientNet-B0 depth. Default: ``1.0``. - drop_rate (float, optional): Dropout probability for the penultimate activations. Default: ``0.2``. - drop_connect_rate (float, optional): Probability of dropping a sample before the - identity connection, provides regularization similar to stochastic - depth. Default: ``0.2``. - act_layer (torch.nn.Module, optional): Activation layer to use in the model. Default: ``nn.SiLU``. 
- norm_kwargs (dict, optional): Normalization layer's keyword arguments. Default: ``{"momentum": 0.1, "eps": 1e-5}``. - norm_layer (torch.nn.Module, optional): Normalization layer to use in the model. Default: ``nn.BatchNorm2d``. - """ - - # EfficientNet-B0 architecture specification. - # block_strings are decoded into block level hyperparameters. - # r=repeat, k=kernel_size, s=stride, e=expand_ratio, i=in_channels, o=out_channels, se=se_ratio. - _blocks_strings = [ - 'r1_k3_s1_e1_i32_o16_se0.25', - 'r2_k3_s2_e6_i16_o24_se0.25', - 'r2_k5_s2_e6_i24_o40_se0.25', - 'r3_k3_s2_e6_i40_o80_se0.25', - 'r3_k5_s1_e6_i80_o112_se0.25', - 'r4_k5_s2_e6_i112_o192_se0.25', - 'r1_k3_s1_e6_i192_o320_se0.25', - ] - - def __init__(self, - num_classes: int, - width_multiplier: float = 1.0, - depth_multiplier: float = 1.0, - drop_rate: float = 0.2, - drop_connect_rate: float = 0.2, - act_layer: Callable[..., nn.Module] = nn.SiLU, - norm_kwargs: Optional[dict] = None, - norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d): - warnings.warn(DeprecationWarning('EfficientNet is deprecated and will be removed in v0.18')) - - super(EfficientNet, self).__init__() - self.num_classes = num_classes - - if norm_kwargs is None: - norm_kwargs = {'momentum': 0.1, 'eps': 1e-5} - - in_channels = 3 - out_channels = round_channels(32, width_multiplier) - padding = calculate_same_padding(kernel_size=3, dilation=1, stride=2) - self.conv_stem = nn.Conv2d( - in_channels, - out_channels, - kernel_size=3, - stride=2, - padding=padding, - bias=False, - ) - self.bn1 = norm_layer(num_features=out_channels, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Count the number of blocks in the model - block_count = 0. - for block_string in self._blocks_strings: - _, num_repeat = self._decode_block_string(block_string) - block_count += num_repeat - - # Decode block strings and add blocks - block_idx = 0. 
- blocks = [] - block_args = {} - for block_string in self._blocks_strings: - block_args, num_repeat = self._decode_block_string(block_string) - # Scale channels and number of repeated blocks based on multipliers - block_args['in_channels'] = round_channels( - block_args['in_channels'], - width_multiplier, - ) - block_args['out_channels'] = round_channels( - block_args['out_channels'], - width_multiplier, - ) - num_repeat = int(math.ceil(depth_multiplier * num_repeat)) - - # Add activation, normalization layers, and drop connect - block_args['act_layer'] = act_layer - block_args['norm_kwargs'] = norm_kwargs - block_args['norm_layer'] = norm_layer - - # Delete expand_ratio when set to 1 to use depthwise separable convolution layer - if block_args['expand_ratio'] == 1: - del block_args['expand_ratio'] - - for i in range(num_repeat): - # Linearly decay drop_connect_rate across model depth - block_args['drop_connect_rate'] = drop_connect_rate * block_idx / block_count - - if 'expand_ratio' not in block_args: - blocks.append(DepthwiseSeparableConv(**block_args)) - else: - blocks.append(MBConvBlock(**block_args)) - block_idx += 1 - - # Only the first block in a stage can have stride != 1 - if i == 0: - block_args['stride'] = 1 - block_args['in_channels'] = block_args['out_channels'] - - self.blocks = nn.Sequential(*blocks) - - in_channels = block_args['out_channels'] - out_channels = round_channels(1280, width_multiplier) - self.conv_head = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False) - self.bn2 = norm_layer(out_channels, **norm_kwargs) - self.act2 = act_layer(inplace=True) - - self.global_avg_pool = nn.AdaptiveAvgPool2d(1) - self.dropout = nn.Dropout(drop_rate) - self.classifier = nn.Linear(out_channels, num_classes) - - # Initialization from gen-efficientnet-pytorch repo - for m in self.modules(): - if isinstance(m, torch.nn.Conv2d): - fan_out = (m.kernel_size[0] * m.kernel_size[1] * m.out_channels) // m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() - elif isinstance(m, torch.nn.BatchNorm2d): - m.weight.data.fill_(1.0) - m.bias.data.zero_() - elif isinstance(m, torch.nn.Linear): - fan_out = m.weight.size(0) - init_range = 1.0 / math.sqrt(fan_out) - m.weight.data.uniform_(-init_range, init_range) - m.bias.data.zero_() - - def extract_features(self, input: torch.Tensor): - out = self.conv_stem(input) - out = self.bn1(out) - out = self.act1(out) - out = self.blocks(out) - out = self.conv_head(out) - out = self.bn2(out) - out = self.act2(out) - out = self.global_avg_pool(out) - return out.flatten(1) - - def forward(self, input: torch.Tensor): - out = self.extract_features(input) - out = self.dropout(out) - return self.classifier(out) - - @staticmethod - def get_model_from_name(model_name: str, num_classes, drop_connect_rate: float): - """Instantiate an EfficientNet model family member based on the model_name string. - - Args: - model_name: (str): One of ``'efficientnet-b0'`` through ``'efficientnet-b7'``. - num_classes (int): Size of the EfficientNet output, typically viewed as the number of classes in a classification task. - drop_connect_rate (float): Probability of dropping a sample before the identity connection, - provides regularization similar to stochastic depth. 
- """ - - # Coefficients: width, depth, res, dropout - model_arch = { - 'efficientnet-b0': (1.0, 1.0, 224, 0.2), - 'efficientnet-b1': (1.0, 1.1, 240, 0.2), - 'efficientnet-b2': (1.1, 1.2, 260, 0.3), - 'efficientnet-b3': (1.2, 1.4, 300, 0.3), - 'efficientnet-b4': (1.4, 1.8, 380, 0.4), - 'efficientnet-b5': (1.6, 2.2, 456, 0.4), - 'efficientnet-b6': (1.8, 2.6, 528, 0.5), - 'efficientnet-b7': (2.0, 3.1, 600, 0.5), - } - - model_params = model_arch[model_name] - width_multiplier = model_params[0] - depth_multiplier = model_params[1] - drop_rate = model_params[3] - return EfficientNet(num_classes=num_classes, - width_multiplier=width_multiplier, - depth_multiplier=depth_multiplier, - drop_rate=drop_rate, - drop_connect_rate=drop_connect_rate) - - def _decode_block_string(self, block_string: str): - """Decodes an EfficientNet block specification string into a dictionary of keyword arguments for a block in the - architecture.""" - - arg_strings = block_string.split('_') - args = {} - for arg_string in arg_strings: - splits = re.split(r'(\d.*)', arg_string) - if len(splits) >= 2: - key, value = splits[:2] - args[key] = value - num_repeat = int(args['r']) - block_args = { - 'kernel_size': int(args['k']), - 'stride': int(args['s']), - 'expand_ratio': int(args['e']), - 'in_channels': int(args['i']), - 'out_channels': int(args['o']), - 'se_ratio': float(args['se']) if 'se' in args else None, - } - return block_args, num_repeat diff --git a/composer/models/efficientnetb0/model.py b/composer/models/efficientnetb0/model.py deleted file mode 100644 index 67ae193895..0000000000 --- a/composer/models/efficientnetb0/model.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A :class:`.ComposerClassifier` wrapper around the EfficientNet-b0 architecture.""" - -import warnings - -from composer.models.efficientnetb0.efficientnets import EfficientNet -from composer.models.tasks import ComposerClassifier - -__all__ = ['composer_efficientnetb0'] - - -def composer_efficientnetb0(num_classes: int = 1000, drop_connect_rate: float = 0.2) -> ComposerClassifier: - """Helper function to create a :class:`.ComposerClassifier` with an EfficientNet-b0 architecture. - - See `Rethinking Model Scaling for Convolutional Neural Networks `_ - (Tan et al, 2019) for more details. - - Args: - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``. - drop_connect_rate (float, optional): Probability of dropping a sample within a block before identity - connection. Default: ``0.2``. - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a EfficientNet-B0 model. - - - Example: - - .. 
testcode:: - - from composer.models import composer_efficientnetb0 - - model = composer_efficientnetb0() # creates EfficientNet-b0 for image classification - """ - warnings.warn(DeprecationWarning('composer_efficientnetb0 is deprecated and will be removed in v0.18')) - model = EfficientNet.get_model_from_name(model_name='efficientnet-b0', - num_classes=num_classes, - drop_connect_rate=drop_connect_rate) - - composer_model = ComposerClassifier(module=model) - return composer_model diff --git a/composer/models/gpt2/README.md b/composer/models/gpt2/README.md deleted file mode 100644 index 52ee26a97f..0000000000 --- a/composer/models/gpt2/README.md +++ /dev/null @@ -1,81 +0,0 @@ -# GPT-2 -[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`NLP` / ``Autoregressive Language Modeling`` - -The GPT-2 model family is set of transformer-based networks for autoregressive language modeling at various scales. This family was originally proposed by OpenAI, and is trained on the OpenWebText dataset. It is useful for downstream language generation tasks, such as summarization, translation, and dialog. - -Our codebase builds off of the Hugging Face *[Transformers](https://huggingface.co/transformers/)* library. We initialize Huggingface's GPT-2 model with one of our configurations. - -## Example - - - -```python -import transformers -from composer.models import GPT2Model - -model = GPT2Model(module=transformers.AutoModelForCausalLM.from_pretrained("gpt2"), - config=transformers.GPT2Config.from_pretrained("gpt2"), - tokenizer_name="gpt2") -``` - -## Architecture - -GPT-2 consists of a a decoder-only Transformer parameterized by $n_{layer}$, $d_{model}$, $d_{ff}$, $d_{attn}$ and $n_{heads}$. The parameters for each model family member can be seen below: - -| Name | $n_{layer}$ | $d_{model}$ | $d_{ff}$ | $d_{attn}$ | $n_{heads}$ | -|------------|-------------|-------------|----------|------------|-------------| -| GPT-2 52M | 8 | 512 | 2048 | 8 | 8 | -| GPT-2 83M | 10 | 640 | 2560 | 640 | 10 | -| GPT-2 125M | 12 | 768 | 3072 | 768 | 12 | - -## Family Members - -We implement three members of this family at different scales: GPT 52M, GPT 83M, and GPT 125M. These models are named after their parameter counts. We selected these particular configurations because (1) they represent points on the pareto frontier of the scaling law for language models as described by [Kaplan et al. at OpenAI](https://arxiv.org/abs/2001.08361) and (2) they are small enough to rapidly iterate on methods using a single GPU node. - -| Model Family Member | Parameters | Training Hours on 8xA100s | Training Tokens | Final Loss | Predicted Perplexity | Actual Perplexity | -|---------------------|------------|---------------------------|-----------------|------------|----------------------|-------------------| -| GPT-2 52M | 53.9M | 02:44 | 4.6B | 3.43 | 32.54 | 30.88 | -| GPT-2 83M | 85.8M | 04:52 | 5.5B | 3.28 | 27.84 | 26.57 | -| GPT-2 125M | 114M | 08:25 | 6.7B | 3.18 | 24.64 | 24.04 | - - -There are two ways of varying the amount of time necessary to train a model or the cost necessary to do so: varying the size of the model or varying the number of steps (and therefore data) for which the model is trained. With the GPT family of models, we explore both of these axes. 
To develop methods for these models, we generally begin with the smallest members of this model family for initial experimentation and scale up once the ideas have been refined. - -To explore tradeoffs between quality and number of training steps: we have ablated both number of training steps, and number of data points to train on. We do this by checkpointing the model throughout training. - -To explore tradeoffs between quality and the size of the model, we use [Scaling Laws for Neural Language Models](https://arxiv.org/abs/2001.08361) to provide suggestions on model capacity and dataset size, and then sweep hyperparameters such as learning rate and batch size to minimize loss. - - -## Attribution - -The GPT model family is described in *[Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)* by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. - -The Scaling Law that we use to choose the members of this model family are described in *[Scaling Laws for Neural Language Models](https://arxiv.org/abs/2001.08361)* by Jared Kaplan, Sam McCandish, Tom Henighan, Tom B. Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. - -## Default Training Hyperparameters - -Below are hyperparameters we used to train GPT-2 125M on [OpenWebText](https://huggingface.co/datasets/openwebtext). - -```yaml -optimizer: - adamw: - lr: 6.0e-4 - betas: - - 0.9 - - 0.999 - eps: 1.0e-08 - weight_decay: 0.0 -schedulers: - - cosine_decay_with_warmup: - t_warmup: 140ba -train_batch_size: 512 -``` - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.gpt2.GPT2Model - :noindex: -``` diff --git a/composer/models/gpt2/__init__.py b/composer/models/gpt2/__init__.py deleted file mode 100644 index 1ae37b122a..0000000000 --- a/composer/models/gpt2/__init__.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The GPT-2 model family is set of transformer-based networks for autoregressive language modeling at various scales. -This family was originally proposed by OpenAI, and is trained on the OpenWebText dataset. It is useful for downstream -language generation tasks, such as summarization, translation, and dialog. - -See the :doc:`Model Card ` for more details. -""" - -from composer.models.gpt2.model import create_gpt2 as create_gpt2 - -__all__ = ['create_gpt2'] - -_metadata = { - 'gpt2': { - '_task': 'Language Modeling', - '_dataset': 'OpenWebText', - '_name': 'GPT-2 52M', - '_quality': '30.88', - '_metric': 'Perplexity', - '_ttt': '02:44', - '_hparams': 'gpt2_52m.yaml' - }, - 'gpt2 -- TODO RENAME TO GPT2': { - '_task': 'Language Modeling', - '_dataset': 'OpenWebText', - '_name': 'GPT-2 83M', - '_quality': '26.57', - '_metric': 'Perplexity', - '_ttt': '04:52', - '_hparams': 'gpt2_83m.yaml' - }, - 'gpt2 --! TODO RENAME TO GPT2': { - '_task': 'Language Modeling', - '_dataset': 'OpenWebText', - '_name': 'GPT-2 125M', - '_quality': '24.04', - '_metric': 'Perplexity', - '_ttt': '08:25', - '_hparams': 'gpt2_125m.yaml' - } -} diff --git a/composer/models/gpt2/model.py b/composer/models/gpt2/model.py deleted file mode 100644 index ea924b7b99..0000000000 --- a/composer/models/gpt2/model.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""GPT-2 model based on `Hugging Face GPT-2 `_. 
- -Implemented as a wrapper using :class:`.ComposerTrainer`. -""" - -from __future__ import annotations - -import warnings -from typing import Optional - -from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity -from composer.models.huggingface import HuggingFaceModel -from composer.utils.import_helpers import MissingConditionalImportError - -__all__ = ['create_gpt2'] - - -def create_gpt2(use_pretrained: Optional[bool] = False, - pretrained_model_name: Optional[str] = None, - model_config: Optional[dict] = None, - tokenizer_name: Optional[str] = None, - gradient_checkpointing: Optional[bool] = False): - """Implements :class:`~composer.models.huggingface.HuggingFaceModel` to wrap `Hugging Face GPT-2 \ - transformers `_. Logs training and - validation perplexity. - - From `Language Models are Unsupervised Multitask Learners `_ (Radford et al, 2018). - - Args: - - gradient_checkpointing (bool, optional): Use gradient checkpointing. Default: ``False``. - use_pretrained (bool, optional): Whether to initialize the model with the pretrained weights. Default: ``False``. - model_config (dict): A dictionary providing a HuggingFace model configuration. - tokenizer_name (str, optional): Tokenizer name used to preprocess the dataset - and validate the models inputs. - - .. code-block:: - - { - "_name_or_path": "gpt2", - "activation_function": "gelu_new", - "architectures": ["GPT2LMHeadModel"], - "attn_pdrop": 0.1, - "bos_token_id": 50256, - "embd_pdrop": 0.1, - "eos_token_id": 50256, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, - "n_head": 12, - "n_inner": null, - "n_layer": 12, - "n_positions": 1024, - "reorder_and_upcast_attn": false, - "resid_pdrop": 0.1, - "scale_attn_by_inverse_layer_idx": false, - "scale_attn_weights": true, - "summary_activation": null, - "summary_first_dropout": 0.1, - "summary_proj_to_labels": true, - "summary_type": "cls_index", - "summary_use_proj": true, - "task_specific_params": { - "text-generation": { - "do_sample": true, - "max_length": 50 } - }, - "transformers_version": "4.16.0", - "use_cache": true, - "vocab_size": 50257 - } - - To create a GPT-2 model for language modeling pretraining: - - .. 
testcode:: - - from composer.models import create_gpt2 - - composer_model = create_gpt2() - - """ - warnings.warn(DeprecationWarning('create_gpt2 is deprecated and will be removed in v0.18')) - - try: - import transformers - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='nlp', conda_package='transformers') from e - - if not model_config: - model_config = {} - - if not pretrained_model_name: - pretrained_model_name = 'gpt2' - - if use_pretrained: - assert transformers.AutoModelForCausalLM.from_pretrained is not None, 'AutoModelForCausalLM has from_pretrained method' - model = transformers.AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, - **model_config) - else: - config = transformers.AutoConfig.from_pretrained(pretrained_model_name, **model_config) - assert transformers.AutoModelForCausalLM.from_config is not None, 'AutoModelForCausalLM has from_config method' - model = transformers.AutoModelForCausalLM.from_config(config) - - if gradient_checkpointing: - model.gradient_checkpointing_enable() # type: ignore - - # setup the tokenizer - if tokenizer_name: - tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name) - else: - tokenizer = None - - return HuggingFaceModel(model=model, - tokenizer=tokenizer, - metrics=[LanguageCrossEntropy(), LanguagePerplexity()], - use_logits=True) diff --git a/composer/models/mmdetection.py b/composer/models/mmdetection.py deleted file mode 100644 index 2e53aac543..0000000000 --- a/composer/models/mmdetection.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A wrapper class that converts mmdet detection models to composer models""" - -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING, Any, List, Optional - -import numpy as np -import torch -from torchmetrics import Metric -from torchmetrics.collections import MetricCollection - -from composer.models import ComposerModel - -if TYPE_CHECKING: - import mmdet - -__all__ = ['MMDetModel'] - - -class MMDetModel(ComposerModel): - """A wrapper class that adapts mmdetection detectors to composer models. - - Args: - model (mmdet.models.detectors.BaseDetector): An MMdetection Detector. - metrics (list[Metric], optional): list of torchmetrics to apply to the output of `eval_forward`. Default: ``None``. - - .. warning:: This wrapper is designed to work with mmdet datasets. - - Example: - - .. 
code-block:: python - - from mmdet.models import build_model - from mmcv import ConfigDict - from composer.models import MMDetModel - - yolox_s_config = dict( - type='YOLOX', - input_size=(640, 640), - random_size_range=(15, 25), - random_size_interval=10, - backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5), - neck=dict(type='YOLOXPAFPN', in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1), - bbox_head=dict(type='YOLOXHead', num_classes=num_classes, in_channels=128, feat_channels=128), - train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), - test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65))) - yolox = build_model(ConfigDict(yolox_s_config)) - yolox.init_weights() - model = MMDetModel(yolox) - """ - - def __init__( - self, - model: mmdet.models.detectors.BaseDetector, # type: ignore - metrics: Optional[List[Metric]] = None) -> None: - warnings.warn(DeprecationWarning('MMDetModel is deprecated and will be removed in v0.18')) - super().__init__() - self.model = model - - self.train_metrics = None - self.val_metrics = None - - if metrics: - metric_collection = MetricCollection(metrics) - self.train_metrics = metric_collection.clone(prefix='train_') - self.val_metrics = metric_collection.clone(prefix='val_') - - def forward(self, batch): - # this will return a dictionary of losses in train mode and model outputs in test mode. - return self.model(**batch) - - def loss(self, outputs, batch, **kwargs): - return outputs - - def eval_forward(self, batch, outputs: Optional[Any] = None): - """ - Args: - batch (dict): a eval batch of the format: - - - ``img`` (List[torch.Tensor]): list of image torch.Tensors of shape (batch, c, h , w). - - - ``img_metas`` (List[Dict]): (1, batch_size) list of ``image_meta`` dicts. - Returns: model predictions: A batch_size length list of dictionaries containg detection boxes in (x,y, x2, y2) format, class labels, and class probabilities. 
- """ - device = batch['img'][0].device - batch.pop('gt_labels') - batch.pop('gt_bboxes') - results = self.model(return_loss=False, rescale=True, **batch) # models behave differently in eval mode - - # outputs are a list of bbox results (x, y, x2, y2, score) - # pack mmdet bounding boxes and labels into the format for torchmetrics MAP expects - preds = [] - for bbox_result in results: - boxes_scores = np.vstack(bbox_result) - boxes, scores = torch.from_numpy(boxes_scores[..., :-1]).to(device), torch.from_numpy( - boxes_scores[..., -1]).to(device) - labels = [np.full(result.shape[0], i, dtype=np.int32) for i, result in enumerate(bbox_result)] - pred = { - 'labels': torch.from_numpy(np.concatenate(labels)).to(device).long(), - 'boxes': boxes.float(), - 'scores': scores.float() - } - preds.append(pred) - return preds - - def get_metrics(self, is_train: bool = False): - if is_train: - metrics = self.train_metrics - else: - metrics = self.val_metrics - return metrics if metrics else {} - - def update_metric(self, batch: Any, outputs: Any, metric: Metric): - targets_box = batch.pop('gt_bboxes')[0] - targets_cls = batch.pop('gt_labels')[0] - targets = [] - for i in range(len(targets_box)): - t = {'boxes': targets_box[i], 'labels': targets_cls[i]} - targets.append(t) - metric.update(outputs, targets) diff --git a/composer/models/resnet/README.md b/composer/models/resnet/README.md deleted file mode 100644 index 430dd303b4..0000000000 --- a/composer/models/resnet/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# 🏙️ ResNet -[\[How to Use\]](#how-to-use) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`Vision` / `Image Classification` - -The ResNet model family is a set of convolutional neural networks that can be used as a basis for a variety of vision tasks. Our implementation is a simple wrapper on top of the [torchvision ResNet implementation](https://pytorch.org/vision/stable/models.html). - -## How to Use - -```python -from composer.models import composer_resnet - -model = composer_resnet( - model_name="resnet50", - num_classes=1000, - weights=None -) -``` - -## Architecture - -The basic architecture defined in the original papers is as follows: - -- The first layer is a 7x7 Convolution with stride 2 and 64 filters. -- Subsequent layers follow 4 stages with {64, 128, 256, 512} input channels with a varying number of residual blocks at each stage that depends on the family member. At the end of every stage, the resolution is reduced by half using a convolution with stride 2. -- The final section consists of a global average pooling followed by a linear + softmax layer that outputs values for the specified number of classes. - -The below table from [He et al.](https://arxiv.org/abs/1512.03385) details some of the building blocks for ResNets of different sizes. - -![resnet.png](https://storage.googleapis.com/docs.mosaicml.com/images/models/resnet.png) - -## Family Members - -ResNet family members are identified by their number of layers. Parameter count, accuracy, and training time are provided below. 
- -| Model Family Members | Parameter Count | Our Accuracy | Training Time on 8xA100s | -|----------------------|-----------------|--------------|--------------------------| -| ResNet-18 | 11.5M | TBA | TBA | -| ResNet-34 | 21.8M | TBA | TBA | -| ResNet-50 | 25.6M | 76.5% | 3.83 hrs | -| ResNet-101 | 44.5M | 78.1% | 5.50 hrs | -| ResNet-152 | 60.2M | TBA | TBA | - - -> ❗ **Note**: Please see the [CIFAR ResNet model card](https://docs.mosaicml.com/projects/composer/en/stable/model_cards/cifar_resnet.html#architecture) for the differences between CIFAR and ImageNet ResNets. - -## Default Training Hyperparameters - -- Optimizer: Decoupled SGDW - - Learning rate: 2.048 - Momentum: 0.875 - Weight_decay: 5.0e-4 -- LR schedulers: - - Cosine decay with warmup for 8 epochs -- Batch size: 2048 -- Number of epochs: 90ep - -## Attribution - -Paper: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun - -Code and hyperparameters: [DeepLearningExamples Github repository](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5) by Nvidia - -## API Reference - -```{eval-rst} -.. autofunction:: composer.models.resnet.model.composer_resnet - :noindex: -``` diff --git a/composer/models/resnet/__init__.py b/composer/models/resnet/__init__.py deleted file mode 100644 index e00a37035b..0000000000 --- a/composer/models/resnet/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The ResNet model family is a set of convolutional neural networks described in `Deep Residual Learning for Image -Recognition `_ (He et al, 2015). ResNets can be used as the base for a variety of -vision tasks. ImageNet ResNets are a subset of the ResNet family which were designed specifically for classification on -the ImageNet dataset. - -See the :doc:`Model Card ` for more details. 
-""" -from composer.models.resnet.model import composer_resnet - -__all__ = ['composer_resnet'] - -_metadata = { - 'resnet18': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet18', - '_quality': 'TBD', - '_metric': 'Top-1 Accuracy', - '_ttt': 'TBD', - '_hparams': 'resnet18.yaml' - }, - 'resnet34': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet34', - '_quality': 'TBD', - '_metric': 'Top-1 Accuracy', - '_ttt': 'TBD', - '_hparams': 'resnet34.yaml' - }, - 'resnet50': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet50', - '_quality': '76.51', - '_metric': 'Top-1 Accuracy', - '_ttt': '3h 33m', - '_hparams': 'resnet50.yaml' - }, - 'resnet101': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet101', - '_quality': '78.10', - '_metric': 'Top-1 Accuracy', - '_ttt': '8h 15m', - '_hparams': 'resnet101.yaml', - }, - 'resnet152': { - '_task': 'Image Classification', - '_dataset': 'ImageNet', - '_name': 'ResNet152', - '_quality': 'TBD', - '_metric': 'Top-1 Accuracy', - '_ttt': 'TBD', - '_hparams': 'resnet152.yaml' - } -} diff --git a/composer/models/resnet/model.py b/composer/models/resnet/model.py deleted file mode 100644 index 5b023fabcf..0000000000 --- a/composer/models/resnet/model.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A :class:`.ComposerClassifier` wrapper around the torchvision implementations of the ResNet model family.""" - -import logging -import warnings -from typing import List, Optional - -from torchmetrics import MetricCollection -from torchmetrics.classification import MulticlassAccuracy -from torchvision.models import resnet - -from composer.loss import loss_registry -from composer.metrics import CrossEntropy -from composer.models.initializers import Initializer -from composer.models.tasks import ComposerClassifier - -__all__ = ['composer_resnet'] - -log = logging.getLogger(__name__) - -valid_model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] - - -def composer_resnet(model_name: str, - num_classes: int = 1000, - weights: Optional[str] = None, - groups: int = 1, - width_per_group: int = 64, - initializers: Optional[List[Initializer]] = None, - loss_name: str = 'soft_cross_entropy') -> ComposerClassifier: - """Helper function to create a :class:`.ComposerClassifier` with a torchvision ResNet model. - - From `Deep Residual Learning for Image Recognition `_ (He et al, 2015). - - Args: - model_name (str): Name of the ResNet model instance. Either [``"resnet18"``, ``"resnet34"``, ``"resnet50"``, ``"resnet101"``, - ``"resnet152"``]. - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``. - weights (str, optional): If provided, pretrained weights can be specified, such as with ``IMAGENET1K_V2``. Default: ``None``. - groups (int, optional): Number of filter groups for the 3x3 convolution layer in bottleneck blocks. Default: ``1``. - width_per_group (int, optional): Initial width for each convolution group. Width doubles after each stage. - Default: ``64``. - initializers (List[Initializer], optional): Initializers for the model. ``None`` for no initialization. - Default: ``None``. - loss_name (str, optional): Loss function to use. E.g. 'soft_cross_entropy' or - 'binary_cross_entropy_with_logits'. Loss function must be in - :mod:`~composer.loss.loss`. Default: ``'soft_cross_entropy'``". 
- Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a torchvision ResNet model. - - Example: - - .. testcode:: - - from composer.models import composer_resnet - - model = composer_resnet(model_name='resnet18') # creates a torchvision resnet18 for image classification - """ - warnings.warn(DeprecationWarning('composer_resnet is deprecated and will be removed in v0.18')) - - valid_model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] - if model_name not in valid_model_names: - raise ValueError(f'model_name must be one of {valid_model_names} instead of {model_name}.') - - if loss_name not in loss_registry.keys(): - raise ValueError(f'Unrecognized loss function: {loss_name}. Please ensure the ' - 'specified loss function is present in composer.loss.loss.py') - - if loss_name == 'binary_cross_entropy_with_logits' and (initializers is None or - Initializer.LINEAR_LOG_CONSTANT_BIAS not in initializers): - log.warning('UserWarning: Using `binary_cross_entropy_loss_with_logits` ' - 'without using `initializers.linear_log_constant_bias` can degrade ' - 'performance. ' - 'Please ensure you are using `initializers. ' - 'linear_log_constant_bias`.') - - if initializers is None: - initializers = [] - - # Instantiate model - model_fn = getattr(resnet, model_name) - model = model_fn(weights=weights, num_classes=num_classes, groups=groups, width_per_group=width_per_group) - - # Grab loss function from loss registry - loss_fn = loss_registry[loss_name] - - # Create metrics for train and validation - train_metrics = MulticlassAccuracy(num_classes=num_classes, average='micro') - val_metrics = MetricCollection([CrossEntropy(), MulticlassAccuracy(num_classes=num_classes, average='micro')]) - - # Apply Initializers to model - for initializer in initializers: - initializer = Initializer(initializer) - model.apply(initializer.get_initializer()) - - composer_model = ComposerClassifier(model, train_metrics=train_metrics, val_metrics=val_metrics, loss_fn=loss_fn) - return composer_model diff --git a/composer/models/resnet_cifar/README.md b/composer/models/resnet_cifar/README.md deleted file mode 100644 index 5a32ae03b8..0000000000 --- a/composer/models/resnet_cifar/README.md +++ /dev/null @@ -1,73 +0,0 @@ -# CIFAR ResNet -[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`Vision` / `Image Classification` - -The ResNet model family is a set of convolutional neural networks that can be used as the basis for a variety of vision tasks. CIFAR ResNet models are a subset of this family designed specifically for the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) and [CIFAR-100](https://www.cs.toronto.edu/~kriz/cifar.html) datasets. - -## Example - -```python -from composer.models import composer_resnet_cifar - -model = composer_resnet_cifar(model_name='resnet_56', num_classes=10) -``` - -## Architecture - -Residual Networks are feedforward convolutional networks with “residual” connections between non-consecutive layers. - -The model architecture is defined by the original paper: - -- The network inputs are of dimension 32×32x3. -- The first layer is 3×3 convolutions -- The subsequent layers are a stack of 6n layers with 3×3 convolutions on the feature maps of sizes {32,16,8}, with 2n layers for each feature map size. 
The number of filters are {16,32,64} for the respective feature map sizes. Subsampling is performed by convolutions with a stride of 2 -- The network ends with a global average pooling, a linear layer with the output dimension equal to the number of classes, and softmax function. - -There are a total 6n+2 stacked weighted layers. Each family member is specified by the number of layers, for example n=9 corresponds to ResNet56 - -The biggest differences between CIFAR ResNet models and ImageNet ResNet models are: - -- CIFAR ResNet models use fewer filters for each convolution. -- The ImageNet ResNets contain four stages, while the CIFAR ResNets contain three stages. In addition, CIFAR ResNets uniformly distribute blocks across each stage while ImageNet ResNets have a specific number of blocks for each stage. - -## Family Members - -| Model Family Members | Parameter Count | Our Accuracy | Training Time on 1x3080 | -|----------------------|-----------------|--------------|-------------------------| -| ResNet20 | 0.27M | TBA | TBA | -| ResNet32 | 0.46M | TBA | TBA | -| ResNet44 | 0.66M | TBA | TBA | -| ResNet56 | 0.85M | 93.1% | 35 min | -| ResNet110 | 1.7M | TBA | TBA | -## Default Training Hyperparameters - -```yaml -optimizer: - sgd: - learning_rate: 1.2 - momentum: 0.9 - weight_decay: 1e-4 -schedulers: - - multistep_with_warmup: - t_warmup: "5ep" - milestones: - - "80ep" - - "120ep" - gamma: 0.1 -train_batch_size: 1024 -max_duration: 160ep -``` - -## Attribution - -Paper: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. - -Note that this paper set the standard for ResNet style architectures for both CIFAR-10/100 and ImageNet - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.resnet_cifar.model.composer_resnet_cifar - :noindex: -``` diff --git a/composer/models/resnet_cifar/__init__.py b/composer/models/resnet_cifar/__init__.py deleted file mode 100644 index 2ea6ac226c..0000000000 --- a/composer/models/resnet_cifar/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A ResNet model family adapted for CIFAR10 image sizes. - -See the :doc:`Model Card ` for more details. 
-""" - -from composer.models.resnet_cifar.model import composer_resnet_cifar as composer_resnet_cifar - -__all__ = ['composer_resnet_cifar'] -_metadata = { - 'resnet9': { - '_task': 'Image Classification', - '_dataset': 'CIFAR10', - '_name': 'ResNet9', - '_quality': 'tbd', - '_metric': 'Top-1 Accuracy', - '_ttt': 'tbd', - '_hparams': 'resnet9_cifar10.yaml' - }, - 'resnet20': { - '_task': 'Image Classification', - '_dataset': 'CIFAR10', - '_name': 'ResNet20', - '_quality': 'tbd', - '_metric': 'Top-1 Accuracy', - '_ttt': 'tbd', - '_hparams': 'resnet20_cifar10.yaml' - }, - 'resnet56': { - '_task': 'Image Classification', - '_dataset': 'CIFAR10', - '_name': 'ResNet56', - '_quality': '93.1', - '_metric': 'Top-1 Accuracy', - '_ttt': '35m', - '_hparams': 'resnet56_cifar10.yaml' - } -} diff --git a/composer/models/resnet_cifar/model.py b/composer/models/resnet_cifar/model.py deleted file mode 100644 index 5bb8660b56..0000000000 --- a/composer/models/resnet_cifar/model.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""ResNet models for CIFAR extending :class:`.ComposerClassifier`.""" - -import warnings -from typing import List, Optional - -from composer.models.initializers import Initializer -from composer.models.resnet_cifar.resnets import ResNet9, ResNetCIFAR -from composer.models.tasks import ComposerClassifier - -__all__ = ['composer_resnet_cifar'] - - -def composer_resnet_cifar(model_name: str, - num_classes: int = 10, - initializers: Optional[List[Initializer]] = None) -> ComposerClassifier: - """Helper function to create a :class:`.ComposerClassifier` with a CIFAR ResNet models. - - From `Deep Residual Learning for Image Recognition `_ (He et al, 2015). - ResNet9 is based on the model from myrtle.ai `blog`_. - - Args: - model_name (str): ``"resnet_9"``, ``"resnet_20"``, or ``"resnet_56"``. - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``10``. - initializers (List[Initializer], optional): Initializers for the model. ``None`` for no initialization. - Default: ``None``. - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a CIFAR ResNet model. - - Example: - - .. testcode:: - - from composer.models import composer_resnet_cifar - - model = composer_resnet_cifar(model_name="resnet_56") # creates a resnet56 for cifar image classification - - .. _blog: https://myrtle.ai/learn/how-to-train-your-resnet-4-architecture/ - """ - warnings.warn(DeprecationWarning('composer_resnet_cifar is deprecated and will be removed in v0.18')) - if initializers is None: - initializers = [] - - if model_name == 'resnet_9': - model = ResNet9(num_classes) # current initializers don't work with this architecture. - else: - model = ResNetCIFAR.get_model_from_name(model_name, initializers, num_classes) - - composer_model = ComposerClassifier(module=model, num_classes=num_classes) - return composer_model diff --git a/composer/models/resnet_cifar/resnets.py b/composer/models/resnet_cifar/resnets.py deleted file mode 100644 index b4f1576b46..0000000000 --- a/composer/models/resnet_cifar/resnets.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The CIFAR ResNet torch module. - -See the :doc:`Model Card ` for more details. 
-""" - -# Code below adapted from https://github.com/facebookresearch/open_lth -# and https://github.com/pytorch/vision - -from typing import List, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torchvision.models.resnet import BasicBlock - -from composer.models import Initializer - -__all__ = ['ResNetCIFAR', 'ResNet9'] - - -class ResNetCIFAR(nn.Module): - """A residual neural network as originally designed for CIFAR-10.""" - - class Block(nn.Module): - """A ResNet block.""" - - def __init__(self, f_in: int, f_out: int, downsample: bool = False): - super(ResNetCIFAR.Block, self).__init__() - - stride = 2 if downsample else 1 - self.conv1 = nn.Conv2d(f_in, f_out, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(f_out) - self.conv2 = nn.Conv2d(f_out, f_out, kernel_size=3, stride=1, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(f_out) - self.relu = nn.ReLU(inplace=True) - - # No parameters for shortcut connections. - if downsample or f_in != f_out: - self.shortcut = nn.Sequential( - nn.Conv2d(f_in, f_out, kernel_size=1, stride=2, bias=False), - nn.BatchNorm2d(f_out), - ) - else: - self.shortcut = nn.Sequential() - - def forward(self, x: torch.Tensor): - out = self.relu(self.bn1(self.conv1(x))) - out = self.bn2(self.conv2(out)) - out += self.shortcut(x) - return self.relu(out) - - def __init__(self, plan: List[Tuple[int, int]], initializers: List[Initializer], outputs: int = 10): - super(ResNetCIFAR, self).__init__() - outputs = outputs or 10 - - self.num_classes = outputs - - # Initial convolution. - current_filters = plan[0][0] - self.conv = nn.Conv2d(3, current_filters, kernel_size=3, stride=1, padding=1, bias=False) - self.bn = nn.BatchNorm2d(current_filters) - self.relu = nn.ReLU(inplace=True) - - # The subsequent blocks of the ResNet. - blocks = [] - for segment_index, (filters, num_blocks) in enumerate(plan): - for block_index in range(num_blocks): - downsample = segment_index > 0 and block_index == 0 - blocks.append(ResNetCIFAR.Block(current_filters, filters, downsample)) - current_filters = filters - - self.blocks = nn.Sequential(*blocks) - - # Final fc layer. Size = number of filters in last segment. - self.fc = nn.Linear(plan[-1][0], outputs) - self.criterion = nn.CrossEntropyLoss() - - for initializer in initializers: - initializer = Initializer(initializer) - self.apply(initializer.get_initializer()) - - def forward(self, x: torch.Tensor): - out = self.relu(self.bn(self.conv(x))) - out = self.blocks(out) - out = F.avg_pool2d(out, out.size()[3]) - out = out.view(out.size(0), -1) - out = self.fc(out) - return out - - @staticmethod - def is_valid_model_name(model_name: str): - valid_model_names = [f'resnet_{layers}' for layers in (20, 56)] - return (model_name in valid_model_names) - - @staticmethod - def get_model_from_name(model_name: str, initializers: List[Initializer], outputs: int = 10): - """The naming scheme for a ResNet is ``'resnet_D[_W]'``. - - D is the model depth (e.g. 
``'resnet_56'``) - """ - - if not ResNetCIFAR.is_valid_model_name(model_name): - raise ValueError('Invalid model name: {}'.format(model_name)) - - depth = int(model_name.split('_')[-1]) # for resnet56, depth 56, width 16 - if len(model_name.split('_')) == 2: - width = 16 - else: - width = int(model_name.split('_')[3]) - - if (depth - 2) % 3 != 0: - raise ValueError('Invalid ResNetCIFAR depth: {}'.format(depth)) - num_blocks = (depth - 2) // 6 - - model_arch = { - 56: [(width, num_blocks), (2 * width, num_blocks), (4 * width, num_blocks)], - 20: [(width, num_blocks), (2 * width, num_blocks), (4 * width, num_blocks)], - } - - return ResNetCIFAR(model_arch[depth], initializers, outputs) - - -# adapted from https://raw.githubusercontent.com/matthias-wright/cifar10-resnet/master/model.py -# under the MIT license -class ResNet9(nn.Module): - """A 9-layer residual network, excluding BatchNorms and activation functions. - - Based on the myrtle.ai `blog`_ and Deep Residual Learning for Image Recognition (`He et al, 2015`_). - - Args: - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``10``. - - .. _blog: https://myrtle.ai/learn/how-to-train-your-resnet-4-architecture/ - .. _He et al, 2015: https://arxiv.org/abs/1512.03385 - """ - - def __init__(self, num_classes: int = 10): - super().__init__() - - self.body = nn.Sequential( - nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(num_features=64, momentum=0.9), - nn.ReLU(inplace=True), - nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(num_features=128, momentum=0.9), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, stride=2), - BasicBlock(inplanes=128, planes=128, stride=1), - nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(num_features=256, momentum=0.9), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, stride=2), - nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(num_features=256, momentum=0.9), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, stride=2), - BasicBlock(inplanes=256, planes=256, stride=1), - ) - - self.fc = nn.Linear(in_features=256, out_features=num_classes, bias=True) - - def forward(self, x): - out = self.body(x) - out = F.avg_pool2d(out, out.size()[3]) - out = out.view(out.size(0), -1) - out = self.fc(out) - return out diff --git a/composer/models/timm/__init__.py b/composer/models/timm/__init__.py deleted file mode 100644 index b7960b426a..0000000000 --- a/composer/models/timm/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A wrapper around `timm.create_model() `_ -used to create :class:`.ComposerClassifier`.""" - -from composer.models.timm.model import composer_timm as composer_timm - -__all__ = ['composer_timm'] diff --git a/composer/models/timm/model.py b/composer/models/timm/model.py deleted file mode 100644 index df0ffbca91..0000000000 --- a/composer/models/timm/model.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A wrapper around `timm.create_model() `_ -used to create :class:`.ComposerClassifier`.""" - -import warnings -from typing import Optional - -from composer.models.tasks import ComposerClassifier -from composer.utils.import_helpers import 
MissingConditionalImportError - -__all__ = ['composer_timm'] - - -def composer_timm(model_name: str, - pretrained: bool = False, - num_classes: int = 1000, - drop_rate: float = 0.0, - drop_path_rate: Optional[float] = None, - drop_block_rate: Optional[float] = None, - global_pool: Optional[str] = None, - bn_momentum: Optional[float] = None, - bn_eps: Optional[float] = None) -> ComposerClassifier: - """A wrapper around `timm.create_model() `_ used to create :class:`.ComposerClassifier`. - - Args: - model_name (str): timm model name e.g: ``"resnet50"``. List of models can be found at - `PyTorch Image Models `_. - pretrained (bool, optional): Imagenet pretrained. Default: ``False``. - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``. - drop_rate (float, optional): Dropout rate. Default: ``0.0``. - drop_path_rate (float, optional): Drop path rate (model default if ``None``). Default: ``None``. - drop_block_rate (float, optional): Drop block rate (model default if ``None``). Default: ``None``. - global_pool (str, optional): Global pool type, one of (``"fast"``, ``"avg"``, ``"max"``, ``"avgmax"``, ``"avgmaxc"``). Model default if ``None``. Default: ``None``. - bn_momentum (float, optional): BatchNorm momentum override (model default if ``None``). Default: ``None``. - bn_eps (float, optional): BatchNorm epsilon override (model default if ``None``). Default: ``None``. - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with the specified TIMM model. - - Resnet18 Example: - - .. testcode:: - - from composer.models import composer_timm - - model = composer_timm(model_name='resnet18') # creates a timm resnet18 - """ - warnings.warn(DeprecationWarning('composer_timm is deprecated and will be removed in v0.18')) - try: - import timm - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='timm', conda_package='timm>=0.5.4', - conda_channel=None) from e - model = timm.create_model( # type: ignore (third-party) - model_name=model_name, - pretrained=pretrained, - num_classes=num_classes, - drop_rate=drop_rate, - drop_path_rate=drop_path_rate, - drop_block_rate=drop_block_rate, - global_pool=global_pool, - bn_momentum=bn_momentum, - bn_eps=bn_eps) - - composer_model = ComposerClassifier(module=model) - return composer_model diff --git a/composer/models/unet/README.md b/composer/models/unet/README.md deleted file mode 100644 index 530832051b..0000000000 --- a/composer/models/unet/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# UNet -[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference) - -`Vision` / `Segmentation` - -Unet is an architecture used for image segmentation. - -## Example - - - -```python -from composer.models import UNet - -model = UNet() -``` - -## Architecture - -The figure below ([source](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet)) shows a 3D version of the UNet architecture. Quoting the [Nvidia Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet), "U-Net is composed of a contractive and an expanding path, that aims at building a bottleneck in its centremost part through a combination of convolution, instance norm and leaky relu operations. After this bottleneck, the image is reconstructed through a combination of convolutions and upsampling. 
Skip connections are added with the goal of helping the backward flow of gradients in order to improve training." - -![unet3d.png](https://storage.googleapis.com/docs.mosaicml.com/images/models/unet3d.png) - - -There are 3 main differences between our implementation and the original NVDA DALI implementation. - -The first two refer to removing the NVDA DALI pipeline and replacing all transforms with torch implementations. We are omitting the Zoom transform and use a kernel size of 3 for the Gaussian Blur transform. - -While NVDA DLE examples reports the training accuracy using an average of 5 folds, we are using only 1 fold in the interest of faster iteration time, so all of our results are reported using fold 0 and 200 epochs. - - -## Default Training Hyperparameters - -Below are the hyperparameters we used to train UNet on the [BraTS](http://braintumorsegmentation.org) image segmentation dataset. - -```yaml -optimizer: - radam: - lr: 0.001 - betas: [0.9, 0.999] - eps: 0.00000001 - weight_decay: 0.0001 -schedulers: - - constant: {} -train_batch_size: 64 -max_duration: 200ep -``` - - -## Attribution - -The UNet model has been introduced in "U-Net: Convolutional Networks for Biomedical Image Segmentation" by Olaf Ronneberger, Philipp Fischer, Thomas Brox in [https://arxiv.org/abs/1505.04597](https://arxiv.org/abs/1505.04597). - -We are using the NVDA DLE examples version in -[https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet). - -## API Reference - -```{eval-rst} -.. autoclass:: composer.models.unet.UNet - :noindex: -``` diff --git a/composer/models/unet/__init__.py b/composer/models/unet/__init__.py deleted file mode 100644 index 6f26bd4625..0000000000 --- a/composer/models/unet/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The Unet architecture used in image segmentation. The example we are using is for BRATS medical brain tumor dataset. - -See the :doc:`Model Card ` for more details. 
-""" - -from composer.models.unet.unet import UNet as UNet - -__all__ = ['UNet'] - -_task = 'Image Segmentation' -_dataset = 'BRATS' -_name = 'UNet' -_quality = '69.1' -_metric = 'Dice' -_ttt = '21m' -_hparams = 'unet.yaml' diff --git a/composer/models/unet/_layers.py b/composer/models/unet/_layers.py deleted file mode 100644 index 6fae767bf5..0000000000 --- a/composer/models/unet/_layers.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -## Code adapted from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Segmentation/nnUNet/ - -import numpy as np -import torch -import torch.nn as nn - -normalizations = { - 'instancenorm3d': nn.InstanceNorm3d, - 'instancenorm2d': nn.InstanceNorm2d, - 'batchnorm3d': nn.BatchNorm3d, - 'batchnorm2d': nn.BatchNorm2d, -} - -convolutions = { - 'Conv2d': nn.Conv2d, - 'Conv3d': nn.Conv3d, - 'ConvTranspose2d': nn.ConvTranspose2d, - 'ConvTranspose3d': nn.ConvTranspose3d, -} - - -def get_norm(name, out_channels): - if 'groupnorm' in name: - return nn.GroupNorm(32, out_channels, affine=True) - return normalizations[name](out_channels, affine=True) - - -def get_conv(in_channels, out_channels, kernel_size, stride, dim, bias=False): - conv = convolutions[f'Conv{dim}d'] - padding = get_padding(kernel_size, stride) - return conv(in_channels, out_channels, kernel_size, stride, padding, bias=bias) - - -def get_transp_conv(in_channels, out_channels, kernel_size, stride, dim): - conv = convolutions[f'ConvTranspose{dim}d'] - padding = get_padding(kernel_size, stride) - output_padding = get_output_padding(kernel_size, stride, padding) - return conv(in_channels, out_channels, kernel_size, stride, padding, output_padding, bias=True) - - -def get_padding(kernel_size, stride): - #kernel_size_np = np.cast(np.ndarray, np.atleast_1d(kernel_size)) - #stride_np = np.cast(np.ndarray, np.atleast_1d(stride)) - kernel_size_np = np.atleast_1d(kernel_size) - stride_np = np.atleast_1d(stride) - padding_np = (kernel_size_np - stride_np + 1) / 2 # type: ignore - padding = tuple(int(p) for p in padding_np) # type: ignore - return padding if len(padding) > 1 else padding[0] - - -def get_output_padding(kernel_size, stride, padding): - kernel_size_np = np.atleast_1d(kernel_size) - stride_np = np.atleast_1d(stride) - padding_np = np.atleast_1d(padding) - out_padding_np = 2 * padding_np + stride_np - kernel_size_np - out_padding = tuple(int(p) for p in out_padding_np) - return out_padding if len(out_padding) > 1 else out_padding[0] - - -class ConvLayer(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs): - super(ConvLayer, self).__init__() - self.conv = get_conv(in_channels, out_channels, kernel_size, stride, kwargs['dim']) - self.norm = get_norm(kwargs['norm'], out_channels) - self.lrelu = nn.LeakyReLU(negative_slope=kwargs['negative_slope'], inplace=True) - - def forward(self, data): - out = self.conv(data) - out = self.norm(out) - out = self.lrelu(out) - return out - - -class ConvBlock(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs): - super(ConvBlock, self).__init__() - self.conv1 = ConvLayer(in_channels, out_channels, kernel_size, stride, **kwargs) - self.conv2 = ConvLayer(out_channels, out_channels, kernel_size, 1, **kwargs) - - def forward(self, input_data): - out = self.conv1(input_data) - out = self.conv2(out) - return out - - -class ResidBlock(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, 
stride, **kwargs): - super(ResidBlock, self).__init__() - self.conv1 = ConvLayer(in_channels, out_channels, kernel_size, stride, **kwargs) - self.conv2 = get_conv(out_channels, out_channels, kernel_size, 1, kwargs['dim']) - self.norm = get_norm(kwargs['norm'], out_channels) - self.lrelu = nn.LeakyReLU(negative_slope=kwargs['negative_slope'], inplace=True) - self.downsample = None - if max(stride) > 1 or in_channels != out_channels: # type: ignore - self.downsample = get_conv(in_channels, out_channels, kernel_size, stride, kwargs['dim']) - self.norm_res = get_norm(kwargs['norm'], out_channels) - - def forward(self, input_data): - residual = input_data - out = self.conv1(input_data) - out = self.conv2(out) - out = self.norm(out) - if self.downsample is not None: - residual = self.downsample(residual) - residual = self.norm_res(residual) - out = self.lrelu(out + residual) - return out - - -class UpsampleBlock(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs): - super(UpsampleBlock, self).__init__() - self.transp_conv = get_transp_conv(in_channels, out_channels, stride, stride, kwargs['dim']) - self.conv_block = ConvBlock(2 * out_channels, out_channels, kernel_size, 1, **kwargs) - - def forward(self, input_data, skip_data): - out = self.transp_conv(input_data) - out = torch.cat((out, skip_data), dim=1) - out = self.conv_block(out) - return out - - -class OutputBlock(nn.Module): - - def __init__(self, in_channels, out_channels, dim): - super(OutputBlock, self).__init__() - self.conv = get_conv(in_channels, out_channels, kernel_size=1, stride=1, dim=dim, bias=True) - nn.init.constant_(self.conv.bias, 0) - - def forward(self, input_data): - return self.conv(input_data) diff --git a/composer/models/unet/model.py b/composer/models/unet/model.py deleted file mode 100644 index 08c49ff57c..0000000000 --- a/composer/models/unet/model.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""The Unet architecture used in image segmentation. The example we are using is for BRATS medical brain tumor dataset. - -See the :doc:`Model Card ` for more details. -""" - -import warnings - -import torch.nn as nn - -from composer.models.unet._layers import ConvBlock, OutputBlock, ResidBlock, UpsampleBlock - -__all__ = ['UNet'] - - -class UNet(nn.Module): - """Unet Architecture adapted from NVidia `Deep Learning Examples`_. - - .. _Deep Learning Examples: https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Segmentation/nnUNet/ - - Args: - in_channels (int): Number of input channels. - n_class (int): Number of output layers. - kernels (list): Conv layer kernel sizes. - strides (list): Conv layer strides. - normalization_layer (str): Normalization layer type, one of (``"batch"``, ``"instance"``). - negative_slope (float): Leaky relu negative slope. - residual (bool): Use residual connections. - dimension (int): Filter dimensions. 
- """ - - def __init__( - self, - in_channels, - n_class, - kernels, - strides, - normalization_layer, - negative_slope, - residual, - dimension, - ): - warnings.warn(DeprecationWarning('UNet is deprecated and will be removed in v0.18')) - super(UNet, self).__init__() - self.dim = dimension - self.n_class = n_class - self.residual = residual - self.negative_slope = negative_slope - self.norm = normalization_layer + f'norm{dimension}d' - self.filters = [min(2**(5 + i), 320 if dimension == 3 else 512) for i in range(len(strides))] - - down_block = ResidBlock if self.residual else ConvBlock - self.input_block = self.get_conv_block( - conv_block=down_block, - in_channels=in_channels, - out_channels=self.filters[0], - kernel_size=kernels[0], - stride=strides[0], - ) - self.downsamples = self.get_module_list( - conv_block=down_block, - in_channels=self.filters[:-1], - out_channels=self.filters[1:], - kernels=kernels[1:-1], - strides=strides[1:-1], - ) - self.bottleneck = self.get_conv_block( - conv_block=down_block, - in_channels=self.filters[-2], - out_channels=self.filters[-1], - kernel_size=kernels[-1], - stride=strides[-1], - ) - self.upsamples = self.get_module_list( - conv_block=UpsampleBlock, - in_channels=self.filters[1:][::-1], - out_channels=self.filters[:-1][::-1], - kernels=kernels[1:][::-1], - strides=strides[1:][::-1], - ) - self.output_block = self.get_output_block(decoder_level=0) - self.apply(self.initialize_weights) - self.n_layers = len(self.upsamples) - 1 - - def forward(self, input_data): - out = self.input_block(input_data) - encoder_outputs = [out] - for downsample in self.downsamples: - out = downsample(out) - encoder_outputs.append(out) - out = self.bottleneck(out) - for idx, upsample in enumerate(self.upsamples): - out = upsample(out, encoder_outputs[self.n_layers - idx]) - out = self.output_block(out) - return out - - def get_conv_block(self, conv_block, in_channels, out_channels, kernel_size, stride): - return conv_block( - dim=self.dim, - stride=stride, - norm=self.norm, - kernel_size=kernel_size, - in_channels=in_channels, - out_channels=out_channels, - negative_slope=self.negative_slope, - ) - - def get_output_block(self, decoder_level): - return OutputBlock(in_channels=self.filters[decoder_level], out_channels=self.n_class, dim=self.dim) - - def get_module_list(self, in_channels, out_channels, kernels, strides, conv_block): - layers = [] - for in_channel, out_channel, kernel, stride in zip(in_channels, out_channels, kernels, strides): - conv_layer = self.get_conv_block(conv_block, in_channel, out_channel, kernel, stride) - layers.append(conv_layer) - return nn.ModuleList(layers) - - def initialize_weights(self, module): - name = module.__class__.__name__.lower() - if name in ['conv2d']: - nn.init.kaiming_normal_(module.weight, a=self.negative_slope) diff --git a/composer/models/unet/unet.py b/composer/models/unet/unet.py deleted file mode 100644 index dde555bb4f..0000000000 --- a/composer/models/unet/unet.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""A U-Net model extending :class:`.ComposerModel`.""" - -import logging -import warnings -from typing import Any, Dict, Optional, Sequence, Union - -import torch -import torch.nn as nn -from torchmetrics import Metric - -from composer.metrics.metrics import Dice -from composer.models.base import ComposerModel -from composer.models.unet.model import UNet as UNetModel -from composer.utils.import_helpers import MissingConditionalImportError - 
-log = logging.getLogger(__name__) - -__all__ = ['UNet'] - - -class UNet(ComposerModel): - """A U-Net model extending :class:`.ComposerModel`. - - See U-Net: Convolutional Networks for Biomedical Image Segmentation (`Ronneberger et al, 2015`_) - on the U-Net architecture. - - Args: - num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``3``. - - .. _Ronneberger et al, 2015: https://arxiv.org/abs/1505.04597 - """ - - def __init__(self, num_classes: int = 3) -> None: - warnings.warn(DeprecationWarning('UNet is deprecated and will be removed in v0.18')) - - super().__init__() - try: - from monai.losses import DiceLoss - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='unet', - conda_package='monai', - conda_channel='conda-forge') from e - - self.module = self.build_nnunet() - - self.dice = Dice(num_classes=num_classes) - self.dloss = DiceLoss(include_background=False, softmax=True, to_onehot_y=True, batch=True) - self.closs = nn.CrossEntropyLoss() - - def loss(self, outputs: Any, batch: Any, *args, **kwargs) -> Union[torch.Tensor, Sequence[torch.Tensor]]: - _, y = batch - y = y.squeeze(1) # type: ignore - loss = self.dloss(outputs, y) - loss += self.closs(outputs, y[:, 0].long()) - return loss - - @staticmethod - def metric_mean(name, outputs): - return torch.stack([out[name] for out in outputs]).mean(dim=0) - - def get_metrics(self, is_train: bool = False) -> Dict[str, Metric]: - return {'Dice': self.dice} - - def forward(self, batch: Any) -> torch.Tensor: - x, _ = batch - x = x.squeeze(1) # type: ignore - logits = self.module(x) - return logits - - def inference2d(self, image): - """Runs inference on a 3D image, by passing each depth slice through the model.""" - batch_modulo = image.shape[2] % 64 - if batch_modulo != 0: - batch_pad = 64 - batch_modulo - image = nn.ConstantPad3d((0, 0, 0, 0, batch_pad, 0), 0)(image) - - image = torch.transpose(image.squeeze(0), 0, 1) - preds_shape = (image.shape[0], 4, *image.shape[2:]) - preds = torch.zeros(preds_shape, dtype=image.dtype, device=image.device) - for start in range(0, image.shape[0] - 64 + 1, 64): - end = start + 64 - with torch.no_grad(): - pred = self.module(image[start:end]) - preds[start:end] = pred.data - if batch_modulo != 0: - preds = preds[batch_pad:] # type: ignore - return torch.transpose(preds, 0, 1).unsqueeze(0) - - def eval_forward(self, batch: Any, outputs: Optional[Any] = None): - assert self.training is False, 'For validation, model must be in eval mode' - image, _ = batch - pred = self.inference2d(image) - return pred - - def build_nnunet(self) -> torch.nn.Module: - kernels = [[3, 3]] * 6 - strides = [[1, 1]] + [[2, 2]] * 5 - model = UNetModel(in_channels=4, - n_class=4, - kernels=kernels, - strides=strides, - dimension=2, - residual=True, - normalization_layer='batch', - negative_slope=0.01) - - return model diff --git a/composer/models/vit_small_patch16/__init__.py b/composer/models/vit_small_patch16/__init__.py deleted file mode 100644 index 9992807ade..0000000000 --- a/composer/models/vit_small_patch16/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""ViT Small Patch 16 for image classification.""" - -from composer.models.vit_small_patch16.model import vit_small_patch16 as vit_small_patch16 - -__all__ = ['vit_small_patch16'] - -_task = 'Image Classification' -_dataset = 'ImageNet' -_name = 'ViT-Small-Patch16' -_quality = '74.52' -_metric = 'Top-1 Accuracy' -_ttt = '1d 
59m' -_hparams = 'vit_small_patch16.yaml' diff --git a/composer/models/vit_small_patch16/model.py b/composer/models/vit_small_patch16/model.py deleted file mode 100644 index dacb9db56a..0000000000 --- a/composer/models/vit_small_patch16/model.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""Implements ViT-S/16 as a :class:`.ComposerClassifier`.""" - -import warnings - -from composer.models.tasks import ComposerClassifier - -__all__ = ['vit_small_patch16'] - - -def vit_small_patch16(num_classes: int = 1000, - image_size: int = 224, - channels: int = 3, - dropout: float = 0.0, - embedding_dropout: float = 0.0): - """Helper function to create a :class:`.ComposerClassifier` using a ViT-S/16 model. - - See `Training data-efficient image transformers & distillation through attention `_ - (Touvron et al, 2021) for details on ViT-S/16. - - Args: - num_classes (int, optional): number of classes for the model. Default: ``1000``. - image_size (int, optional): input image size. If you have rectangular images, make sure your image - size is the maximum of the width and height. Default: ``224``. - channels (int, optional): number of image channels. Default: ``3``. - dropout (float, optional): 0.0 - 1.0 dropout rate. Default: ``0``. - embedding_dropout (float, optional): 0.0 - 1.0 embedding dropout rate. Default: ``0``. - - Returns: - ComposerModel: instance of :class:`.ComposerClassifier` with a ViT-S/16 model. - """ - warnings.warn(DeprecationWarning('vit_small_patch16 is deprecated and will be removed in v0.18')) - - from vit_pytorch import ViT - model = ViT( - image_size=image_size, - channels=channels, - num_classes=num_classes, - dim=384, # embed dim/width - patch_size=16, - depth=12, # layers - heads=6, - mlp_dim=1536, - dropout=dropout, - emb_dropout=embedding_dropout) - - composer_model = ComposerClassifier(module=model) - return composer_model diff --git a/composer/utils/collect_env.py b/composer/utils/collect_env.py index 2926c54a6f..02e74af8f9 100644 --- a/composer/utils/collect_env.py +++ b/composer/utils/collect_env.py @@ -378,7 +378,6 @@ def print_env(file: Optional[TextIO] = None) -> None: [pip3] torch-optimizer==0.1.0 [pip3] torchmetrics==0.7.3 [pip3] torchvision==0.10.1+cu111 - [pip3] vit-pytorch==0.27.0 [conda] Could not collect diff --git a/docs/source/composer_model.rst b/docs/source/composer_model.rst index 3f4c32dab8..bd80be1d10 100644 --- a/docs/source/composer_model.rst +++ b/docs/source/composer_model.rst @@ -75,8 +75,6 @@ We also provide several common classes for various tasks, specifically: - :class:`.ComposerClassifier` - classification tasks with a cross entropy loss and accuracy metric. -- :func:`.composer_timm` - creates classification models from the popular `TIMM`_ - library. - :class:`.HuggingFaceModel` - :class:`.ComposerModel` wrapper for a 🤗 `Transformers`_ model. .. note:: @@ -195,18 +193,6 @@ Integrations ------------ - -TIMM -~~~~ - -Integrate with your favorite `TIMM`_ models with our :func:`.composer_timm` function. - -.. code:: python - - from composer.models import composer_timm - - timm_model = composer_timm(model_name='resnet50', pretrained=True) - BERT Example with 🤗 Transformers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -265,5 +251,4 @@ and make it compatible with our trainer. .. |loss| replace:: :meth:`~.ComposerModel.loss` .. _MMDetection: https://mmdetection.readthedocs.io/en/latest/ .. _Transformers: https://huggingface.co/docs/transformers/index -.. _TIMM: https://timm.fast.ai/ .. 
_torchvision: https://pytorch.org/vision/stable/models.html diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 2b640283b3..89d068efe2 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -48,7 +48,6 @@ from composer.core import Timestamp as Timestamp from composer.core import TimeUnit as TimeUnit from composer.core import types as types -from composer.datasets.synthetic import SyntheticBatchPairDataset from composer.devices import DeviceCPU from composer.loggers import InMemoryLogger as InMemoryLogger from composer.loggers import Logger as Logger @@ -87,7 +86,7 @@ sys.path.insert(0, _repo_root) from tests.common import SimpleModel -from tests.common.datasets import RandomTextClassificationDataset +from tests.common.datasets import RandomClassificationDataset, RandomTextClassificationDataset # Disable mosaicml logger os.environ['MOSAICML_PLATFORM'] = 'False' @@ -112,11 +111,10 @@ scheduler = CosineAnnealingLR(optimizer, T_max=1) -dataset = SyntheticBatchPairDataset( - total_dataset_size=100, - data_shape=data_shape, +dataset = RandomClassificationDataset( + shape=data_shape, + size=100, num_classes=num_classes, - num_unique_samples_to_create=10, ) train_dataset = dataset diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index d55745608f..b2cebc5281 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -20,8 +20,6 @@ the following installation targets are available: and building documentation. * ``pip install 'mosaicml[deepspeed]'``: Installs Composer with support for :mod:`deepspeed`. * ``pip install 'mosaicml[nlp]'``: Installs Composer with support for NLP models and algorithms. -* ``pip install 'mosaicml[unet]'``: Installs Composer with support for :doc:`Unet `. -* ``pip install 'mosaicml[timm]'``: Installs Composer with support for :mod:`timm`. * ``pip install 'mosaicml[wandb]'``: Installs Composer with support for :mod:`wandb`. * ``pip install 'mosaicml[comet_ml]'``: Installs Composer with support for :mod:`comet_ml`. * ``pip install 'mosaicml[tensorboard]'``: Installs Composer with support for :mod:`tensorboard`. @@ -29,7 +27,6 @@ the following installation targets are available: * ``pip install 'mosaicml[mlflow]'``: Installs Composer with support for :mod:`mlflow`. * ``pip install 'mosaicml[oci]'``: Installs Composer with support for :mod:`oci`. * ``pip install 'mosaicml[onnx]'``: Installs Composer with support for :mod:`onnx`. -* ``pip install 'mosaicml[vit]'``: Installs Composer with support for :mod:`vit`. * ``pip install 'mosaicml[coco]'``: Installs Composer with support for :mod:`coco`. * ``pip install 'mosaicml[libcloud]'``: Installs Composer with support for :mod:`libcloud`. * ``pip install 'mosaicml[all]'``: Installs all optional dependencies. 
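
With the `timm` and `vit` install targets gone, a timm backbone has to be installed and wrapped by hand rather than created through a Composer helper. Below is a minimal sketch, assuming `timm` is installed separately and that the default classification loss and accuracy metrics of `ComposerClassifier` are acceptable; the `'resnet50'` model name and class count are illustrative placeholders.

```python
import timm

from composer.models.tasks import ComposerClassifier

# Build a timm backbone and hand it to ComposerClassifier, which supplies the
# default classification loss and accuracy metrics that the deprecated
# composer_timm() helper previously wired up automatically.
backbone = timm.create_model('resnet50', pretrained=False, num_classes=1000)
composer_model = ComposerClassifier(module=backbone, num_classes=1000)
```

The wrapped model can then be passed to `Trainer` like any other `ComposerModel`.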
diff --git a/docs/source/getting_started/quick_start.rst b/docs/source/getting_started/quick_start.rst index c3c7d6f7ed..f7613384ba 100644 --- a/docs/source/getting_started/quick_start.rst +++ b/docs/source/getting_started/quick_start.rst @@ -61,7 +61,7 @@ Besides easily running our built-in algorithms, Composer also features: * An interface to flexibly add algorithms to the training loop * An engine that manages the ordering of algorithms for composition * A trainer to handle boilerplate around numerics, distributed training, and others -* Integration with popular model libraries such as TIMM and HuggingFace Transformers +* Integration with popular model libraries such as HuggingFace Transformers Next steps ---------- diff --git a/docs/source/index.rst b/docs/source/index.rst index ce95ba6e1b..425dcad93c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -47,7 +47,6 @@ Composer is part of the broader Machine Learning community, and we welcome any c examples/getting_started.ipynb examples/functional_api.ipynb - examples/medical_image_segmentation.ipynb examples/custom_speedup_methods.ipynb examples/finetune_huggingface.ipynb examples/pretrain_finetune_huggingface.ipynb @@ -136,19 +135,6 @@ Composer is part of the broader Machine Learning community, and we welcome any c method_cards/swa.md method_cards/weight_standardization.md -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Model Library - - model_cards/BERT.md - model_cards/cifar_resnet.md - model_cards/deeplabv3.md - model_cards/efficientnet.md - model_cards/GPT2.md - model_cards/resnet.md - model_cards/unet.md - .. toctree:: :hidden: :caption: API Reference diff --git a/docs/source/method_cards/decoupled_weight_decay.md b/docs/source/method_cards/decoupled_weight_decay.md index 2d9f78f94f..71e0f4312f 100644 --- a/docs/source/method_cards/decoupled_weight_decay.md +++ b/docs/source/method_cards/decoupled_weight_decay.md @@ -16,9 +16,7 @@ L2 regularization is typically considered equivalent to weight decay, but this e - -```bash -# Single GPU/CPU depending on torch.cuda.is_available() -python train_resnet_imagenet1k.py /path/to/imagenet - -# Log experiments to Weights and Biases -python train_resnet_imagenet1k.py /path/to/imagenet --wandb_logger --wandb_entity my_username --wandb_project my_project --wandb_run_name my_run_name - -# Single/Multi GPU training (infers the number of GPUs available) -composer train_resnet_imagenet1k.py /path/to/imagenet - -# Manually specify number of GPUs to use: -composer -n $N_GPUS train_resnet_imagenet1k.py /path/to/imagenet - -# Mild ResNet recipe for fastest training to ~76.5% accuracy: -composer train_resnet_imagenet1k.py /path/to/imagenet --recipe_name mild --train_crop_size 176 --eval_crop_size 224 --max_duration 36ep --loss_name binary_cross_entropy - -# Medium ResNet recipe highest accuracy with similar training time as baseline: -composer train_resnet_imagenet1k.py /path/to/imagenet --recipe_name medium --train_crop_size 176 --eval_crop_size 224 --max_duration 135ep --loss_name binary_cross_entropy - -# Spicy ResNet recipe for our most accurate ResNet over a long training schedule: -composer train_resnet_imagenet1k.py /path/to/imagenet --recipe_name spicy --train_crop_size 176 --eval_crop_size 224 --max_duration 270ep --loss_name binary_cross_entropy -``` diff --git a/examples/imagenet/train_resnet_imagenet1k.py b/examples/imagenet/train_resnet_imagenet1k.py deleted file mode 100644 index d6f1dee008..0000000000 --- a/examples/imagenet/train_resnet_imagenet1k.py +++ /dev/null @@ 
-1,298 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""Example script to train a ResNet model on ImageNet.""" - -import argparse -import logging -import os - -import torch -from torch.utils.data import DataLoader -from torchmetrics import MetricCollection -from torchmetrics.classification import MulticlassAccuracy -from torchvision import transforms -from torchvision.datasets import ImageFolder -from torchvision.models import resnet - -from composer import DataSpec, Time, Trainer -from composer.algorithms import (EMA, SAM, BlurPool, ChannelsLast, ColOut, LabelSmoothing, MixUp, ProgressiveResizing, - RandAugment, StochasticDepth) -from composer.callbacks import CheckpointSaver, LRMonitor, SpeedMonitor -from composer.datasets.utils import NormalizationFn, pil_image_collate -from composer.loggers import WandBLogger -from composer.loss import binary_cross_entropy_with_logits, soft_cross_entropy -from composer.metrics import CrossEntropy -from composer.models.tasks import ComposerClassifier -from composer.optim import CosineAnnealingWithWarmupScheduler, DecoupledSGDW -from composer.utils import dist - -logging.basicConfig() -logging.getLogger().setLevel(logging.INFO) - -parser = argparse.ArgumentParser() - -# Dataloader arguments -parser.add_argument('data_dir', help='Path to the directory containing the ImageNet-1k dataset', type=str) -parser.add_argument('--train_crop_size', help='Training image crop size', type=int, default=224) -parser.add_argument('--eval_resize_size', help='Evaluation image resize size', type=int, default=256) -parser.add_argument('--eval_crop_size', help='Evaluation image crop size', type=int, default=224) -parser.add_argument('--train_batch_size', help='Train dataloader per-device batch size', type=int, default=2048) -parser.add_argument('--eval_batch_size', help='Validation dataloader per-device batch size', type=int, default=2048) - -# Model arguments -parser.add_argument('--model_name', - help='Name of the resnet model to train', - default='resnet50', - choices=['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']) -parser.add_argument('--loss_name', - help='Name of the loss function to use for training', - default='cross_entropy', - choices=['cross_entropy', 'binary_cross_entropy']) - -# Optimizer arguments -parser.add_argument('--learning_rate', help='Optimizer learning rate', type=float, default=2.048) -parser.add_argument('--momentum', help='Optimizer momentum', type=float, default=0.875) -parser.add_argument('--weight_decay', help='Optimizer weight decay', type=float, default=5.0e-4) - -# LR scheduler arguments -parser.add_argument('--t_warmup', - help='Duration of learning rate warmup specified as a Time string', - type=Time.from_timestring, - default='8ep') -parser.add_argument('--t_max', - help='Duration to cosine decay the learning rate specified as a Time string', - type=Time.from_timestring, - default='1dur') - -# Save checkpoint arguments -parser.add_argument('--save_checkpoint_dir', - help='Directory in which to save model checkpoints', - type=str, - default='checkpoints/{run_name}') -parser.add_argument('--checkpoint_interval', help='Frequency to save checkpoints', type=str, default='1ep') - -# Load checkpoint arguments, assumes resuming the previous training run instead of fine-tuning -parser.add_argument('--load_checkpoint_path', help='Path to the checkpoint to load', type=str) - -# Recipes -parser.add_argument('--recipe_name', - help='Either "mild", "medium" or "spicy" in order of increasing 
training time and accuracy', - type=str, - choices=['mild', 'medium', 'spicy']) - -# Logger parameters: progress bar logging is used by default -# Only has Weights and Biases option to reduce the number of arguments. Other loggers can be substituted in the script -parser.add_argument('--wandb_logger', help='Whether or not to log results to Weights and Biases', action='store_true') -parser.add_argument('--wandb_entity', help='WandB entity name', type=str) -parser.add_argument('--wandb_project', help='WandB project name', type=str) -parser.add_argument('--wandb_run_name', help='WandB run name', type=str) - -# Trainer arguments -parser.add_argument('--run_name', help='Name of the training run used for checkpointing and other logging', type=str) -parser.add_argument('--seed', help='Random seed', type=int, default=17) -parser.add_argument('--max_duration', - help='Duration to train specified as a Time string', - type=Time.from_timestring, - default='90ep') -parser.add_argument('--eval_interval', - help='How frequently to run evaluation on the validation set specified as a Time string', - type=Time.from_timestring, - default='1ep') - -args = parser.parse_args() - - -def _main(): - - # Divide batch sizes by number of devices if running multi-gpu training - if dist.get_world_size(): - args.train_batch_size = args.train_batch_size // dist.get_world_size() - args.eval_batch_size = args.eval_batch_size // dist.get_world_size() - - # Scale by 255 since the collate `pil_image_collate` results in images in range 0-255 - # If using ToTensor() and the default collate, remove the scaling by 255 - IMAGENET_CHANNEL_MEAN = (0.485 * 255, 0.456 * 255, 0.406 * 255) - IMAGENET_CHANNEL_STD = (0.229 * 255, 0.224 * 255, 0.225 * 255) - - # Train dataset - logging.info('Building train dataloader') - train_transforms = transforms.Compose([ - transforms.RandomResizedCrop(args.train_crop_size, scale=(0.08, 1.0), ratio=(0.75, 4.0 / 3.0)), - transforms.RandomHorizontalFlip(), - ]) - train_dataset = ImageFolder(os.path.join(args.data_dir, 'train'), train_transforms) - # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware setup - train_sampler = dist.get_sampler(train_dataset, drop_last=True, shuffle=True) - train_dataloader = DataLoader( - train_dataset, - batch_size=args.train_batch_size, - num_workers=8, - pin_memory=True, - drop_last=True, - sampler=train_sampler, - collate_fn=pil_image_collate, - persistent_workers=True, # Reduce overhead of creating new workers at the expense of using slightly more RAM - ) - # DataSpec allows for on-gpu transformations, marginally relieving dataloader bottleneck - train_dataspec = DataSpec(dataloader=train_dataloader, - device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, std=IMAGENET_CHANNEL_STD)) - logging.info('Built train dataloader\n') - - # Validation dataset - logging.info('Building evaluation dataloader') - eval_transforms = transforms.Compose([ - transforms.Resize(args.eval_resize_size), - transforms.CenterCrop(args.eval_crop_size), - ]) - eval_dataset = ImageFolder(os.path.join(args.data_dir, 'val'), eval_transforms) - # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware setup, - eval_sampler = dist.get_sampler(eval_dataset, drop_last=False, shuffle=False) - eval_dataloader = DataLoader( - eval_dataset, - batch_size=args.eval_batch_size, - num_workers=8, - pin_memory=True, - drop_last=False, - sampler=eval_sampler, - collate_fn=pil_image_collate, - persistent_workers=True, # Reduce overhead of creating new 
workers at the expense of using slightly more RAM - ) - eval_dataspec = DataSpec(dataloader=eval_dataloader, - device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, std=IMAGENET_CHANNEL_STD)) - logging.info('Built evaluation dataloader\n') - - # Instantiate torchvision ResNet model - logging.info('Building Composer model') - model_fn = getattr(resnet, args.model_name) - model = model_fn(num_classes=1000, groups=1, width_per_group=64) - - # Specify model initialization - def weight_init(w: torch.nn.Module): - if isinstance(w, torch.nn.Linear) or isinstance(w, torch.nn.Conv2d): - torch.nn.init.kaiming_normal_(w.weight) - if isinstance(w, torch.nn.BatchNorm2d): - w.weight.data = torch.rand(w.weight.data.shape) - w.bias.data = torch.zeros_like(w.bias.data) - # When using binary cross entropy, set the classification layer bias to -log(num_classes) - # to ensure the initial probabilities are approximately 1 / num_classes - if args.loss_name == 'binary_cross_entropy' and isinstance(w, torch.nn.Linear): - w.bias.data = torch.ones(w.bias.shape) * -torch.log(torch.tensor(w.bias.shape[0])) - - model.apply(weight_init) - - # Performance metrics to log other than training loss - train_metrics = MulticlassAccuracy(num_classes=1000, average='micro') - val_metrics = MetricCollection([CrossEntropy(), MulticlassAccuracy(num_classes=1000, average='micro')]) - - # Cross entropy loss that can handle both index and one-hot targets - - if args.loss_name == 'binary_cross_entropy': - loss_fn = binary_cross_entropy_with_logits - else: - loss_fn = soft_cross_entropy - - # Wrapper function to convert a classification PyTorch model into a Composer model - composer_model = ComposerClassifier(model, train_metrics=train_metrics, val_metrics=val_metrics, loss_fn=loss_fn) - logging.info('Built Composer model\n') - - # Optimizer - logging.info('Building optimizer and learning rate scheduler') - optimizer = DecoupledSGDW(composer_model.parameters(), - lr=args.learning_rate, - momentum=args.momentum, - weight_decay=args.weight_decay) - - # Learning rate scheduler: LR warmup for 8 epochs, then cosine decay for the rest of training - lr_scheduler = CosineAnnealingWithWarmupScheduler(t_warmup=args.t_warmup, t_max=args.t_max) - logging.info('Built optimizer and learning rate scheduler\n') - - # Callbacks for logging - logging.info('Building SpeedMonitor, LRMonitor, and CheckpointSaver callbacks') - speed_monitor = SpeedMonitor(window_size=50) # Measures throughput as samples/sec and tracks total training time - lr_monitor = LRMonitor() # Logs the learning rate - - # Callback for checkpointing - checkpoint_saver = CheckpointSaver(folder=args.save_checkpoint_dir, save_interval=args.checkpoint_interval) - logging.info('Built SpeedMonitor, LRMonitor, and CheckpointSaver callbacks\n') - - # Recipes for training ResNet architectures on ImageNet in order of increasing training time and accuracy - # To learn about individual methods, check out "Methods Overview" in our documentation: https://docs.mosaicml.com/ - logging.info('Building algorithm recipes') - if args.recipe_name == 'mild': - algorithms = [ - BlurPool(), - ChannelsLast(), - EMA(half_life='100ba', update_interval='20ba'), - ProgressiveResizing(initial_scale=0.5, delay_fraction=0.4, finetune_fraction=0.2), - LabelSmoothing(smoothing=0.08), - ] - elif args.recipe_name == 'medium': - algorithms = [ - BlurPool(), - ChannelsLast(), - EMA(half_life='100ba', update_interval='20ba'), - ProgressiveResizing(initial_scale=0.5, delay_fraction=0.4, finetune_fraction=0.2), - 
LabelSmoothing(smoothing=0.1), - MixUp(alpha=0.2), - SAM(rho=0.5, interval=10), - ] - elif args.recipe_name == 'spicy': - algorithms = [ - BlurPool(), - ChannelsLast(), - EMA(half_life='100ba', update_interval='20ba'), - ProgressiveResizing(initial_scale=0.6, delay_fraction=0.2, finetune_fraction=0.2), - LabelSmoothing(smoothing=0.13), - MixUp(alpha=0.25), - SAM(rho=0.5, interval=5), - ColOut(p_col=0.05, p_row=0.05), - RandAugment(depth=1, severity=9), - StochasticDepth(target_layer_name='ResNetBottleneck', - stochastic_method='sample', - drop_distribution='linear', - drop_rate=0.1) - ] - else: - algorithms = None - logging.info('Built algorithm recipes\n') - - logger = None - if args.wandb_logger: - if args.wandb_entity is None: - raise ValueError('Please specify --wandb_entity argument') - if args.wandb_project is None: - raise ValueError('Please specify --wandb_project argument') - if args.wandb_run_name is None: - raise ValueError('Please specify --wandb_run_name argument') - logger = WandBLogger(entity=args.wandb_entity, project=args.wandb_project, name=args.wandb_run_name) - - # Create the Trainer! - logging.info('Building Trainer') - device = 'gpu' if torch.cuda.is_available() else 'cpu' - precision = 'amp' if device == 'gpu' else 'fp32' # Mixed precision for fast training when using a GPU - trainer = Trainer(run_name=args.run_name, - model=composer_model, - train_dataloader=train_dataspec, - eval_dataloader=eval_dataspec, - eval_interval=args.eval_interval, - optimizers=optimizer, - schedulers=lr_scheduler, - algorithms=algorithms, - loggers=logger, - max_duration=args.max_duration, - callbacks=[speed_monitor, lr_monitor, checkpoint_saver], - load_path=args.load_checkpoint_path, - device=device, - precision=precision, - device_train_microbatch_size='auto', - seed=args.seed) - logging.info('Built Trainer\n') - - # Start training! - logging.info('Train!') - trainer.fit() - - -if __name__ == '__main__': - _main() diff --git a/examples/medical_image_segmentation.ipynb b/examples/medical_image_segmentation.ipynb deleted file mode 100644 index d13f88fbea..0000000000 --- a/examples/medical_image_segmentation.ipynb +++ /dev/null @@ -1,725 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 🩺 Image Segmentation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this notebook you will use Composer and PyTorch to segment pneumothorax (air around or outside of the lungs) from chest radiographic images. This dataset was originally released for a [kaggle competition][kaggle] by the [Society for Informatics in Medicine][siim] (SIIM).\n", - "\n", - "**Disclaimer: This example represents a minimal working baseline. In order to get competitive results this notebook must run for a long time.**\n", - "\n", - "### Recommended Background\n", - "\n", - "This tutorial goes through the process of starting a project from scratch with Composer. It assumes you are fairly familiar with how such a process might look if working with PyTorch. 
In addition, it assumes some familiarity with computer vision models and methods.\n", - "\n", - "To better understand the Composer part, make sure you're comfortable with the material in our [Getting Started][getting_started] tutorial.\n", - "\n", - "### Tutorial Goals and Concepts Covered\n", - "\n", - "The goal of this tutorial is to provide an executable example of a computer vision project in Composer from the ground up.\n", - "\n", - "We will cover:\n", - "\n", - "- installing relevant packages\n", - "- downloading the SIIM dataset from kaggle\n", - "- cleaning and resampling the dataset\n", - "- splitting data for validation\n", - "- visualizing model inputs\n", - "- training a baseline model with Composer\n", - "- using Composer methods\n", - "- next steps\n", - "\n", - "Let's get started!\n", - "\n", - "[kaggle]: https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/overview\n", - "[siim]: https://siim.org/\n", - "[getting_started]: https://docs.mosaicml.com/projects/composer/en/stable/examples/getting_started.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup\n", - "\n", - "Let's get started and configure our environment.\n", - "\n", - "### Install Dependencies\n", - "\n", - "If you haven't already, let's install the following dependencies, which are needed for this example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install kaggle pydicom git+https://github.com/qubvel/segmentation_models.pytorch opencv-python-headless jupyterlab-widgets\n", - "\n", - "%pip install mosaicml\n", - "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# %pip install git+https://github.com/mosaicml/composer.git" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Kaggle Authentication\n", - "\n", - "To access the data you need a Kaggle Account\n", - "- accept competition terms https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/data\n", - "- download `kaggle.json` from https://www.kaggle.com/yourusername/account by clicking \"Create new API token\"\n", - "- make the `kaggle.json` file available to this notebook using the following code cells." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from ipywidgets import FileUpload\n", - "from IPython.display import display\n", - "uploader = FileUpload(accept='.json', multiple=True)\n", - "display(uploader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "kaggle_folder = os.path.join(os.path.expanduser(\"~\"), \".kaggle\")\n", - "os.makedirs(kaggle_folder, exist_ok=True)\n", - "kaggle_config_file = os.path.join(kaggle_folder, \"kaggle.json\")\n", - "with open(kaggle_config_file, 'wb+') as output_file: \n", - " for uploaded_filename in uploader.value:\n", - " content = uploader.value[uploaded_filename]['content'] \n", - " output_file.write(content) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Download and unzip the data \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kaggle datasets download -d seesee/siim-train-test\n", - "!unzip -q siim-train-test.zip -d .\n", - "!ls" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Flatten Image Directories\n", - "The original dataset is oddly nested. We flatten it out so the images are easier to access in our pytorch dataset.\n", - "\n", - "`/siim/dicom-images-train/id/id/id.dcm` to `/siim/dicom-images-train/id.dcm`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from tqdm.auto import tqdm\n", - "\n", - "train_images = list(Path('siim/dicom-images-train').glob('*/*/*.dcm'))\n", - "for image in tqdm(train_images):\n", - " image.replace(f'siim/dicom-images-train/{image.parts[-1]}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Project setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import itertools\n", - "from ipywidgets import interact, fixed, IntSlider\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import torch\n", - "from torch import nn\n", - "import matplotlib.pyplot as plt\n", - "import cv2\n", - "\n", - "# model\n", - "import segmentation_models_pytorch as smp\n", - "\n", - "# data\n", - "from torch.utils.data import DataLoader, Dataset\n", - "from torchvision.utils import draw_segmentation_masks, make_grid\n", - "from pydicom.filereader import dcmread\n", - "from sklearn.model_selection import StratifiedKFold\n", - "\n", - "# transforms\n", - "from albumentations import ShiftScaleRotate, Resize, Compose\n", - "\n", - "from torchmetrics import Metric\n", - "from torchmetrics.collections import MetricCollection\n", - "\n", - "# composer\n", - "from composer import Trainer\n", - "from composer.models import ComposerModel\n", - "from composer.optim import DecoupledAdamW\n", - "from composer.metrics.metrics import Dice" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Utils\n", - "\n", - "Here we define some utility functions to help with logging, decoding/encoding targets, and visualization." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LossMetric(Metric):\n", - " \"\"\"Turns any torch.nn Loss Module into distributed torchmetrics Metric.\"\"\"\n", - "\n", - " def __init__(self, loss, dist_sync_on_step=False):\n", - " super().__init__(dist_sync_on_step=dist_sync_on_step)\n", - " self.loss = loss\n", - " self.add_state(\"sum_loss\", default=torch.tensor(0.), dist_reduce_fx=\"sum\")\n", - " self.add_state(\"total_batches\", default=torch.tensor(0), dist_reduce_fx=\"sum\")\n", - "\n", - " def update(self, preds, target):\n", - " \"\"\"Update the state with new predictions and targets.\n", - " \"\"\"\n", - " # Loss calculated over samples/batch, accumulate loss over all batches\n", - " self.sum_loss += self.loss(preds, target)\n", - " self.total_batches += 1\n", - "\n", - " def compute(self):\n", - " \"\"\"Aggregate state over all processes and compute the metric.\n", - " \"\"\"\n", - " # Return average loss over entire validation dataset\n", - " return self.sum_loss / self.total_batches\n", - "\n", - "def rle2mask(rle, height=1024, width=1024, fill_value=1):\n", - " mask = np.zeros((height, width), np.float32)\n", - " mask = mask.reshape(-1)\n", - " rle = np.array([int(s) for s in rle.strip().split(' ')])\n", - " rle = rle.reshape(-1, 2)\n", - " start = 0\n", - " for index, length in rle:\n", - " start = start+index\n", - " end = start+length\n", - " mask[start: end] = fill_value\n", - " start = end\n", - " mask = mask.reshape(width, height).T\n", - " return mask\n", - "\n", - "def mask2rle(mask):\n", - " mask = mask.T.flatten()\n", - " start = np.where(mask[1:] > mask[:-1])[0]+1\n", - " end = np.where(mask[:-1] > mask[1:])[0]+1\n", - " length = end-start\n", - " rle = []\n", - " for i in range(len(length)):\n", - " if i == 0:\n", - " rle.extend([start[0], length[0]])\n", - " else:\n", - " rle.extend([start[i]-end[i-1], length[i]])\n", - " rle = ' '.join([str(r) for r in rle])\n", - " return rle" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preprocessing and Data Science" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SIIM Dataset\n", - "\n", - "The SIIM dataset consists of:\n", - "- `dicom-images-train` - 12954 labeled images in [DICOM][dicom] format.\n", - "- `dicom-images-test` - 3205 unlabeled DICOM images for testing\n", - "\n", - "- `train-rle.csv` comes with a label file `train-rle.csv` mapping `ImageId` to `EncodedPixels`.\n", - "\n", - " - `ImageId`s map to image paths for [DICOM][dicom_format] format images. \n", - "\n", - " - `EncodedPixels` are [run length encoded][masks] segmentation masks representing areas where pneumothorax has been labeled by an expert. 
A label of `\"-1\"` indicates the image was examined and no pneumothorax was found.\n", - "\n", - "[dicom]: https://pydicom.github.io/pydicom/stable/auto_examples/input_output/plot_read_dicom\n", - "[dicom_format]: https://pydicom.github.io/pydicom/stable/auto_examples/input_output/plot_read_dicom.html#sphx-glr-auto-examples-input-output-plot-read-dicom-py\n", - "[masks]: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/mask.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!ls siim" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "labels_df = pd.read_csv('siim/train-rle.csv')\n", - "labels_df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Clean Data\n", - "Of the ~13,000 images, only 3600 have masks. We will throw out some of the negative samples to better balance our dataset and speed up training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "labels_df[labels_df[\" EncodedPixels\"] != \"-1\"].shape, labels_df[labels_df[\" EncodedPixels\"] == \"-1\"].shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def balance_labels(labels_df, extra_samples_without_mask=1500, random_state=1337):\n", - " \"\"\"\n", - " Drop duplicates and mark samples with masks.\n", - " Sample 3576+extra_samples_without_mask unmasked samples to balance dataset.\n", - " \"\"\"\n", - " df = labels_df.drop_duplicates('ImageId')\n", - " df_with_mask = df[df[\" EncodedPixels\"] != \"-1\"].copy(deep=True)\n", - " df_with_mask['has_mask'] = 1\n", - " df_without_mask = df[df[\" EncodedPixels\"] == \"-1\"].copy(deep=True)\n", - " df_without_mask['has_mask'] = 0\n", - " df_without_mask_sampled = df_without_mask.sample(len(df_with_mask)+extra_samples_without_mask, random_state=random_state)\n", - " df = pd.concat([df_with_mask, df_without_mask_sampled])\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = balance_labels(labels_df)\n", - "df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Cross Validation Splits\n", - "Once cleaned and balanced, we're left with only 6838 images. This will leave us with rather small training and validation sets once we split the data. To mitigate the chances of us validating on a poorly sampled (not representative of our unlabeled test data) validation set, we use [StratifiedKFold][kfold] to create 5 different 80%-20%, `train` `eval` splits. 
\n", - "\n", - "**Note**: For datasets of this size, it's good practice to train and evaluate on each split, but due to runtime constraints in this notebook we will only train on the first split which contains 5470 training and 1368 eval samples.\n", - "\n", - "[kfold]: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)\n", - "train_idx, eval_idx = list(kfold.split(df[\"ImageId\"], df[\"has_mask\"]))[0]\n", - "train_df, eval_df = df.iloc[train_idx], df.iloc[eval_idx]\n", - "train_df.shape, eval_df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PyTorch\n", - "\n", - "### PyTorch Dataset\n", - "`SIIMDataset` is a standard PyTorch dataset that reads images and decodes labels from the siim label csv. DICOM images are loaded as grayscale numpy arrays, converted to rgb, and scaled. Labels are converted from rle strings to binary segmentation masks. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class SIIMDataset(Dataset):\n", - " def __init__(self, \n", - " labels_df,\n", - " transforms=None,\n", - " image_dir=Path('siim/dicom-images-train')):\n", - " self.labels_df = labels_df\n", - " self.image_dir = image_dir\n", - " self.transforms = transforms\n", - "\n", - " def __getitem__(self, idx):\n", - " row = self.labels_df.iloc[idx]\n", - " image_id = row.ImageId\n", - " image_path = self.image_dir / f'{image_id}.dcm'\n", - " image = dcmread(image_path).pixel_array # load dicom image\n", - " image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) # convert rgb so we can keep imagenet first layer weights\n", - " image = (image / 255.).astype('float32') # scale (0.- 1.)\n", - "\n", - " rle = row[' EncodedPixels']\n", - " if rle != '-1':\n", - " mask = rle2mask(rle, 1024, 1024).astype('float32')\n", - " else:\n", - " mask = np.zeros([1024, 1024]).astype('float32')\n", - "\n", - " if self.transforms:\n", - " augmented = self.transforms(image=image, mask=mask)\n", - " image = augmented['image']\n", - " mask = augmented['mask']\n", - "\n", - " return (\n", - " torch.from_numpy(image).permute(2, 0, 1),\n", - " torch.from_numpy(mask).unsqueeze(0)\n", - " )\n", - "\n", - " def __len__(self):\n", - " return len(self.labels_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Transforms\n", - "We use the [albumentations](https://albumentations.ai/docs/getting_started/mask_augmentation/) library to resize and randomly scale/rotate our training images. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_size = 512\n", - "\n", - "train_transforms = Compose(\n", - " [\n", - " Resize(image_size, image_size),\n", - " ShiftScaleRotate(\n", - " shift_limit=0,\n", - " scale_limit=0.1,\n", - " rotate_limit=10, # rotate\n", - " p=0.5,\n", - " border_mode=cv2.BORDER_CONSTANT\n", - " )\n", - " ]\n", - ")\n", - "\n", - "eval_transforms = Compose([Resize(image_size, image_size)])\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DataLoaders" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "train_batch_size = 32\n", - "val_batch_size = 32\n", - "\n", - "train_dataloader = DataLoader(SIIMDataset(train_df, transforms=train_transforms),\n", - " batch_size=train_batch_size, shuffle=True, num_workers=2)\n", - "\n", - "eval_dataloader = DataLoader(SIIMDataset(eval_df, transforms=eval_transforms),\n", - " batch_size=val_batch_size, shuffle=False, num_workers=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Visualize batch\n", - "Areas of pneumothorax are highlighted in red; drag the slider to iterate through batches." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@interact(data_loader=fixed(train_dataloader), batch=IntSlider(min=0, max=len(train_dataloader)-1, step=1, value=0))\n", - "def show_batch(data_loader, batch):\n", - " plt.rcParams['figure.figsize'] = [20, 15]\n", - "\n", - " images, masks = list(itertools.islice(data_loader, batch, batch+1))[0]\n", - " masks_list = []\n", - " for image, mask in zip(images, masks):\n", - " masked = draw_segmentation_masks((image * 255).byte(),\n", - " mask.bool(), alpha=0.5, colors='red')\n", - " masks_list.append(masked)\n", - "\n", - " grid = make_grid(masks_list, nrow=6)\n", - " plt.imshow(grid.permute(1, 2, 0));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Composer\n", - "\n", - "### Model\n", - "\n", - "Here we define a Composer model that wraps the smp [segmentation models pytorch][pytorch_seg] package. This lets us quickly create many different segmentation models made from common pre-trained PyTorch encoders. \n", - "\n", - "- We set defaults to create a [Unet][unet] from an ImageNet pre-trained ResNet-34 with 3 input channels for our RGB (converted) inputs and 1 output channel. 
\n", - "- We set the default loss to `nn.BCEWithLogitsLoss()` to classify each pixel of the output.\n", - "\n", - "[pytorch_seg]: https://github.com/qubvel/segmentation_models.pytorch\n", - "[unet]: https://arxiv.org/abs/1505.04597" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class SMPUNet(ComposerModel):\n", - " def __init__(self,\n", - " encoder_name='resnet34',\n", - " encoder_weights='imagenet',\n", - " in_channels=3, classes=1,\n", - " loss=nn.BCEWithLogitsLoss()):\n", - " super().__init__()\n", - " self.model = smp.Unet(\n", - " encoder_name=encoder_name,\n", - " encoder_weights=encoder_weights, # use `imagenet` pre-trained weights for encoder initialization\n", - " in_channels=in_channels, # model input channels (1 for gray-scale images, 3 for RGB, etc.)\n", - " classes=classes # model output channels (number of classes in your dataset)\n", - " ) \n", - "\n", - " self.criterion = loss\n", - " self.train_loss = LossMetric(loss)\n", - " self.val_loss = LossMetric(loss)\n", - " self.val_dice = Dice(num_classes=classes)\n", - "\n", - " def forward(self, batch):\n", - " images, targets = batch\n", - " return self.model(images)\n", - "\n", - " def loss(self, outputs, batch):\n", - " _, targets = batch\n", - " return self.criterion(outputs, targets)\n", - "\n", - " def get_metrics(self, is_train: bool = False):\n", - " if is_train:\n", - " return {'BCEWithLogitsLoss': self.train_loss}\n", - " else:\n", - " return {'BCEWithLogitsLoss': self.val_loss, 'Dice': self.dice}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = SMPUNet() # define unet model\n", - "optimizer = DecoupledAdamW(model.parameters(), lr=1e-3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Trainer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer = Trainer(\n", - " model=model,\n", - " train_dataloader=train_dataloader,\n", - " eval_dataloader=eval_dataloader,\n", - " max_duration='2ep',\n", - " optimizers=optimizer,\n", - " device='gpu',\n", - " precision='amp',\n", - " seed=1337\n", - ")\n", - "trainer.fit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Algorithms\n", - "\n", - "Composer allows us to quickly experiment with algorithms that can speed up or improve the quality of our model. This is how we can add `CutOut` and `LabelSmoothing`\n", - "\n", - "Additionally, the Composer trainer has builtin support for automatic mixed precision training and gradient accumulation to help train quickly and simulate larger batch sizes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from composer.algorithms import CutOut, LabelSmoothing\n", - "\n", - "model = SMPUNet() # define unet model\n", - "optimizer = DecoupledAdamW(model.parameters(), lr=1e-3)\n", - "\n", - "algorithms = [CutOut(length=0.5), LabelSmoothing(smoothing=0.1)]\n", - "\n", - "trainer = Trainer(\n", - " model=model,\n", - " train_dataloader=train_dataloader,\n", - " eval_dataloader=eval_dataloader,\n", - " max_duration='2ep',\n", - " optimizers=optimizer,\n", - " algorithms=algorithms,\n", - " device='gpu',\n", - " precision='amp',\n", - " seed=1337\n", - ")\n", - "trainer.fit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## What next?\n", - "\n", - "You've now seen a from-scratch demonstration of using Composer in a computer vision project. But don't stop here! If you're interested, we recommend that you continue to experiment with:\n", - "\n", - "- training longer\n", - "- different loss functions, architectures, transformations, and\n", - "- different combinations of composer methods!\n", - "\n", - "In addition, please continue to explore our tutorials! Here are a couple suggestions:\n", - "\n", - "* Continue to explore more advanced applications of Composer like [fine-tuning a transformer for sentiment classification][huggingface_tutorial].\n", - "\n", - "* Learn about callbacks and how to apply [early stopping][early_stopping_tutorial].\n", - "\n", - "* See how dataloading bottlenecks in computer vision can be addressed using [FFCV][ffcv].\n", - "\n", - "[image_segmentation_tutorial]: https://docs.mosaicml.com/projects/composer/en/stable/examples/medical_image_segmentation.html\n", - "[huggingface_tutorial]: https://docs.mosaicml.com/projects/composer/en/stable/examples/huggingface_models.html\n", - "[early_stopping_tutorial]: https://docs.mosaicml.com/projects/composer/en/stable/examples/early_stopping.html\n", - "[ffcv]: https://docs.mosaicml.com/projects/composer/en/stable/examples/ffcv_dataloaders.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Come get involved with MosaicML!\n", - "\n", - "We'd love for you to get involved with the MosaicML community in any of these ways:\n", - "\n", - "### [Star Composer on GitHub](https://github.com/mosaicml/composer)\n", - "\n", - "Help make others aware of our work by [starring Composer on GitHub](https://github.com/mosaicml/composer).\n", - "\n", - "### [Join the MosaicML Slack](https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg)\n", - "\n", - "Head on over to the [MosaicML slack](https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg) to join other ML efficiency enthusiasts. Come for the paper discussions, stay for the memes!\n", - "\n", - "### Contribute to Composer\n", - "\n", - "Is there a bug you noticed or a feature you'd like? File an [issue](https://github.com/mosaicml/composer/issues) or make a [pull request](https://github.com/mosaicml/composer/pulls)!" 
- ] - } - ], - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/examples/profiler_demo.py b/examples/profiler_demo.py index f06fa17f06..d46c89e559 100644 --- a/examples/profiler_demo.py +++ b/examples/profiler_demo.py @@ -8,11 +8,13 @@ # [imports-start] import torch +import torch.nn as nn +import torch.nn.functional as F from torch.utils.data import DataLoader from torchvision import datasets, transforms from composer import Trainer -from composer.models import mnist_model +from composer.models.tasks import ComposerClassifier from composer.profiler import JSONTraceHandler, cyclic_schedule from composer.profiler.profiler import Profiler @@ -35,10 +37,39 @@ persistent_workers=True, num_workers=8, ) + # [dataloader-end] + # Instantiate Model -model = mnist_model(num_classes=10) +class Model(nn.Module): + """Toy convolutional neural network architecture in pytorch for MNIST.""" + + def __init__(self, num_classes: int = 10): + super().__init__() + + self.num_classes = num_classes + + self.conv1 = nn.Conv2d(1, 16, (3, 3), padding=0) + self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=0) + self.bn = nn.BatchNorm2d(32) + self.fc1 = nn.Linear(32 * 16, 32) + self.fc2 = nn.Linear(32, num_classes) + + def forward(self, x): + out = self.conv1(x) + out = F.relu(out) + out = self.conv2(out) + out = self.bn(out) + out = F.relu(out) + out = F.adaptive_avg_pool2d(out, (4, 4)) + out = torch.flatten(out, 1, -1) + out = self.fc1(out) + out = F.relu(out) + return self.fc2(out) + + +model = ComposerClassifier(module=Model(num_classes=10)) # [trainer-start] # Instantiate the trainer diff --git a/examples/segmentation/README.md b/examples/segmentation/README.md deleted file mode 100644 index 8eaa391184..0000000000 --- a/examples/segmentation/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# Semantic Segmentation Example - -This example illustrates how to train a semantic segmentation model in composer. - -## Installation - -First, install [Composer](https://github.com/mosaicml/composer) with `pip install mosaicml`. Additionally, our models are pulled from [MMsegmentation](https://github.com/open-mmlab/mmsegmentation), so follow the [MMcv install instructions](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) (which is dependent on your CUDA and PyTorch versions), then install MMsegmentation with `pip install mmsegmentation`. - -Alternatively, we have publicly available Docker images to reproduce our results. Use `mosaicml/pytorch_vision:1.12.1_cu116-python3.9-ubuntu20.04` for running on GPUs or `mosaicml/pytorch_vision:1.12.1_cpu-python3.9-ubuntu20.04` for running on CPUs. - -## DeepLabv3+ on ADE20k - -The `train_deeplabv3_ade20k.py` script trains a DeepLabv3+ model with either a ResNet-50 or ResNet-101 backbone on the ADE20k semantic segmentation benchmark. To download ADE20k locally (~1 GB), specify the `--download` option when running the script, then the dataset will be downloaded data directory path i.e. the first argument. - -We designed the script to be hackable, so try our recipes on your own models and datsets! 
-### Example configurations - - - -```bash -# Downloads ADE20k and does single GPU/CPU training depending on torch.cuda.is_available(): -python train_deeplabv3_ade20k.py /path/to/ade20k --download - -# Log experiments to Weights and Biases: -python train_deeplabv3_ade20k.py /path/to/ade20k --wandb_logger --wandb_entity my_username --wandb_project my_project --run_name my_run_name - -# Single/Multi GPU training (infers the number of GPUs available): -composer train_deeplabv3_ade20k.py /path/to/ade20k - -# Manually specify number of GPUs to use: -composer -n $N_GPUS train_deeplabv3_ade20k.py /path/to/ade20k - -# Mild DeepLabv3+ recipe for fastest training to 45.6 mIoU: -composer train_deeplabv3_ade20k.py /path/to/ade20k/ --recipe_name mild --max_duration 25ep - -# Medium DeepLabv3+ recipe for highest mIoU (49.15) with similar training time as baseline: -composer train_deeplabv3_ade20k.py /path/to/ade20k/ --recipe_name medium --max_duration 90ep - -# Hot DeepLabv3+ recipe for highest mIoU (49.83) with a long training schedule: -composer train_deeplabv3_ade20k.py /path/to/ade20k --recipe_name hot --max_duration 256ep -``` diff --git a/examples/segmentation/train_deeplabv3_ade20k.py b/examples/segmentation/train_deeplabv3_ade20k.py deleted file mode 100644 index 90d93aa037..0000000000 --- a/examples/segmentation/train_deeplabv3_ade20k.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""Example script to train a DeepLabv3+ model on ADE20k for semantic segmentation.""" - -import argparse -import logging -import os - -import torch -import torchvision -from torch.utils.data import DataLoader -from torchmetrics import MetricCollection -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode - -from composer import DataSpec, Time, Trainer -from composer.algorithms import EMA, SAM, ChannelsLast, MixUp -from composer.callbacks import CheckpointSaver, ImageVisualizer, LRMonitor, SpeedMonitor -from composer.datasets.ade20k import (ADE20k, PadToSize, PhotometricDistoration, RandomCropPair, RandomHFlipPair, - RandomResizePair) -from composer.datasets.utils import NormalizationFn, pil_image_collate -from composer.loggers import WandBLogger -from composer.loss import DiceLoss, soft_cross_entropy -from composer.metrics import CrossEntropy, MIoU -from composer.models import ComposerClassifier -from composer.models.deeplabv3.model import deeplabv3 -from composer.optim import CosineAnnealingScheduler, DecoupledSGDW -from composer.utils import dist - -logging.basicConfig() -logging.getLogger().setLevel(logging.INFO) - -parser = argparse.ArgumentParser() - -# Dataloader command-line arguments -parser.add_argument('data_dir', help='Path to the directory containing the ImageNet-1k dataset', type=str) -parser.add_argument('--download', - help='Use to download ADE20k from the internet and put it in the `data_dir`', - action='store_true') -parser.add_argument('--train_resize_size', help='Training image resize size', type=int, default=512) -parser.add_argument('--eval_resize_size', help='Evaluation image resize size', type=int, default=512) -parser.add_argument('--train_batch_size', help='Train dataloader per-device batch size', type=int, default=128) -parser.add_argument('--eval_batch_size', help='Validation dataloader per-device batch size', type=int, default=128) - -# Model command-line arguments -parser.add_argument('--backbone_arch', - help='Architecture to use for the backbone.', - 
default='resnet101', - choices=['resnet50', 'resnet101']) -parser.add_argument('--sync_bn', - help='Use sync BatchNorm. Recommended if the per device microbatch size is below 16', - action='store_true') -parser.add_argument('--cross_entropy_weight', help='Weight to scale the cross entropy loss', type=float, default=0.375) -parser.add_argument('--dice_weight', help='Weight to scale the dice loss', type=float, default=1.125) - -# Optimizer command-line arguments -parser.add_argument('--learning_rate', help='Optimizer learning rate', type=float, default=0.08) -parser.add_argument('--momentum', help='Optimizer momentum', type=float, default=0.9) -parser.add_argument('--weight_decay', help='Optimizer weight decay', type=float, default=5.0e-5) - -# Save checkpoint command-line arguments -parser.add_argument('--save_checkpoint_dir', - help='Directory in which to save model checkpoints', - type=str, - default='checkpoints/{run_name}') -parser.add_argument('--checkpoint_interval', - help='Frequency to save checkpoints', - type=Time.from_timestring, - default='1ep') - -# Load checkpoint command-line arguments, assumes resuming from a previous training run (as opposed to fine-tuning) -parser.add_argument('--load_checkpoint_path', help='Path to the checkpoint to load', type=str) - -# Recipes command-line argument -parser.add_argument('--recipe_name', - help='Algorithmic recipes to be applied to the trainer', - choices=['mild', 'medium', 'hot']) - -# Logger command-line arguments -# Note: Only Weights and Biases to minimize arguments. Other loggers can be used by adjusting the script -parser.add_argument('--wandb_logger', help='Whether or not to log results to Weights and Biases', action='store_true') -parser.add_argument('--wandb_entity', help='WandB entity name', type=str) -parser.add_argument('--wandb_project', help='WandB project name', type=str) - -parser.add_argument('--image_viz', help='Whether or not to log images using ImageVisualizer', action='store_true') - -# Trainer arguments -parser.add_argument('--device_train_microbatch_size', - help='Size of train microbatch size if running on GPU', - default='auto') -parser.add_argument('--run_name', help='Name of the training run used for checkpointing and logging', type=str) -parser.add_argument('--seed', help='Random seed', type=int, default=17) -parser.add_argument('--max_duration', - help='Duration to train specified as a Time string', - type=Time.from_timestring, - default='128ep') - -args = parser.parse_args() - -IMAGENET_CHANNEL_MEAN = (int(0.485 * 255), int(0.456 * 255), int(0.406 * 255)) -IMAGENET_CHANNEL_STD = (int(0.229 * 255), int(0.224 * 255), int(0.225 * 255)) - -ADE20K_URL = 'http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip' -ADE20K_FILE = 'ADEChallengeData2016.zip' - - -def _main(): - # Divide batch size by number of devices - if dist.get_world_size() > 1: - args.train_batch_size = args.train_batch_size // dist.get_world_size() - args.eval_batch_size = args.eval_batch_size // dist.get_world_size() - - # Train dataset code - logging.info('Building train dataloader') - - if args.download: - torchvision.datasets.utils.download_and_extract_archive(url=ADE20K_URL, - download_root=args.data_dir, - filename=ADE20K_FILE, - remove_finished=True) - # Adjust the data_dir to include the extracted directory - args.data_dir = os.path.join(args.data_dir, 'ADEChallengeData2016') - - # Training transforms applied to both the image and target - train_both_transforms = torch.nn.Sequential( - RandomResizePair( - min_scale=0.5, - 
max_scale=2.0, - base_size=(args.train_resize_size, args.train_resize_size), - ), - RandomCropPair( - crop_size=(args.train_resize_size, args.train_resize_size), - class_max_percent=0.75, - num_retry=10, - ), - RandomHFlipPair(), - ) - - # Training transforms applied to the image only - train_image_transforms = torch.nn.Sequential( - PhotometricDistoration( - brightness=32. / 255, - contrast=0.5, - saturation=0.5, - hue=18. / 255, - ), - PadToSize( - size=(args.train_resize_size, args.train_resize_size), - fill=IMAGENET_CHANNEL_MEAN, - ), - ) - - # Training transforms applied to the target only - train_target_transforms = PadToSize(size=(args.train_resize_size, args.train_resize_size), fill=0) - - # Create ADE20k train dataset - train_dataset = ADE20k( - datadir=args.data_dir, - split='training', - image_transforms=train_image_transforms, - target_transforms=train_target_transforms, - both_transforms=train_both_transforms, - ) - - # Create ADE20k train dataloader - - train_sampler = None - if dist.get_world_size(): - # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware setup - train_sampler = dist.get_sampler(train_dataset, drop_last=True, shuffle=True) - - train_dataloader = DataLoader( - train_dataset, - batch_size=args.train_batch_size, - num_workers=8, - pin_memory=True, - drop_last=True, # Prevents using a smaller batch at the end of an epoch - sampler=train_sampler, - collate_fn=pil_image_collate, - persistent_workers=True, - ) - - # DataSpec enables image normalization to be performed on-GPU, marginally relieving dataloader bottleneck - train_dataspec = DataSpec(dataloader=train_dataloader, - device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, - std=IMAGENET_CHANNEL_STD, - ignore_background=True)) - logging.info('Built train dataloader\n') - - # Validation dataset code - logging.info('Building evaluation dataloader') - - # Validation image and target transformations - image_transforms = transforms.Resize(size=(args.eval_resize_size, args.eval_resize_size), - interpolation=InterpolationMode.BILINEAR) - target_transforms = transforms.Resize(size=(args.eval_resize_size, args.eval_resize_size), - interpolation=InterpolationMode.NEAREST) - - # Create ADE20k validation dataset - val_dataset = ADE20k(datadir=args.data_dir, - split='validation', - both_transforms=None, - image_transforms=image_transforms, - target_transforms=target_transforms) - - #Create ADE20k validation dataloader - - val_sampler = None - if dist.get_world_size(): - # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware - val_sampler = dist.get_sampler(val_dataset, drop_last=False, shuffle=False) - - val_dataloader = DataLoader( - val_dataset, - batch_size=args.eval_batch_size, - num_workers=8, - pin_memory=True, - drop_last=False, - sampler=val_sampler, - collate_fn=pil_image_collate, - persistent_workers=True, - ) - - # DataSpec enables image normalization to be performed on-GPU, marginally relieving dataloader bottleneck - val_dataspec = DataSpec(dataloader=val_dataloader, - device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, - std=IMAGENET_CHANNEL_STD, - ignore_background=True)) - logging.info('Built validation dataset\n') - - logging.info('Building Composer DeepLabv3+ model') - - # Create a DeepLabv3+ model - model = deeplabv3( - num_classes=150, - backbone_arch=args.backbone_arch, - backbone_weights='IMAGENET1K_V2', - sync_bn=args.sync_bn, - use_plus=True, - ) - - # Initialize the classifier head only since the backbone uses pre-trained 
weights - def weight_init(module: torch.nn.Module): - if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): - torch.nn.init.kaiming_normal_(module.weight) - if isinstance(module, torch.nn.BatchNorm2d): - torch.nn.init.ones_(module.weight) - torch.nn.init.zeros_(module.bias) - - model.classifier.apply(weight_init) # type: ignore Does not recognize classifier as a torch.nn.Module - - # Loss function to use during training - # This ignores index -1 since the NormalizationFn transformation sets the background class to -1 - dice_loss_fn = DiceLoss(softmax=True, batch=True, ignore_absent_classes=True) - - def combo_loss(output, target): - loss = {} - loss['cross_entropy'] = soft_cross_entropy(output, target, ignore_index=-1) - loss['dice'] = dice_loss_fn(output, target) - loss['total'] = args.cross_entropy_weight * loss['cross_entropy'] + args.dice_weight * loss['dice'] - return loss - - # Training and Validation metrics to log throughout training - train_metrics = MetricCollection([CrossEntropy(ignore_index=-1), MIoU(num_classes=150, ignore_index=-1)]) - val_metrics = MetricCollection([CrossEntropy(ignore_index=-1), MIoU(num_classes=150, ignore_index=-1)]) - - # Create a ComposerClassifier using the model, loss function, and metrics - composer_model = ComposerClassifier(module=model, - train_metrics=train_metrics, - val_metrics=val_metrics, - loss_fn=combo_loss) - - logging.info('Built Composer DeepLabv3+ model\n') - - logging.info('Building optimizer and learning rate scheduler') - # Optimizer - optimizer = DecoupledSGDW(composer_model.parameters(), - lr=args.learning_rate, - momentum=args.momentum, - weight_decay=args.weight_decay) - - # Only use a LR schedule if no recipe is specified or if the hot recipe was specified - lr_scheduler = None - if args.recipe_name is None or args.recipe_name == 'hot': - lr_scheduler = CosineAnnealingScheduler() - - logging.info('Built optimizer and learning rate scheduler') - - logging.info('Building callbacks: SpeedMonitor, LRMonitor, and CheckpointSaver') - speed_monitor = SpeedMonitor(window_size=50) # Measures throughput as samples/sec and tracks total training time - lr_monitor = LRMonitor() # Logs the learning rate - - # Callback for checkpointing - checkpoint_saver = CheckpointSaver(folder=args.save_checkpoint_dir, save_interval=args.checkpoint_interval) - logging.info('Built callbacks: SpeedMonitor, LRMonitor, and CheckpointSaver\n') - - # Recipes for training DeepLabv3+ on ImageNet in order of increasing training time and accuracy - # To learn about individual methods, check out "Methods Overview" in our documentation: https://docs.mosaicml.com/ - logging.info('Building algorithm recipes') - if args.recipe_name == 'mild': - algorithms = [ - ChannelsLast(), - EMA(half_life='1000ba', update_interval='10ba'), - ] - elif args.recipe_name == 'medium': - algorithms = [ - ChannelsLast(), - EMA(half_life='1000ba', update_interval='10ba'), - SAM(rho=0.3, interval=2), - MixUp(alpha=0.2), - ] - elif args.recipe_name == 'hot': - algorithms = [ - ChannelsLast(), - EMA(half_life='2000ba', update_interval='1ba'), - SAM(rho=0.3, interval=1), - MixUp(alpha=0.5), - ] - else: - algorithms = None - logging.info('Built algorithm recipes\n') - - # Weight and Biases logger if specified in commandline - logger = None - if args.wandb_logger: - logging.info('Building Weights and Biases logger') - if args.wandb_entity is None: - raise ValueError('Please specify --wandb_entity argument') - if args.wandb_project is None: - raise ValueError('Please specify --wandb_project 
argument') - logger = WandBLogger(entity=args.wandb_entity, project=args.wandb_project) - logging.info('Built Weights and Biases logger') - - callbacks = [speed_monitor, lr_monitor, checkpoint_saver] - if args.image_viz: - callbacks.append(ImageVisualizer(mode='segmentation')) - # Create the Trainer! - logging.info('Building Trainer') - device = 'gpu' if torch.cuda.is_available() else 'cpu' - precision = 'amp' if device == 'gpu' else 'fp32' # Mixed precision for fast training when using a GPU - device_train_microbatch_size = 'auto' if device == 'gpu' else args.device_train_microbatch_size # If on GPU, use 'auto' gradient accumulation - trainer = Trainer(run_name=args.run_name, - model=composer_model, - train_dataloader=train_dataspec, - eval_dataloader=val_dataspec, - eval_interval='1ep', - optimizers=optimizer, - schedulers=lr_scheduler, - algorithms=algorithms, - loggers=logger, - max_duration=args.max_duration, - callbacks=callbacks, - load_path=args.load_checkpoint_path, - device=device, - precision=precision, - device_train_microbatch_size=device_train_microbatch_size, - seed=args.seed) - logging.info('Built Trainer\n') - - # Start training! - logging.info('Train!') - trainer.fit() - - -if __name__ == '__main__': - _main() diff --git a/pyproject.toml b/pyproject.toml index f4155e23ae..a4800ea34b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,18 +82,15 @@ reportUnusedCoroutine = "error" # Pytest [tool.pytest.ini_options] # By default, do not run gpu, vision, docs, notebook, or daily tests -addopts = "--codeblocks --strict-markers -m 'not gpu and not vision and not doctest and not daily and not remote'" +addopts = "--codeblocks --strict-markers -m 'not gpu and not doctest and not daily and not remote'" markers = [ - # !!!!!!!!!!!IMPORTANT!!!!!!!!!: when updating the markers, also make sure to update meta.yaml # Tests that require a world_size of two should be annotated with `@pytest.mark.world_size(2)`. 
# If not specified, the test will be assumed to have a world-size of one, which is # equivalent to `@pytest.mark.world_size(1)` "world_size(val)", # Tests that require a gpu should be annotated with `@pytest.mark.gpu` "gpu", - # Whether the test should run in a container based on the vision dockerimage, which contains ffcv and opencv - "vision", # Tests which are run as part of the documentation build "doctest", # Should be run during daily regression diff --git a/setup.py b/setup.py index 3555668370..c87feaf05e 100644 --- a/setup.py +++ b/setup.py @@ -142,10 +142,6 @@ def package_files(prefix: str, directory: str, extension: str): 'setuptools<=59.5.0', ] -extra_deps['health_checker'] = { - 'pynvml>=11.5.0,<12', -} - extra_deps['system_metrics_monitor'] = { 'pynvml>=11.5.0,<12', } @@ -171,19 +167,6 @@ def package_files(prefix: str, directory: str, extension: str): 'tensorboard>=2.9.1,<3.0.0', ] -extra_deps['unet'] = [ - 'monai>=0.9.1,<1.4', - 'scikit-learn>=1.0.1,<2', -] - -extra_deps['vit'] = [ - 'vit_pytorch==1.6.1', -] - -extra_deps['timm'] = [ - 'timm>=0.5.4,<0.6', -] - extra_deps['coco'] = [ 'pycocotools>=2.0.4,<3', ] diff --git a/tests/algorithms/algorithm_settings.py b/tests/algorithms/algorithm_settings.py index 940ca040f2..91ecf2dac2 100644 --- a/tests/algorithms/algorithm_settings.py +++ b/tests/algorithms/algorithm_settings.py @@ -21,12 +21,11 @@ LabelSmoothing, LayerFreezing, LowPrecisionGroupNorm, LowPrecisionLayerNorm, MixUp, NoOpModel, ProgressiveResizing, RandAugment, SelectiveBackprop, SeqLengthWarmup, SqueezeExcite, StochasticDepth, WeightStandardization) -from composer.models import composer_resnet from composer.models.base import ComposerModel from composer.utils import dist from tests.common import get_module_subclasses from tests.common.datasets import RandomImageDataset, SimpleDataset, dummy_bert_lm_dataloader, dummy_gpt_lm_dataloader -from tests.common.models import (SimpleConvModel, SimpleModelWithDropout, configure_tiny_bert_hf_model, +from tests.common.models import (SimpleConvModel, SimpleModelWithDropout, composer_resnet, configure_tiny_bert_hf_model, configure_tiny_gpt2_hf_model) simple_bert_settings = { diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py index ddb05a0c3c..fd3fad0628 100644 --- a/tests/algorithms/test_required_on_load.py +++ b/tests/algorithms/test_required_on_load.py @@ -14,8 +14,8 @@ from composer import Trainer, algorithms from composer.callbacks import CheckpointSaver from composer.core import Algorithm, Event, Time, TimeUnit # type: ignore imports used in `eval(representation)` -from composer.models import ComposerClassifier, ComposerModel, composer_resnet -from tests.common import ConvModel, SimpleConvModel +from composer.models import ComposerClassifier, ComposerModel +from tests.common import ConvModel, SimpleConvModel, composer_resnet def initialize_algorithm(algo_cls: Type): diff --git a/tests/algorithms/test_stochastic_depth.py b/tests/algorithms/test_stochastic_depth.py index 23c21bd816..2ec267756a 100644 --- a/tests/algorithms/test_stochastic_depth.py +++ b/tests/algorithms/test_stochastic_depth.py @@ -14,8 +14,8 @@ from composer.algorithms.stochastic_depth.stochastic_layers import make_resnet_bottleneck_stochastic from composer.core import Event, State from composer.core.time import TimeUnit -from composer.models import composer_resnet from composer.utils import module_surgery +from tests.common import composer_resnet @pytest.fixture() diff --git 
a/tests/callbacks/callback_settings.py b/tests/callbacks/callback_settings.py index 26a1eeb3df..492b5988be 100644 --- a/tests/callbacks/callback_settings.py +++ b/tests/callbacks/callback_settings.py @@ -11,9 +11,9 @@ import composer.loggers import composer.profiler from composer import Callback -from composer.callbacks import (EarlyStopper, ExportForInferenceCallback, FreeOutputs, Generate, HealthChecker, - ImageVisualizer, MemoryMonitor, MemorySnapshot, MLPerfCallback, SpeedMonitor, - SystemMetricsMonitor, ThresholdStopper) +from composer.callbacks import (EarlyStopper, ExportForInferenceCallback, FreeOutputs, Generate, ImageVisualizer, + MemoryMonitor, MemorySnapshot, MLPerfCallback, SpeedMonitor, SystemMetricsMonitor, + ThresholdStopper) from composer.loggers import (CometMLLogger, ConsoleLogger, LoggerDestination, MLFlowLogger, ProgressBarLogger, RemoteUploaderDownloader, TensorboardLogger, WandBLogger) from composer.models.base import ComposerModel @@ -149,7 +149,6 @@ ImageVisualizer: [pytest.mark.skipif(not _WANDB_INSTALLED, reason='Wandb is optional')], MLFlowLogger: [pytest.mark.skipif(not _MLFLOW_INSTALLED, reason='mlflow is optional'),], SystemMetricsMonitor: [pytest.mark.skipif(not _PYNMVL_INSTALLED, reason='pynmvl is optional'),], - HealthChecker: [pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*')], } diff --git a/tests/callbacks/test_health_checker.py b/tests/callbacks/test_health_checker.py deleted file mode 100644 index 5638699ca9..0000000000 --- a/tests/callbacks/test_health_checker.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import datetime -from unittest.mock import MagicMock, patch - -import pytest - -from composer import Timestamp -from composer.callbacks import HealthChecker -from composer.callbacks.health_checker import GPUUtilization -from composer.utils import dist -from tests.common import world_size - -pynvml = pytest.importorskip('pynvml') -pytest.importorskip('slack_sdk') - - -class MockUtil: - - def __init__(self, util): - self.gpu = util - - -@pytest.mark.gpu -@world_size(1, 2) -@pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*') -def test_gpu_utilization(world_size): - assert HealthChecker._is_available() - - gpu_utilization_values = [ - MockUtil(100), - MockUtil(10), - MockUtil(100), - MockUtil(100), - MockUtil(100), - MockUtil(100), - ] - - with patch.multiple(pynvml, - nvmlDeviceGetUtilizationRates=MagicMock(side_effect=gpu_utilization_values), - nvmlDeviceGetCount=MagicMock(return_value=world_size)): - - gpu_utilization = GPUUtilization() - gpu_utilization.sample() - gpu_utilization.sample() - gpu_utilization.sample() - _, alert = gpu_utilization.check() - - should_alert = dist.get_local_rank() == 0 and world_size > 1 - assert alert == should_alert - - -@pytest.mark.gpu -@world_size(1, 2) -@pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*') -def test_health_checker(world_size): - - state = MagicMock() - state.run_name = 'pytest-mock-run-kwei73' - logger = MagicMock() - - health_checker = HealthChecker( - sample_freq=1, - window_size=3, - wait=0, - ) - - gpu_utilization_values = [ - MockUtil(100), - MockUtil(10), - MockUtil(100), - MockUtil(100), - MockUtil(100), - MockUtil(100), - ] - - with patch.multiple(pynvml, - nvmlDeviceGetUtilizationRates=MagicMock(side_effect=gpu_utilization_values), - nvmlDeviceGetCount=MagicMock(return_value=world_size)): - - # collect data and checker - for seconds in [1, 2, 3]: - state.timestamp 
= Timestamp(total_wct=datetime.timedelta(seconds=seconds)) - health_checker.after_train_batch(state, logger) - - should_alert = dist.get_local_rank() == 0 and world_size > 1 - assert health_checker.metrics[0].alerted == should_alert - - -@pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*') -def test_health_checker_sampling(): - timestamp = Timestamp(total_wct=datetime.timedelta(seconds=0)) - - health_checker = HealthChecker( - sample_freq=1, - window_size=5, - wait=10, - ) - - config = [ - (5, False), # before wait - (11, True), - (11.5, False), # below sample frequency - (12, True), - (20, True), - (11, False), # no time travel - ] - - for seconds, is_sample in config: - timestamp = Timestamp(total_wct=datetime.timedelta(seconds=seconds)) - assert health_checker._sample(timestamp) == is_sample diff --git a/tests/callbacks/test_inference.py b/tests/callbacks/test_inference.py index 960aec9a04..bef07c081c 100644 --- a/tests/callbacks/test_inference.py +++ b/tests/callbacks/test_inference.py @@ -13,9 +13,9 @@ from torch.utils.data import DataLoader from composer.callbacks import ExportForInferenceCallback, export_for_inference -from composer.models import composer_resnet from composer.trainer import Trainer from tests.common.datasets import RandomImageDataset +from tests.common.models import composer_resnet @pytest.mark.parametrize( diff --git a/tests/common/__init__.py b/tests/common/__init__.py index be2a508860..bcc9903e61 100644 --- a/tests/common/__init__.py +++ b/tests/common/__init__.py @@ -12,7 +12,7 @@ from tests.common.markers import device, world_size from tests.common.models import (ConvModel, EmbeddedWeightTiedModel, EmptyModel, SimpleConvModel, SimpleModel, SimpleModelWithDropout, SimpleTransformerClassifier, SimpleTransformerMaskedLM, - SimpleWeightTiedModel, ZeroModel) + SimpleWeightTiedModel, ZeroModel, composer_resnet) from tests.common.state import assert_state_equivalent @@ -46,4 +46,5 @@ def get_module_subclasses(module: types.ModuleType, cls: Type) -> List[Type]: 'ParityDataset', 'SimpleDataset', 'InfiniteClassificationDataset', + 'composer_resnet', ] diff --git a/tests/common/models.py b/tests/common/models.py index a0b66d8929..d8bf2994d4 100644 --- a/tests/common/models.py +++ b/tests/common/models.py @@ -4,15 +4,18 @@ """Contains commonly used models that are shared across the test suite.""" import copy from functools import partial -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import pytest import torch from torchmetrics import Metric, MetricCollection +from torchmetrics.classification import MulticlassAccuracy +from torchvision.models import resnet +from composer.loss import loss_registry from composer.metrics import CrossEntropy, MIoU from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy -from composer.models import ComposerClassifier, HuggingFaceModel +from composer.models import ComposerClassifier, HuggingFaceModel, Initializer if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -440,6 +443,64 @@ def forward(self, batch: Tuple[torch.Tensor, Any]) -> torch.Tensor: return outputs +def composer_resnet( + model_name: str, + num_classes: int = 1000, + weights: Optional[str] = None, + groups: int = 1, + width_per_group: int = 64, + initializers: Optional[List[Initializer]] = None, + loss_name: str = 'soft_cross_entropy', +) -> ComposerClassifier: + """Helper function 
to create a :class:`.ComposerClassifier` with a torchvision ResNet model. + From `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`_ (He et al, 2015). + Args: + model_name (str): Name of the ResNet model instance. One of [``"resnet18"``, ``"resnet34"``, ``"resnet50"``, ``"resnet101"``, + ``"resnet152"``]. + num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``. + weights (str, optional): If provided, pretrained weights can be specified, such as with ``IMAGENET1K_V2``. Default: ``None``. + groups (int, optional): Number of filter groups for the 3x3 convolution layer in bottleneck blocks. Default: ``1``. + width_per_group (int, optional): Initial width for each convolution group. Width doubles after each stage. + Default: ``64``. + initializers (List[Initializer], optional): Initializers for the model. ``None`` for no initialization. + Default: ``None``. + loss_name (str, optional): Loss function to use. E.g. 'soft_cross_entropy' or + 'binary_cross_entropy_with_logits'. Loss function must be in + :mod:`~composer.loss.loss`. Default: ``'soft_cross_entropy'``. + Returns: + ComposerModel: instance of :class:`.ComposerClassifier` with a torchvision ResNet model. + """ + valid_model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] + if model_name not in valid_model_names: + raise ValueError(f'model_name must be one of {valid_model_names} instead of {model_name}.') + + if loss_name not in loss_registry.keys(): + raise ValueError(f'Unrecognized loss function: {loss_name}. Please ensure the ' + 'specified loss function is present in composer.loss.loss.py') + + if initializers is None: + initializers = [] + + # Instantiate model + model_fn = getattr(resnet, model_name) + model = model_fn(weights=weights, num_classes=num_classes, groups=groups, width_per_group=width_per_group) + + # Grab loss function from loss registry + loss_fn = loss_registry[loss_name] + + # Create metrics for train and validation + train_metrics = MulticlassAccuracy(num_classes=num_classes, average='micro') + val_metrics = MetricCollection([CrossEntropy(), MulticlassAccuracy(num_classes=num_classes, average='micro')]) + + # Apply Initializers to model + for initializer in initializers: + initializer = Initializer(initializer) + model.apply(initializer.get_initializer()) + + composer_model = ComposerClassifier(model, train_metrics=train_metrics, val_metrics=val_metrics, loss_fn=loss_fn) + return composer_model + + # Note: These methods are an alternative to the tiny_bert fixtures in fixtures.py. # Fixtures cannot be used natively as parametrized inputs, which we require when # we wish to run a test across multiple models, one of which is a HuggingFace model. 
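The new `composer_resnet` helper gives the test suite a local replacement for the removed `composer.models` ResNet factories. A minimal usage sketch follows; it is not part of the patch, and the dataset size, batch size, and one-batch duration are illustrative choices that mirror how `tests/test_precision.py` (later in this diff) wires the helper into a `Trainer`.

```python
# Hypothetical smoke test using the new test-suite helper; values are illustrative.
from torch.utils.data import DataLoader

from composer.trainer import Trainer
from tests.common import RandomImageDataset, composer_resnet


def test_resnet18_smoke():
    # torchvision resnet18 with the defaults: no pretrained weights, soft_cross_entropy loss
    model = composer_resnet('resnet18')
    trainer = Trainer(
        model=model,
        train_dataloader=DataLoader(RandomImageDataset(size=16), batch_size=8),
        max_duration='1ba',  # a single batch is enough for a smoke test
    )
    trainer.fit()
```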
diff --git a/tests/datasets/test_add_dataset_transform.py b/tests/datasets/test_add_dataset_transform.py deleted file mode 100644 index d7a545a33b..0000000000 --- a/tests/datasets/test_add_dataset_transform.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from torchvision import transforms - -from composer.datasets.synthetic import SyntheticPILDataset -from composer.datasets.utils import add_vision_dataset_transform - -image_size = 32 - - -def generate_synthetic_dataset(data_transforms): - return SyntheticPILDataset(total_dataset_size=1000, - data_shape=[image_size, image_size], - num_classes=2, - transform=data_transforms) - - -def generate_default_transforms(): - return transforms.Compose([transforms.RandomCrop(32), transforms.ToTensor(), transforms.RandomRotation(5)]) - - -def generate_composition_no_tensor(): - return transforms.Compose( - [transforms.RandomCrop(32), - transforms.RandomHorizontalFlip(), - transforms.RandomRotation(5)]) - - -@pytest.mark.parametrize('is_tensor_transform,index', [(False, 1), (True, 2)]) -def test_pre_post_to_tensor_compose(is_tensor_transform, index): - dataset = generate_synthetic_dataset(generate_default_transforms()) - add_vision_dataset_transform(dataset, transforms.RandomAutocontrast(), is_tensor_transform=is_tensor_transform) - assert dataset.transform is not None - assert type(dataset.transform.transforms[index]) == transforms.RandomAutocontrast - - -@pytest.mark.parametrize('is_tensor_transform,index', [(False, 0), (True, 1)]) -def test_pre_post_to_tensor(is_tensor_transform, index): - dataset = generate_synthetic_dataset(transforms.ToTensor()) - add_vision_dataset_transform(dataset, transforms.RandomAutocontrast(), is_tensor_transform=is_tensor_transform) - assert dataset.transform is not None - assert type(dataset.transform.transforms[index]) == transforms.RandomAutocontrast - - -@pytest.mark.parametrize('data_transforms', [(generate_composition_no_tensor()), (transforms.RandomHorizontalFlip())]) -def test_default_to_append(data_transforms): - dataset = generate_synthetic_dataset(data_transforms) - add_vision_dataset_transform(dataset, transforms.RandomAutocontrast()) - assert dataset.transform is not None - assert type(dataset.transform.transforms[-1]) == transforms.RandomAutocontrast - - -def test_add_to_none_transform(): - dataset = generate_synthetic_dataset(None) - add_vision_dataset_transform(dataset, transforms.RandomAutocontrast()) - assert type(dataset.transform) == transforms.RandomAutocontrast diff --git a/tests/datasets/test_cifar.py b/tests/datasets/test_cifar.py deleted file mode 100644 index 6eac6e2ebf..0000000000 --- a/tests/datasets/test_cifar.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from composer.datasets import build_cifar10_dataloader, build_synthetic_cifar10_dataloader - - -@pytest.mark.skip # Download is flaky and test is not critical -@pytest.mark.parametrize('is_train', [False, True]) -@pytest.mark.parametrize('synthetic', [pytest.param(False, marks=pytest.mark.daily), True]) -def test_cifar10_shape_length(is_train, synthetic): - batch_size = 1 - - if synthetic: - dataspec = build_synthetic_cifar10_dataloader(global_batch_size=batch_size, is_train=is_train) - else: - dataspec = build_cifar10_dataloader(datadir='/tmp', global_batch_size=batch_size, is_train=is_train) - - samples = list(dataspec.dataloader) - if is_train: - assert len(samples) 
== 50000 // batch_size - else: - assert len(samples) == 10000 // batch_size - - assert samples[0][0].shape == (1, 3, 32, 32) diff --git a/tests/datasets/test_dataset_utils.py b/tests/datasets/test_dataset_utils.py deleted file mode 100644 index 720edce59b..0000000000 --- a/tests/datasets/test_dataset_utils.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -from typing import List, Tuple - -import numpy as np -import pytest -import torch -from PIL import Image - -from composer.datasets.utils import pil_image_collate - - -@pytest.fixture -def num_samples(): - return 4 - - -@pytest.fixture -def image_size(): - return (16, 16) - - -@pytest.fixture -def pil_image_list(num_samples: int, image_size: Tuple[int, int]): - return [Image.new(mode='RGB', size=image_size, color=(i, i, i)) for i in range(num_samples)] - - -@pytest.fixture -def pil_target_list(num_samples: int, image_size: Tuple[int, int]): - return [Image.new(mode='L', size=image_size, color=i) for i in range(num_samples)] - - -@pytest.fixture -def correct_image_tensor(num_samples: int, image_size: Tuple[int, int]): - return torch.arange(num_samples).expand(3, *image_size, -1).permute(3, 0, 1, 2) - - -@pytest.fixture -def scalar_target_list(num_samples: int): - return np.arange(num_samples) - - -def test_scalar_target_collate(pil_image_list: List[Image.Image], scalar_target_list: np.ndarray, - correct_image_tensor: torch.Tensor): - batch = [(img, target) for img, target in zip(pil_image_list, scalar_target_list)] - image_tensor, target_tensor = pil_image_collate(batch=batch) - - correct_target_tensor = torch.arange(correct_image_tensor.shape[0]) - - assert torch.all(image_tensor == correct_image_tensor) and torch.all(target_tensor == correct_target_tensor) - - -def test_image_target_collate(pil_image_list: List[Image.Image], pil_target_list: List[Image.Image], - correct_image_tensor): - batch = [(img, target) for img, target in zip(pil_image_list, pil_target_list)] - image_tensor, target_tensor = pil_image_collate( - batch=batch) # type: ignore "Image" is incompatible with "ndarray[Unknown, Unknown]" - - assert torch.all(image_tensor == correct_image_tensor) and torch.all(target_tensor == correct_image_tensor[:, 0]) diff --git a/tests/datasets/test_ffcv_utils.py b/tests/datasets/test_ffcv_utils.py deleted file mode 100644 index 3614d73387..0000000000 --- a/tests/datasets/test_ffcv_utils.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import os -import pathlib - -import pytest - -from composer.datasets.ffcv_utils import write_ffcv_dataset -from composer.datasets.synthetic import SyntheticDataLabelType, SyntheticPILDataset - - -@pytest.mark.vision -def test_write_ffcv_dataset(tmp_path: pathlib.Path): - dataset = SyntheticPILDataset(total_dataset_size=1, - num_classes=1, - data_shape=[1, 1, 3], - label_type=SyntheticDataLabelType.CLASSIFICATION_INT, - num_unique_samples_to_create=1) - output_file = str(tmp_path / 'ffcv') - write_ffcv_dataset(dataset, write_path=output_file, num_workers=1) - assert os.path.exists(output_file) diff --git a/tests/datasets/test_mnist.py b/tests/datasets/test_mnist.py deleted file mode 100644 index 7342184d03..0000000000 --- a/tests/datasets/test_mnist.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from composer.datasets import build_mnist_dataloader, build_synthetic_mnist_dataloader - 
- -@pytest.mark.parametrize('is_train', [False, True]) -@pytest.mark.parametrize('synthetic', [pytest.param(False, marks=pytest.mark.daily), True]) -def test_mnist_shape_length(is_train, synthetic): - batch_size = 1 - - if synthetic: - loader = build_synthetic_mnist_dataloader(global_batch_size=batch_size, is_train=is_train) - else: - loader = build_mnist_dataloader(datadir='/tmp', global_batch_size=batch_size, is_train=is_train) - - samples = list(loader) - if is_train: - assert len(samples) == 60000 // batch_size - else: - assert len(samples) == 10000 // batch_size - - assert samples[0][0].shape == (1, 1, 28, 28) diff --git a/tests/datasets/test_segmentation_transforms.py b/tests/datasets/test_segmentation_transforms.py deleted file mode 100644 index 2e4af40126..0000000000 --- a/tests/datasets/test_segmentation_transforms.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import numpy as np -import pytest -from PIL import Image - -from composer.datasets.ade20k import (PadToSize, PhotometricDistoration, RandomCropPair, RandomHFlipPair, - RandomResizePair) - - -@pytest.fixture -def size(): - return 16, 16 - - -@pytest.fixture -def sample_pair(size): - img = Image.new(mode='RGB', size=size) - target = Image.new(mode='L', size=size) - return img, target - - -def test_random_resize(sample_pair, size): - random_resize_transform = RandomResizePair(min_scale=0.5, max_scale=2.0, base_size=size) - - # Test that the resized image remains within bounds for 10 iterations - for _ in range(10): - resized_img, resized_target = random_resize_transform(sample_pair) - assert resized_img.size == resized_target.size - assert resized_img.size[0] >= size[0] // 2 and resized_img.size[0] <= size[0] * 2 - assert resized_img.size[1] >= size[1] // 2 and resized_img.size[1] <= size[1] * 2 - - -@pytest.mark.parametrize('crop_size', [(8, 8), (32, 32)]) -def test_random_crop(sample_pair, crop_size): - random_crop_transform = RandomCropPair(crop_size) - image, target = random_crop_transform(sample_pair) - assert image.size == target.size - final_size = min(crop_size[0], sample_pair[0].height), min(crop_size[1], sample_pair[0].width) - assert final_size == image.size - - -def test_random_hflip(sample_pair): - old_image, old_target = np.array(sample_pair[0]), np.array(sample_pair[1]) - - # Always flip - always_hflip_transform = RandomHFlipPair(probability=1.0) - new_image, new_target = always_hflip_transform(sample_pair) - new_image, new_target = np.array(new_image), np.array(new_target) - assert np.allclose(new_image, old_image[:, ::-1]) and np.allclose(new_target, old_target[:, ::-1]) - - # Never flip - always_hflip_transform = RandomHFlipPair(probability=0.0) - new_image, new_target = always_hflip_transform(sample_pair) - new_image, new_target = np.array(new_image), np.array(new_target) - assert np.allclose(new_image, old_image) and np.allclose(new_target, old_target) - - -@pytest.mark.parametrize('pad_size', [(32, 32), (8, 8)]) -def test_pad_transform(sample_pair, pad_size): - image = sample_pair[0] - pad_transform = PadToSize(size=pad_size, fill=255) - padded_image = pad_transform(image) - final_size = max(pad_size[1], image.width), max(pad_size[0], image.height) - # Check for correct size and number of padding elements - assert padded_image.size == final_size - - # Check appropriate amount of padding is used - padded_image = np.array(padded_image) - initial_area = image.width * image.height - final_area = final_size[0] * final_size[1] - n_channels 
= padded_image.shape[2] - pad_volume = n_channels * (final_area - initial_area) - assert pad_volume == (padded_image == 255).sum() - - -def test_photometric_distortion(sample_pair): - old_image = sample_pair[0] - # Test no transform case - photometric_transform = PhotometricDistoration(brightness=1.0, contrast=1.0, saturation=1.0, hue=0) - new_image = photometric_transform(old_image) - old_image, new_image = np.array(old_image), np.array(new_image) - assert np.allclose(old_image, new_image) diff --git a/tests/datasets/test_synthetic_data.py b/tests/datasets/test_synthetic_data.py deleted file mode 100644 index 6f62aebb9d..0000000000 --- a/tests/datasets/test_synthetic_data.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -import pytest -import torch - -from composer.datasets.synthetic import (SyntheticBatchPairDataset, SyntheticDataLabelType, SyntheticDataType, - SyntheticPILDataset) - - -@pytest.mark.parametrize('data_type', [ - SyntheticDataType.GAUSSIAN, - SyntheticDataType.SEPARABLE, -]) -@pytest.mark.parametrize('label_type', [ - SyntheticDataLabelType.CLASSIFICATION_ONE_HOT, - SyntheticDataLabelType.CLASSIFICATION_INT, -]) -def test_synthetic_batch_pair_creation(data_type: SyntheticDataType, label_type: SyntheticDataLabelType): - if data_type == SyntheticDataType.SEPARABLE: - if label_type != SyntheticDataLabelType.CLASSIFICATION_INT: - pytest.skip('Separable data requires classification int labels') - num_classes = 2 - label_shape = None - else: - num_classes = 10 - label_shape = (1, 10, 12) - - if data_type == SyntheticDataType.GAUSSIAN and label_type == SyntheticDataLabelType.CLASSIFICATION_INT: - pytest.xfail('classification_int is not currently supported with gaussian data') - - dataset_size = 1000 - data_shape = (3, 32, 32) - num_samples_to_create = 10 - dataset = SyntheticBatchPairDataset(total_dataset_size=dataset_size, - data_shape=data_shape, - num_unique_samples_to_create=num_samples_to_create, - data_type=data_type, - label_type=label_type, - num_classes=num_classes, - label_shape=label_shape) - assert len(dataset) == dataset_size - - # verify datapoints are correct - x, y = dataset[0] - assert x.size() == data_shape - if label_type == SyntheticDataLabelType.CLASSIFICATION_INT: - assert isinstance(y.item(), int) - elif label_type == SyntheticDataLabelType.CLASSIFICATION_ONE_HOT: - assert y.size() == (num_classes,) - assert torch.min(y) == 0 - assert torch.max(y) == 1 - - # check that points were allocated in memory after the first call to __getitem__ - assert dataset.input_data is not None - assert dataset.input_target is not None - # check that the correct number of points were allocated in memory - assert dataset.input_data.size()[0] == num_samples_to_create - assert dataset.input_target.size()[0] == num_samples_to_create - - # verify that you can getch points outside the num_samples_to_create range - # (still within the total dataset size range) - x, y = dataset[num_samples_to_create + 1] - assert x is not None - assert y is not None - - -@pytest.mark.parametrize('label_type', [ - SyntheticDataLabelType.CLASSIFICATION_ONE_HOT, - SyntheticDataLabelType.CLASSIFICATION_INT, -]) -@pytest.mark.parametrize('num_classes', [None, 0]) -def test_synthetic_classification_param_validation(label_type: SyntheticDataLabelType, num_classes: Optional[int]): - with pytest.raises(ValueError): - SyntheticBatchPairDataset(total_dataset_size=10, - data_shape=(2, 2), - label_type=label_type, 
- num_classes=num_classes) - - -@pytest.mark.parametrize('data_type', [ - SyntheticDataType.GAUSSIAN, - SyntheticDataType.SEPARABLE, -]) -@pytest.mark.parametrize('label_type', [ - SyntheticDataLabelType.CLASSIFICATION_ONE_HOT, - SyntheticDataLabelType.CLASSIFICATION_INT, -]) -def test_synthetic_image_data_creation(data_type: SyntheticDataType, label_type: SyntheticDataLabelType): - if data_type == SyntheticDataType.SEPARABLE: - if label_type != SyntheticDataLabelType.CLASSIFICATION_INT: - pytest.skip('Seperable data requires classification int labels') - num_classes = 2 - label_shape = None - else: - num_classes = 10 - label_shape = (1, 10, 12) - - if data_type == SyntheticDataType.GAUSSIAN and label_type == SyntheticDataLabelType.CLASSIFICATION_INT: - pytest.xfail('classification_int is not currently supported with gaussian data') - - dataset_size = 1000 - data_shape = (32, 32) - num_samples_to_create = 100 - dataset = SyntheticPILDataset(total_dataset_size=dataset_size, - data_shape=data_shape, - num_unique_samples_to_create=num_samples_to_create, - data_type=data_type, - label_type=label_type, - num_classes=num_classes, - label_shape=label_shape) - assert len(dataset) == dataset_size - - # verify datapoints are correct - x, y = dataset[0] - assert x.size == data_shape - if label_type == SyntheticDataLabelType.CLASSIFICATION_INT: - assert isinstance(y.item(), int) - elif label_type == SyntheticDataLabelType.CLASSIFICATION_ONE_HOT: - assert y.size() == (num_classes,) - assert torch.min(y) == 0 - assert torch.max(y) == 1 - - # check that points were allocated in memory after the first call to __getitem__ - assert dataset._dataset.input_data is not None - assert dataset._dataset.input_target is not None - # check that the correct number of points were allocated in memory - assert dataset._dataset.input_data.shape[0] == num_samples_to_create - assert dataset._dataset.input_target.shape[0] == num_samples_to_create - - # verify that you can getch points outside the num_samples_to_create range - # (still within the total dataset size range) - x, y = dataset[num_samples_to_create + 1] - assert x is not None - assert y is not None diff --git a/tests/models/test_bert.py b/tests/models/test_bert.py deleted file mode 100644 index bee5111e08..0000000000 --- a/tests/models/test_bert.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from torch.utils.data import DataLoader - -from composer.models.bert import create_bert_classification, create_bert_mlm -from composer.trainer import Trainer -from tests.common.datasets import RandomTextClassificationDataset, RandomTextLMDataset - - -def test_bert_mlm_hf_factory(tiny_bert_config, tiny_bert_tokenizer, monkeypatch): - transformers = pytest.importorskip('transformers') - monkeypatch.setattr('transformers.AutoConfig.from_pretrained', lambda x: tiny_bert_config) - bert_composer_model = create_bert_mlm(use_pretrained=False, - pretrained_model_name='dummy', - model_config=None, - tokenizer_name=None, - gradient_checkpointing=False) - - train_dataset = RandomTextLMDataset(size=8, - vocab_size=tiny_bert_tokenizer.vocab_size, - sequence_length=8, - use_keys=True) - collator = transformers.DataCollatorForLanguageModeling(tokenizer=tiny_bert_tokenizer, - mlm=True, - mlm_probability=0.15) - train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=collator) - - trainer = Trainer(model=bert_composer_model, train_dataloader=train_dataloader, max_duration='1ep') - 
trainer.fit() - - assert trainer.state.train_metrics is not None - assert trainer.state.train_metrics['LanguageCrossEntropy'].compute() > 0.0 - - -def test_bert_classification_hf_factory(tiny_bert_config, tiny_bert_tokenizer, monkeypatch): - pytest.importorskip('transformers') - - def config_patch(x, num_labels): - tiny_bert_config.num_labels = num_labels - return tiny_bert_config - - monkeypatch.setattr('transformers.AutoConfig.from_pretrained', config_patch) - bert_composer_model = create_bert_classification(use_pretrained=False, - pretrained_model_name='dummy', - model_config=None, - tokenizer_name=None, - gradient_checkpointing=False, - num_labels=3) - - train_dataset = RandomTextClassificationDataset(size=8, - vocab_size=tiny_bert_tokenizer.vocab_size, - sequence_length=8, - num_classes=3, - use_keys=True) - train_dataloader = DataLoader(train_dataset, batch_size=4) - - trainer = Trainer(model=bert_composer_model, train_dataloader=train_dataloader, max_duration='1ep') - trainer.fit() - - assert trainer.state.train_metrics is not None - assert trainer.state.train_metrics['MulticlassAccuracy'].compute() > 0.0 diff --git a/tests/models/test_efficientnet.py b/tests/models/test_efficientnet.py deleted file mode 100644 index a11dccc87b..0000000000 --- a/tests/models/test_efficientnet.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest -import torch - -from composer.models.efficientnetb0.efficientnets import EfficientNet - - -@pytest.mark.gpu -def test_efficientb0_activate_shape(): - # Running this test on cuda as convolutions are slow on CPU - random_input = torch.rand(2, 3, 224, 224).cuda() - - model = EfficientNet.get_model_from_name( - 'efficientnet-b0', - num_classes=1000, - drop_connect_rate=0.2, - ).cuda() - # Test Stem - out = model.conv_stem(random_input) - out = model.bn1(out) - out = model.act1(out) - assert out.shape == (2, 32, 112, 112) - - # Test each block, shapes found at Table 1 of EfficientNet paper - block_act_shape = [ - (2, 16, 112, 112), - (2, 24, 56, 56), - (2, 24, 56, 56), - (2, 40, 28, 28), - (2, 40, 28, 28), - (2, 80, 14, 14), - (2, 80, 14, 14), - (2, 80, 14, 14), - (2, 112, 14, 14), - (2, 112, 14, 14), - (2, 112, 14, 14), - (2, 192, 7, 7), - (2, 192, 7, 7), - (2, 192, 7, 7), - (2, 192, 7, 7), - (2, 320, 7, 7), - ] - for i, block in enumerate(model.blocks): - out = block(out) - assert out.shape == block_act_shape[i] - - out = model.conv_head(out) - assert out.shape == (2, 1280, 7, 7) diff --git a/tests/models/test_gpt2.py b/tests/models/test_gpt2.py deleted file mode 100644 index 1183353d1b..0000000000 --- a/tests/models/test_gpt2.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from torch.utils.data import DataLoader - -from composer.models.gpt2 import create_gpt2 -from composer.trainer import Trainer -from tests.common.datasets import RandomTextLMDataset - - -def test_gpt2_hf_factory(tiny_gpt2_config, tiny_gpt2_tokenizer, monkeypatch): - transformers = pytest.importorskip('transformers') - monkeypatch.setattr('transformers.AutoConfig.from_pretrained', lambda x: tiny_gpt2_config) - gpt2_composer_model = create_gpt2(use_pretrained=False, - pretrained_model_name='dummy', - model_config=None, - tokenizer_name=None, - gradient_checkpointing=False) - - train_dataset = RandomTextLMDataset(size=8, - vocab_size=tiny_gpt2_tokenizer.vocab_size, - sequence_length=8, - use_keys=True) - collator = 
transformers.DataCollatorForLanguageModeling(tokenizer=tiny_gpt2_tokenizer, mlm=False) - train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=collator) - - trainer = Trainer(model=gpt2_composer_model, train_dataloader=train_dataloader, max_duration='1ep') - trainer.fit() - - assert trainer.state.train_metrics is not None - assert trainer.state.train_metrics['LanguagePerplexity'].compute() > 0.0 diff --git a/tests/models/test_mmdet_model.py b/tests/models/test_mmdet_model.py deleted file mode 100644 index 8ed2246ead..0000000000 --- a/tests/models/test_mmdet_model.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import numpy as np -import pytest -import torch - - -@pytest.fixture -def mmdet_detection_batch(): - batch_size = 2 - num_labels_per_image = 20 - image_size = 224 - return { - 'img_metas': [{ - 'filename': '../../data/coco/train2017/fake_img.jpg', - 'ori_filename': 'fake_image.jpg', - 'img_shape': (image_size, image_size, 3), - 'ori_shape': (image_size, image_size, 3), - 'pad_shape': (image_size, image_size, 3), - 'scale_factor': np.array([1., 1., 1., 1.], dtype=np.float32) - }] * batch_size, - 'img': - torch.zeros(batch_size, 3, image_size, image_size, dtype=torch.float32), - 'gt_bboxes': [torch.zeros(num_labels_per_image, 4, dtype=torch.float32)] * batch_size, - 'gt_labels': [torch.zeros(num_labels_per_image, dtype=torch.int64)] * batch_size - } - - -@pytest.fixture -def mmdet_detection_eval_batch(): - # Eval settings for mmdetection datasets have an extra list around inputs. - batch_size = 2 - num_labels_per_image = 20 - image_size = 224 - return { - 'img_metas': [[{ - 'filename': '../../data/coco/train2017/fake_img.jpg', - 'ori_filename': 'fake_image.jpg', - 'img_shape': (image_size, image_size, 3), - 'ori_shape': (image_size, image_size, 3), - 'pad_shape': (image_size, image_size, 3), - 'scale_factor': np.array([1., 1., 1., 1.], dtype=np.float32), - }] * batch_size], - 'img': [torch.zeros(batch_size, 3, image_size, image_size, dtype=torch.float32)], - 'gt_bboxes': [[torch.zeros(num_labels_per_image, 4, dtype=torch.float32)] * batch_size], - 'gt_labels': [[torch.zeros(num_labels_per_image, dtype=torch.int64)] * batch_size] - } - - -@pytest.fixture -def yolox_config(): - # from https://github.com/open-mmlab/mmdetection/blob/master/configs/yolox/yolox_s_8x8_300e_coco.py - return dict( - type='YOLOX', - input_size=(640, 640), - random_size_range=(15, 25), - random_size_interval=10, - backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5), - neck={ - 'type': 'YOLOXPAFPN', - 'in_channels': [128, 256, 512], - 'out_channels': 128, - 'num_csp_blocks': 1, - }, - bbox_head=dict(type='YOLOXHead', num_classes=80, in_channels=128, feat_channels=128), - train_cfg=dict(assigner={ - 'type': 'SimOTAAssigner', - 'center_radius': 2.5 - }), - # In order to align the source code, the threshold of the val phase is - # 0.01, and the threshold of the test phase is 0.001. 
- test_cfg=dict(score_thr=0.01, nms={ - 'type': 'nms', - 'iou_threshold': 0.65 - })) - - -@pytest.fixture -def faster_rcnn_config(): - # modified from https://github.com/open-mmlab/mmdetection/blob/master/configs/_base_/models/faster_rcnn_r50_fpn.py - return dict( - type='FasterRCNN', - backbone=dict(type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=True, - style='pytorch'), - neck=dict(type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), - rpn_head=dict(type='RPNHead', - in_channels=256, - feat_channels=256, - anchor_generator=dict(type='AnchorGenerator', - scales=[8], - ratios=[0.5, 1.0, 2.0], - strides=[4, 8, 16, 32, 64]), - bbox_coder=dict(type='DeltaXYWHBBoxCoder', - target_means=[.0, .0, .0, .0], - target_stds=[1.0, 1.0, 1.0, 1.0]), - loss_cls={ - 'type': 'CrossEntropyLoss', - 'use_sigmoid': True, - 'loss_weight': 1.0, - }, - loss_bbox={ - 'type': 'L1Loss', - 'loss_weight': 1.0 - }), - roi_head=dict(type='StandardRoIHead', - bbox_roi_extractor={ - 'type': 'SingleRoIExtractor', - 'roi_layer': { - 'type': 'RoIAlign', - 'output_size': 7, - 'sampling_ratio': 0, - }, - 'out_channels': 256, - 'featmap_strides': [4, 8, 16, 32] - }, - bbox_head={ - 'type': 'Shared2FCBBoxHead', - 'in_channels': 256, - 'fc_out_channels': 1024, - 'roi_feat_size': 7, - 'num_classes': 80, - 'bbox_coder': { - 'type': 'DeltaXYWHBBoxCoder', - 'target_means': [0., 0., 0., 0.], - 'target_stds': [0.1, 0.1, 0.2, 0.2] - }, - 'reg_class_agnostic': False, - 'loss_cls': { - 'type': 'CrossEntropyLoss', - 'use_sigmoid': False, - 'loss_weight': 1.0, - }, - 'loss_bbox': { - 'type': 'L1Loss', - 'loss_weight': 1.0 - } - }), - # model training and testing settings - train_cfg=dict(rpn=dict(assigner={ - 'type': 'MaxIoUAssigner', - 'pos_iou_thr': 0.7, - 'neg_iou_thr': 0.3, - 'min_pos_iou': 0.3, - 'match_low_quality': True, - 'ignore_iof_thr': -1 - }, - sampler={ - 'type': 'RandomSampler', - 'num': 256, - 'pos_fraction': 0.5, - 'neg_pos_ub': -1, - 'add_gt_as_proposals': False - }, - allowed_border=-1, - pos_weight=-1, - debug=False), - rpn_proposal=dict(nms_pre=2000, - max_per_img=1000, - nms={ - 'type': 'nms', - 'iou_threshold': 0.7 - }, - min_bbox_size=0), - rcnn=dict(assigner={ - 'type': 'MaxIoUAssigner', - 'pos_iou_thr': 0.5, - 'neg_iou_thr': 0.5, - 'min_pos_iou': 0.5, - 'match_low_quality': False, - 'ignore_iof_thr': -1 - }, - sampler={ - 'type': 'RandomSampler', - 'num': 512, - 'pos_fraction': 0.25, - 'neg_pos_ub': -1, - 'add_gt_as_proposals': True - }, - pos_weight=-1, - debug=False)), - test_cfg=dict( - rpn=dict( - nms_pre=1000, - max_per_img=1000, - nms={ - 'type': 'nms', - 'iou_threshold': 0.7 - }, - min_bbox_size=0, - ), - rcnn={ - 'score_thr': 0.05, - 'nms': { - 'type': 'nms', - 'iou_threshold': 0.5 - }, - 'max_per_img': 100, - } - # soft-nms is also supported for rcnn testing - # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) - )) - - -def test_mmdet_model_forward_yolox(mmdet_detection_batch, yolox_config): - pytest.importorskip('mmdet') - - from mmcv import ConfigDict - from mmdet.models import build_detector - - from composer.models import MMDetModel - - config = ConfigDict(yolox_config) - # non pretrained model to avoid a slow test that downloads the weights. 
- model = build_detector(config) - model.init_weights() - model = MMDetModel(model=model) - out = model(mmdet_detection_batch) - assert list(out.keys()) == ['loss_cls', 'loss_bbox', 'loss_obj'] - - -def test_mmdet_model_eval_forward_yolox(mmdet_detection_eval_batch, yolox_config): - pytest.importorskip('mmdet') - - from mmcv import ConfigDict - from mmdet.models import build_detector - - from composer.models import MMDetModel - - config = ConfigDict(yolox_config) - # non pretrained model to avoid a slow test that downloads the weights. - model = build_detector(config) - model.init_weights() - model = MMDetModel(model=model) - out = model.eval_forward(mmdet_detection_eval_batch) - assert len(out) == mmdet_detection_eval_batch['img'][0].shape[0] # batch size - assert list(out[0].keys()) == ['labels', 'boxes', 'scores'] - - -def test_mmdet_model_forward_faster_rcnn(mmdet_detection_batch, faster_rcnn_config): - pytest.importorskip('mmdet') - - from mmcv import ConfigDict - from mmdet.models import build_detector - - from composer.models import MMDetModel - - config = ConfigDict(faster_rcnn_config) - - # non pretrained model to avoid a slow test that downloads the weights. - model = build_detector(config) - model.init_weights() - model = MMDetModel(model=model) - out = model(mmdet_detection_batch) - assert list(out.keys()) == ['loss_rpn_cls', 'loss_rpn_bbox', 'loss_cls', 'acc', 'loss_bbox'] diff --git a/tests/test_precision.py b/tests/test_precision.py index 46571529c6..2b85d3d7d2 100644 --- a/tests/test_precision.py +++ b/tests/test_precision.py @@ -9,8 +9,7 @@ from composer import Trainer from composer.core import Precision, get_precision_context -from composer.models import composer_resnet_cifar -from tests.common import RandomImageDataset +from tests.common import RandomImageDataset, composer_resnet try: import transformer_engine.pytorch as te @@ -22,7 +21,7 @@ def get_trainer(precision: Precision, precision_config: Optional[Dict[str, Any]] = None) -> Trainer: return Trainer( - model=composer_resnet_cifar('resnet_9'), + model=composer_resnet('resnet18'), train_dataloader=DataLoader( dataset=RandomImageDataset(size=1024), batch_size=512, @@ -78,7 +77,7 @@ def predict_and_measure_memory(precision) -> int: def test_train_precision_memory(precision: Precision): memory_fp32 = fit_and_measure_memory(Precision.FP32) memory_half = fit_and_measure_memory(precision) - assert memory_half < 0.7 * memory_fp32 + assert memory_half < 0.85 * memory_fp32 @pytest.mark.gpu diff --git a/tests/trainer/test_ddp.py b/tests/trainer/test_ddp.py index 41d240286e..d9733c4285 100644 --- a/tests/trainer/test_ddp.py +++ b/tests/trainer/test_ddp.py @@ -12,11 +12,10 @@ import composer.core.types as types from composer import Callback, Event from composer.core import State -from composer.datasets.synthetic import SyntheticBatchPairDataset from composer.loggers import Logger from composer.trainer.trainer import Trainer from composer.utils import dist -from tests.common import SimpleModel +from tests.common import RandomClassificationDataset, SimpleModel def get_file_path(*, is_train: bool, tmp_path: pathlib.Path) -> str: @@ -40,8 +39,8 @@ class TrackedDataset(types.Dataset): atomic file writes, it is slow and should not be used in any performance measurements. 
""" - def __init__(self, is_train: bool, synthetic_dataset: SyntheticBatchPairDataset, tmp_path: pathlib.Path): - self.dataset = synthetic_dataset + def __init__(self, is_train: bool, dataset, tmp_path: pathlib.Path): + self.dataset = dataset self.is_train = is_train self.tmp_path = tmp_path self.counter = 0 @@ -110,19 +109,11 @@ def test_ddp(device: str, world_size: int, deepspeed: bool, fsdp: bool, tmp_path and 2) each ddp process is indeed getting different data. """ - model = SimpleModel(num_classes=100) - train_batch_size = 10 train_subset_num_batches = 3 - synthetic_dataset = SyntheticBatchPairDataset( - num_unique_samples_to_create=train_batch_size * train_subset_num_batches, - total_dataset_size=10_000, - data_shape=(model.num_features, 5, 5), - num_classes=model.num_classes, - ) train_dataset = TrackedDataset( - synthetic_dataset=synthetic_dataset, + dataset=RandomClassificationDataset(size=train_batch_size * train_subset_num_batches,), is_train=True, tmp_path=tmp_path, ) @@ -144,14 +135,8 @@ def test_ddp(device: str, world_size: int, deepspeed: bool, fsdp: bool, tmp_path eval_batch_size = 10 eval_subset_num_batches = 3 - eval_dataset = SyntheticBatchPairDataset( - num_unique_samples_to_create=eval_batch_size * eval_subset_num_batches, - total_dataset_size=10_000, - data_shape=(model.num_features, 5, 5), - num_classes=model.num_classes, - ) eval_dataset = TrackedDataset( - synthetic_dataset=eval_dataset, + dataset=RandomClassificationDataset(size=eval_batch_size * eval_subset_num_batches,), is_train=False, tmp_path=tmp_path, ) @@ -179,17 +164,19 @@ def test_ddp(device: str, world_size: int, deepspeed: bool, fsdp: bool, tmp_path } max_epochs = 2 - trainer = Trainer(model=model, - train_dataloader=train_dataloader, - eval_dataloader=eval_dataloader, - device=device, - max_duration=f'{max_epochs}ep', - eval_interval='1ep', - eval_subset_num_batches=eval_subset_num_batches, - train_subset_num_batches=train_subset_num_batches, - deepspeed_config={} if deepspeed else None, - fsdp_config=fsdp_config, - callbacks=[CheckBatch0(tmp_path)]) + trainer = Trainer( + model=SimpleModel(num_classes=100), + train_dataloader=train_dataloader, + eval_dataloader=eval_dataloader, + device=device, + max_duration=f'{max_epochs}ep', + eval_interval='1ep', + eval_subset_num_batches=eval_subset_num_batches, + train_subset_num_batches=train_subset_num_batches, + deepspeed_config={} if deepspeed else None, + fsdp_config=fsdp_config, + callbacks=[CheckBatch0(tmp_path)], + ) trainer.fit() diff --git a/tests/utils/test_inference.py b/tests/utils/test_inference.py index a4e3bf90b1..789ad3c136 100644 --- a/tests/utils/test_inference.py +++ b/tests/utils/test_inference.py @@ -20,7 +20,6 @@ from composer.functional import apply_gated_linear_units from composer.loggers import InMemoryLogger, Logger from composer.loggers.logger_destination import LoggerDestination -from composer.models import composer_resnet from composer.trainer.dist_strategy import prepare_ddp_module from composer.trainer.trainer import Trainer from composer.utils import dist, export_with_logger, inference @@ -28,7 +27,7 @@ from tests.common import SimpleTransformerClassifier, device from tests.common.datasets import (RandomImageDataset, dummy_text_classification_dataloader, dummy_tiny_bert_lm_batch, dummy_transformer_classifier_batch) -from tests.common.models import configure_tiny_bert_hf_model +from tests.common.models import composer_resnet, configure_tiny_bert_hf_model class MockFileUploader(LoggerDestination):