From 6569fcc03450c2360b50d772bf9b18ec3487fcf4 Mon Sep 17 00:00:00 2001 From: Thomas Polasek Date: Mon, 18 Nov 2024 13:05:18 -0800 Subject: [PATCH] Convert directory fbcode/torchmultimodal to use the Ruff Formatter (#531) Summary: Pull Request resolved: https://github.com/facebookresearch/multimodal/pull/531 Converts the directory specified to use the Ruff formatter in pyfmt ruff_dog If this diff causes merge conflicts when rebasing, please run `hg status -n -0 --change . -I '**/*.{py,pyi}' | xargs -0 arc pyfmt` on your diff, and amend any changes before rebasing onto latest. That should help reduce or eliminate any merge conflicts. allow-large-files Reviewed By: amyreese Differential Revision: D66105544 fbshipit-source-id: 0994dff620e2b718bdfbf4aafd472d468a6f6389 --- examples/albef/data/retrieval_datamodule.py | 2 +- examples/albef/data/vqa_datamodules.py | 4 ++-- examples/mdetr/data/dataset.py | 1 - examples/mdetr/data/flickr_eval.py | 4 ++-- examples/mdetr/data/postprocessors.py | 1 - examples/mdetr/loss.py | 6 ++---- examples/mdetr/matcher.py | 1 - examples/mdetr/optimizer.py | 1 + examples/mdetr/phrase_grounding.py | 1 - examples/mdetr/utils/args_parse.py | 1 - examples/mdetr/utils/metrics.py | 1 + examples/mdetr/vqa_eval.py | 1 - examples/mugen/data/mugen_dataset.py | 4 +--- examples/mugen/retrieval/definitions.py | 4 +--- examples/omnivore/data/data_builder.py | 4 +--- examples/omnivore/data/presets.py | 2 -- examples/omnivore/train.py | 8 ++++---- tests/models/test_late_fusion.py | 2 -- tests/models/test_video_gpt.py | 1 - torchmultimodal/models/clip/image_encoder.py | 1 - torchmultimodal/models/clip/model.py | 1 - torchmultimodal/models/coca/coca_model.py | 1 - torchmultimodal/models/flava/image_encoder.py | 1 - torchmultimodal/models/flava/model.py | 1 - torchmultimodal/models/flava/text_encoder.py | 1 - torchmultimodal/models/flava/transformer.py | 1 - torchmultimodal/models/mdetr/model.py | 2 -- torchmultimodal/models/mdetr/transformer.py | 1 - torchmultimodal/modules/encoders/vision_transformer.py | 1 - torchmultimodal/modules/fusions/deepset_fusion.py | 1 - torchmultimodal/modules/layers/transformer.py | 6 +----- torchmultimodal/modules/losses/blip2_losses.py | 1 + .../modules/losses/contrastive_loss_with_temperature.py | 1 - torchmultimodal/modules/losses/flava.py | 1 - torchmultimodal/modules/losses/mdetr.py | 2 +- torchmultimodal/transforms/clip_transform.py | 2 -- torchmultimodal/transforms/flava_transform.py | 1 - torchmultimodal/transforms/mae_transform.py | 1 - 38 files changed, 19 insertions(+), 57 deletions(-) diff --git a/examples/albef/data/retrieval_datamodule.py b/examples/albef/data/retrieval_datamodule.py index ded29c53d..1db090a99 100644 --- a/examples/albef/data/retrieval_datamodule.py +++ b/examples/albef/data/retrieval_datamodule.py @@ -161,7 +161,7 @@ def text_dataloader( def retrieval_train_collate_fn( - batch: List[Tuple[Tensor, Tensor, int]] + batch: List[Tuple[Tensor, Tensor, int]], ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: image_list = [] text_list = [] diff --git a/examples/albef/data/vqa_datamodules.py b/examples/albef/data/vqa_datamodules.py index 27164ce95..f8461ab5d 100644 --- a/examples/albef/data/vqa_datamodules.py +++ b/examples/albef/data/vqa_datamodules.py @@ -157,7 +157,7 @@ def test_dataloader( def vqa_train_collate_fn( - batch: List[Tuple[Tensor, Tensor, List[Tensor], List[float]]] + batch: List[Tuple[Tensor, Tensor, List[Tensor], List[float]]], ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, List[int]]: image_list = [] 
question_list = [] @@ -188,7 +188,7 @@ def vqa_train_collate_fn( def vqa_test_collate_fn( - batch: List[Tuple[Tensor, Tensor, int]] + batch: List[Tuple[Tensor, Tensor, int]], ) -> Tuple[Tensor, Tensor, Tensor, List[int]]: image_list, question_list, question_ids = [], [], [] for image, question, question_id in batch: diff --git a/examples/mdetr/data/dataset.py b/examples/mdetr/data/dataset.py index 7fee64e9c..597b4d85f 100644 --- a/examples/mdetr/data/dataset.py +++ b/examples/mdetr/data/dataset.py @@ -240,7 +240,6 @@ def collate_fn(tokenizer, batch): def build_flickr(image_set, tokenizer, transform, args): - img_dir = Path(args.flickr_img_path) / f"{image_set}" if args.GT_type == "merged": diff --git a/examples/mdetr/data/flickr_eval.py b/examples/mdetr/data/flickr_eval.py index 59d3d35e5..73d5d091d 100644 --- a/examples/mdetr/data/flickr_eval.py +++ b/examples/mdetr/data/flickr_eval.py @@ -4,7 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -""" Evaluator for Flickr30k """ +"""Evaluator for Flickr30k""" + import xml.etree.ElementTree as Et from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Union @@ -184,7 +185,6 @@ def __init__( iou_thresh: float = 0.5, merge_boxes: bool = False, ): - assert subset in ["train", "test", "val"], f"Wrong flickr subset {subset}" self.topk = topk diff --git a/examples/mdetr/data/postprocessors.py b/examples/mdetr/data/postprocessors.py index 477c93dcc..c55786184 100644 --- a/examples/mdetr/data/postprocessors.py +++ b/examples/mdetr/data/postprocessors.py @@ -49,7 +49,6 @@ def __call__( positive_map: Tensor, phrases_per_sample: List[int], ) -> List[List[List[float]]]: - assert output_logits.size(0) == target_sizes.size( 0 ), "Logits and target sizes should both have first dim = batch_size" diff --git a/examples/mdetr/loss.py b/examples/mdetr/loss.py index 63834b6b3..19212c2e8 100644 --- a/examples/mdetr/loss.py +++ b/examples/mdetr/loss.py @@ -67,7 +67,7 @@ def contrastive_alignment_loss( neg_term = negative_logits.logsumexp(2) nb_pos = positive_map.sum(2) + 1e-6 box_to_token_loss = ( - ((pos_term / nb_pos + neg_term)).masked_fill(~boxes_with_pos, 0).sum() + (pos_term / nb_pos + neg_term).masked_fill(~boxes_with_pos, 0).sum() ) # Calculate the contrastive loss for all tokens @@ -76,7 +76,7 @@ def contrastive_alignment_loss( neg_term = negative_logits.logsumexp(1) nb_pos = positive_map.sum(1) + 1e-6 tokens_to_boxes_loss = ( - ((pos_term / nb_pos + neg_term)).masked_fill(~tokens_with_pos, 0).sum() + (pos_term / nb_pos + neg_term).masked_fill(~tokens_with_pos, 0).sum() ) tot_loss = (box_to_token_loss + tokens_to_boxes_loss) / 2 @@ -227,7 +227,6 @@ def forward( vqa_masks: Optional[Dict[str, Tensor]] = None, weight_dict: Optional[Dict[str, float]] = None, ) -> Dict[str, Tensor]: - target_boxes = [t["boxes"] for t in targets] target_tokens = [t["tokens_positive"] for t in targets] n_target_boxes = [len(t) for t in target_boxes] @@ -292,7 +291,6 @@ def build_mdetr_loss( no_object_weight: float = 0.1, temperature: Optional[float] = None, ) -> MDETRLoss: - soft_token_loss = partial( soft_token_prediction_loss, no_object_weight=no_object_weight ) diff --git a/examples/mdetr/matcher.py b/examples/mdetr/matcher.py index 3837a8548..b843d3244 100644 --- a/examples/mdetr/matcher.py +++ b/examples/mdetr/matcher.py @@ -73,7 +73,6 @@ def forward( target_boxes_per_sample: List[Tensor], positive_map: Tensor, ) -> List[Tuple[Tensor, Tensor]]: - bs, num_queries = 
pred_logits.shape[:2] target_boxes = torch.cat(target_boxes_per_sample) # We flatten to compute the cost matrices in a batch diff --git a/examples/mdetr/optimizer.py b/examples/mdetr/optimizer.py index 056974ea0..2604961d0 100644 --- a/examples/mdetr/optimizer.py +++ b/examples/mdetr/optimizer.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. """Collections of utilities related to optimization.""" + from bisect import bisect_right import torch diff --git a/examples/mdetr/phrase_grounding.py b/examples/mdetr/phrase_grounding.py index 0b8ac0d7f..75567f4c8 100644 --- a/examples/mdetr/phrase_grounding.py +++ b/examples/mdetr/phrase_grounding.py @@ -31,7 +31,6 @@ def evaluate( evaluator, device, ): - model.eval() metric_logger = MetricLogger(delimiter=" ") diff --git a/examples/mdetr/utils/args_parse.py b/examples/mdetr/utils/args_parse.py index 6af64c070..bbe80ae28 100644 --- a/examples/mdetr/utils/args_parse.py +++ b/examples/mdetr/utils/args_parse.py @@ -8,7 +8,6 @@ def get_args_parser(): - parser = argparse.ArgumentParser("MDETR", add_help=False) parser.add_argument("--dataset_config", default=None, required=True) # Transformer diff --git a/examples/mdetr/utils/metrics.py b/examples/mdetr/utils/metrics.py index 4fb3b7e21..77263045c 100644 --- a/examples/mdetr/utils/metrics.py +++ b/examples/mdetr/utils/metrics.py @@ -7,6 +7,7 @@ """ Various utilities related to track and report metrics """ + import datetime import time from collections import defaultdict, deque diff --git a/examples/mdetr/vqa_eval.py b/examples/mdetr/vqa_eval.py index 0bc35d55c..7424e7790 100644 --- a/examples/mdetr/vqa_eval.py +++ b/examples/mdetr/vqa_eval.py @@ -32,7 +32,6 @@ def evaluate( device, weight_dict, ): - model.eval() metric_logger = MetricLogger(delimiter=" ") header = "Test:" diff --git a/examples/mugen/data/mugen_dataset.py b/examples/mugen/data/mugen_dataset.py index 932ab295c..7b138e22d 100644 --- a/examples/mugen/data/mugen_dataset.py +++ b/examples/mugen/data/mugen_dataset.py @@ -38,9 +38,7 @@ class MUGENDatasetArgs: True # render smap for mugen (and shield) as bounding boxes ) bbox_smap_for_monsters: bool = True # render smap for monsters as bounding boxes - use_manual_annotation: bool = ( - False # if True will only use videos with manual annotation and skip those without - ) + use_manual_annotation: bool = False # if True will only use videos with manual annotation and skip those without use_auto_annotation: bool = ( True # if True will only use videos with auto annotation and skip those without ) diff --git a/examples/mugen/retrieval/definitions.py b/examples/mugen/retrieval/definitions.py index 1232e82fc..569e48f0d 100644 --- a/examples/mugen/retrieval/definitions.py +++ b/examples/mugen/retrieval/definitions.py @@ -81,9 +81,7 @@ class EvaluationArgs: datamodule_args: DataModuleArgs = DataModuleArgs() lightningmodule_args: LightningModuleArgs = LightningModuleArgs() videoclip_args: VideoCLIPArgs = VideoCLIPArgs() - checkpoint_path: str = ( - "https://pytorch.s3.amazonaws.com/models/multimodal/mugen/videoclip_lightning_mugen.pt" - ) + checkpoint_path: str = "https://pytorch.s3.amazonaws.com/models/multimodal/mugen/videoclip_lightning_mugen.pt" accelerator: str = "auto" diff --git a/examples/omnivore/data/data_builder.py b/examples/omnivore/data/data_builder.py index fedfc2252..afbe2d8aa 100644 --- a/examples/omnivore/data/data_builder.py +++ b/examples/omnivore/data/data_builder.py @@ -76,9 +76,7 @@ def construct_data_loader(dataset, sampler, num_workers, mode, args, 
drop_last=F mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms) # Since not all dataset return tuple of same length, we take the # first two elements for mixupcutmix during training - collate_fn = lambda batch: mixupcutmix( - *(default_collate(batch)[:2]) - ) # noqa: E731 + collate_fn = lambda batch: mixupcutmix(*(default_collate(batch)[:2])) # noqa: E731 data_loader = torch.utils.data.DataLoader( dataset, diff --git a/examples/omnivore/data/presets.py b/examples/omnivore/data/presets.py index ccf7b7e54..4bf9fb982 100644 --- a/examples/omnivore/data/presets.py +++ b/examples/omnivore/data/presets.py @@ -58,7 +58,6 @@ def __init__( std=(0.229, 0.224, 0.225), interpolation=InterpolationMode.BICUBIC, ): - self.transforms = T.Compose( [ T.Resize(resize_size, interpolation=interpolation), @@ -187,7 +186,6 @@ def __init__( std=(0.229, 0.224, 0.225, 0.0295), interpolation=InterpolationMode.BILINEAR, ): - self.transforms = T.Compose( [ CT.DepthNorm(max_depth=max_depth, clamp_max_before_scale=True), diff --git a/examples/omnivore/train.py b/examples/omnivore/train.py index cbd7f56dc..785201405 100644 --- a/examples/omnivore/train.py +++ b/examples/omnivore/train.py @@ -38,10 +38,10 @@ def _chunk_forward_backward( args, scaler=None, ): - - chunk_image, chunk_target = image[chunk_start:chunk_end, ...].to(device), target[ - chunk_start:chunk_end, ... - ].to(device) + chunk_image, chunk_target = ( + image[chunk_start:chunk_end, ...].to(device), + target[chunk_start:chunk_end, ...].to(device), + ) with torch.cuda.amp.autocast(enabled=scaler is not None): chunk_output = model(chunk_image, input_type) diff --git a/tests/models/test_late_fusion.py b/tests/models/test_late_fusion.py index 516da1207..e7316b722 100644 --- a/tests/models/test_late_fusion.py +++ b/tests/models/test_late_fusion.py @@ -81,7 +81,6 @@ def modalities_3(self): } def test_forward(self, late_fusion, modalities_1): - actual = late_fusion(modalities_1) expected = torch.Tensor( [[1, 0, 0.25, 0.75, 3, 1, 0.8, 0.9], [0, 1, 0.6, 0.4, 0.7, 2, 0.6, 0]] @@ -90,7 +89,6 @@ def test_forward(self, late_fusion, modalities_1): assert_expected(actual, expected) def test_script(self, late_fusion, modalities_2): - scripted_late_fusion = torch.jit.script(late_fusion) actual = scripted_late_fusion(modalities_2) expected = torch.Tensor([[7, 0, 0.65, 8, 9, 0.8], [88, 5, 0.3, 0.74, 2, 0]]) diff --git a/tests/models/test_video_gpt.py b/tests/models/test_video_gpt.py index 091655a0c..245ae9ee2 100644 --- a/tests/models/test_video_gpt.py +++ b/tests/models/test_video_gpt.py @@ -47,7 +47,6 @@ def set_seed(): class TestVideoGPT: - _model_name = "video_gpt" @pytest.fixture diff --git a/torchmultimodal/models/clip/image_encoder.py b/torchmultimodal/models/clip/image_encoder.py index deb560db4..bfc4af954 100644 --- a/torchmultimodal/models/clip/image_encoder.py +++ b/torchmultimodal/models/clip/image_encoder.py @@ -80,7 +80,6 @@ def __init__( self.projection = nn.Parameter(scale * torch.randn(width, embedding_dim)) def forward(self, x: Tensor) -> Tensor: - if x.size(2) != self.image_size or x.size(3) != self.image_size: raise ValueError( f"Expected input with width and height as {self.image_size}, found {x.size(2)} by {x.size(3)} " diff --git a/torchmultimodal/models/clip/model.py b/torchmultimodal/models/clip/model.py index f2718755f..64922fa0b 100644 --- a/torchmultimodal/models/clip/model.py +++ b/torchmultimodal/models/clip/model.py @@ -68,7 +68,6 @@ def forward( features_a: torch.Tensor, features_b: torch.Tensor, ) -> CLIPOutput: - embeddings_a 
= self.encoder_a(features_a) embeddings_b = self.encoder_b(features_b) embeddings_a = F.normalize(embeddings_a) diff --git a/torchmultimodal/models/coca/coca_model.py b/torchmultimodal/models/coca/coca_model.py index f1e42ae9a..0566f3702 100644 --- a/torchmultimodal/models/coca/coca_model.py +++ b/torchmultimodal/models/coca/coca_model.py @@ -493,7 +493,6 @@ def __init__( def forward( self, images: Tensor, texts: Tensor, text_padding_mask: Optional[Tensor] = None ) -> Dict[str, Tensor]: - model_out = self.model(images, texts, text_padding_mask) mm_out = model_out.multimodal_embeddings diff --git a/torchmultimodal/models/flava/image_encoder.py b/torchmultimodal/models/flava/image_encoder.py index b2c9c6190..2e9371655 100644 --- a/torchmultimodal/models/flava/image_encoder.py +++ b/torchmultimodal/models/flava/image_encoder.py @@ -247,7 +247,6 @@ def flava_image_encoder( patch_size: int = 16, num_channels: int = 3, ) -> ImageTransformer: - embeddings = ImageEmbeddings( image_size=image_size, patch_size=patch_size, diff --git a/torchmultimodal/models/flava/model.py b/torchmultimodal/models/flava/model.py index 46bc28bd0..50e840947 100644 --- a/torchmultimodal/models/flava/model.py +++ b/torchmultimodal/models/flava/model.py @@ -555,7 +555,6 @@ def flava_model_for_classification( pretrained: bool = True, **flava_model_kwargs: Any, ) -> FLAVAForClassification: - classifier = MLP( in_dim=classifier_in_dim, out_dim=num_classes, diff --git a/torchmultimodal/models/flava/text_encoder.py b/torchmultimodal/models/flava/text_encoder.py index b5e0d2727..17f7cb1f9 100644 --- a/torchmultimodal/models/flava/text_encoder.py +++ b/torchmultimodal/models/flava/text_encoder.py @@ -35,7 +35,6 @@ def flava_text_encoder( # TextEncoder params initializer_range: float = 0.02, ) -> BERTTextEncoder: - embeddings = BERTTextEmbeddings( hidden_size=hidden_size, vocab_size=vocab_size, diff --git a/torchmultimodal/models/flava/transformer.py b/torchmultimodal/models/flava/transformer.py index 9092913f4..363db750c 100644 --- a/torchmultimodal/models/flava/transformer.py +++ b/torchmultimodal/models/flava/transformer.py @@ -261,7 +261,6 @@ def forward( return_attn_weights: bool = False, return_hidden_states: bool = False, ) -> TransformerOutput: - all_hidden_states = [] if return_hidden_states else None all_self_attentions = [] if return_attn_weights else None diff --git a/torchmultimodal/models/mdetr/model.py b/torchmultimodal/models/mdetr/model.py index d4d2b17df..4eaf32ddc 100644 --- a/torchmultimodal/models/mdetr/model.py +++ b/torchmultimodal/models/mdetr/model.py @@ -123,7 +123,6 @@ def _pad_text( return padded_text, mask def forward(self, images: List[Tensor], text: List[Tensor]) -> MDETRModelOutput: - images, image_mask = self._pad_images(images) text, text_attention_mask = self._pad_text(text) encoded_text = self.text_encoder(text, text_attention_mask) @@ -392,7 +391,6 @@ def forward( images: List[Tensor], text: List[Tensor], ) -> MDETRPhraseGroundingOutput: - model_output = self.model(images, text) final_hidden_state = model_output.transformer_output.decoder_hidden_states[-1] diff --git a/torchmultimodal/models/mdetr/transformer.py b/torchmultimodal/models/mdetr/transformer.py index e5551261a..e166244bb 100644 --- a/torchmultimodal/models/mdetr/transformer.py +++ b/torchmultimodal/models/mdetr/transformer.py @@ -166,7 +166,6 @@ def forward( src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ) -> Tensor: - output = src for layer in self.layers: diff --git 
a/torchmultimodal/modules/encoders/vision_transformer.py b/torchmultimodal/modules/encoders/vision_transformer.py index 4e170ee6e..eb58da917 100644 --- a/torchmultimodal/modules/encoders/vision_transformer.py +++ b/torchmultimodal/modules/encoders/vision_transformer.py @@ -60,7 +60,6 @@ def forward( image_patches_mask: Optional[Tensor] = None, attention_mask: Optional[Tensor] = None, ) -> TransformerOutput: - embedding_output = self.embeddings( images, image_patches_mask=image_patches_mask ).embeddings diff --git a/torchmultimodal/modules/fusions/deepset_fusion.py b/torchmultimodal/modules/fusions/deepset_fusion.py index ae8eda077..a1f3415e9 100644 --- a/torchmultimodal/modules/fusions/deepset_fusion.py +++ b/torchmultimodal/modules/fusions/deepset_fusion.py @@ -84,7 +84,6 @@ def __init__( self.mlp = mlp def forward(self, embeddings: Dict[str, Tensor]) -> Tensor: - projections = {} for channel, projection in self.projections.items(): projections[channel] = projection(embeddings[channel]) diff --git a/torchmultimodal/modules/layers/transformer.py b/torchmultimodal/modules/layers/transformer.py index 530a3b0e0..ec4e5d762 100644 --- a/torchmultimodal/modules/layers/transformer.py +++ b/torchmultimodal/modules/layers/transformer.py @@ -366,9 +366,7 @@ def _cross_attention_block( encoder_hidden_states: Tensor, cross_attention_mask: Optional[Tensor] = None, ) -> Tensor: - assert ( - self.cross_attention is not None - ), """ + assert self.cross_attention is not None, """ Cannot use cross-attention unless self.cross_attention and self.cross_attention_dropout are defined. """ @@ -399,7 +397,6 @@ def _forward_prenorm( past_key_value: Optional[Tuple[Tensor, Tensor]] = None, use_cache: bool = False, ) -> Tuple[Tensor, Optional[Tuple[Tensor, Tensor]]]: - # Self-attention self_attn_input = self.attention_layernorm(hidden_states) attn_output, present_key_value = self._self_attention_block( @@ -440,7 +437,6 @@ def _forward_postnorm( past_key_value: Optional[Tuple[Tensor, Tensor]] = None, use_cache: bool = False, ) -> Tuple[Tensor, Optional[Tuple[Tensor, Tensor]]]: - # Self-attention attn_output, present_key_value = self._self_attention_block( hidden_states, diff --git a/torchmultimodal/modules/losses/blip2_losses.py b/torchmultimodal/modules/losses/blip2_losses.py index 6e66d85d3..108119d7f 100644 --- a/torchmultimodal/modules/losses/blip2_losses.py +++ b/torchmultimodal/modules/losses/blip2_losses.py @@ -21,6 +21,7 @@ @dataclass class Blip2Stage1Losses(OrderedDict): "Blip-2 stage 1 losses" + image_text_contrastive_loss: torch.Tensor image_text_matching_loss: torch.Tensor image_captioning_loss: torch.Tensor diff --git a/torchmultimodal/modules/losses/contrastive_loss_with_temperature.py b/torchmultimodal/modules/losses/contrastive_loss_with_temperature.py index 155445a0d..88b0f7ee7 100644 --- a/torchmultimodal/modules/losses/contrastive_loss_with_temperature.py +++ b/torchmultimodal/modules/losses/contrastive_loss_with_temperature.py @@ -190,7 +190,6 @@ def forward( cross_entropy_kwargs: Optional[Dict[str, Any]] = None, mask: Optional[Tensor] = None, ) -> Tensor: - self.logit_scale.data.clamp_(self.logit_scale_min, self.logit_scale_max) return contrastive_loss_with_temperature( embeddings_a=embeddings_a, diff --git a/torchmultimodal/modules/losses/flava.py b/torchmultimodal/modules/losses/flava.py index b3c5a5367..8580e9aca 100644 --- a/torchmultimodal/modules/losses/flava.py +++ b/torchmultimodal/modules/losses/flava.py @@ -264,7 +264,6 @@ def forward( text_sequence: Tensor, mask: Tensor, ) -> 
FLAVAGlobalContrastiveLossOutput: - text_embedding = nn.functional.normalize(text_sequence, dim=-1) image_embedding = nn.functional.normalize( image_sequence, diff --git a/torchmultimodal/modules/losses/mdetr.py b/torchmultimodal/modules/losses/mdetr.py index f07fd1a1f..8b00a7458 100644 --- a/torchmultimodal/modules/losses/mdetr.py +++ b/torchmultimodal/modules/losses/mdetr.py @@ -13,7 +13,7 @@ def _get_src_permutation_idx( - indices: List[Tuple[Tensor, Tensor]] + indices: List[Tuple[Tensor, Tensor]], ) -> Tuple[Tensor, Tensor]: """ Given a list of matched (src, tgt) indices, concatenate the src indices and diff --git a/torchmultimodal/transforms/clip_transform.py b/torchmultimodal/transforms/clip_transform.py index 0f1c49764..1b0f41724 100644 --- a/torchmultimodal/transforms/clip_transform.py +++ b/torchmultimodal/transforms/clip_transform.py @@ -267,7 +267,6 @@ def __init__( text_bpe_merges_path: str = CLIP_DEFAULT_VOCAB_BPE_PATH, num_merges: Optional[int] = 48894, ) -> None: - super().__init__() local_merges_path = _PATH_MANAGER.get_local_path(text_bpe_merges_path) tokenizer = CLIPBPETransform( @@ -399,7 +398,6 @@ def __init__( text_bpe_merges_path: str = CLIP_DEFAULT_VOCAB_BPE_PATH, num_merges: Optional[int] = 48894, ) -> None: - super().__init__() self.image_transform = CLIPImageTransform( image_size, image_interpolation, image_mean, image_std, is_train diff --git a/torchmultimodal/transforms/flava_transform.py b/torchmultimodal/transforms/flava_transform.py index 584121f87..b05bedf64 100644 --- a/torchmultimodal/transforms/flava_transform.py +++ b/torchmultimodal/transforms/flava_transform.py @@ -116,7 +116,6 @@ def __init__( second_interpolation: transforms.InterpolationMode = transforms.InterpolationMode.LANCZOS, **kwargs: Any, ) -> None: - if not isinstance(size, (list, tuple)): size = (size, size) diff --git a/torchmultimodal/transforms/mae_transform.py b/torchmultimodal/transforms/mae_transform.py index b1c83a4f8..2f75624b6 100644 --- a/torchmultimodal/transforms/mae_transform.py +++ b/torchmultimodal/transforms/mae_transform.py @@ -100,7 +100,6 @@ def __init__( mean: Tuple[float, float, float] = (0.485, 0.456, 0.406), std: Tuple[float, float, float] = (0.229, 0.224, 0.225), ) -> None: - img_transforms: List[Callable] = [ transforms.RandomResizedCrop( input_size, scale=scale, interpolation=interpolation
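
For readers unfamiliar with what this conversion actually changes, the hunks above fall into a few recurring patterns: a trailing comma is added after the lone parameter of a signature that stays split across lines (the `collate_fn` and `_get_src_permutation_idx` hunks), the blank line that used to open many function bodies is dropped, doubled parentheses are collapsed to a single set (the `loss.py` hunks), and parenthesized one-line defaults are folded back onto a single line (the `mugen` dataclass and `definitions.py` hunks). The sketch below restates those patterns on hypothetical, torch-free code so it runs anywhere; none of these names come from the repository, they only mirror the before/after shapes shown in the diff.

```python
from dataclasses import dataclass
from typing import Any, List, Tuple


# Signature pattern (as in the collate_fn hunks): the sole parameter keeps its
# own line and gains a trailing comma; the body starts directly under the
# header, with no leading blank line (as in the build_flickr/forward hunks).
def collate_pairs(
    batch: List[Tuple[Any, int]],
) -> Tuple[List[Any], List[int]]:
    items = []
    labels = []
    for item, label in batch:
        items.append(item)
        labels.append(label)
    return items, labels


# Default-value pattern (as in the MUGENDatasetArgs/EvaluationArgs hunks):
# a default that was wrapped in parentheses across several lines is folded
# back onto a single line when it fits.
@dataclass
class ExampleArgs:
    use_manual_annotation: bool = False  # previously a parenthesized default


# Expression pattern (as in the contrastive_alignment_loss hunks): doubled
# parentheses before a method call collapse to a single set, i.e.
# ((a / b + c)).is_integer() becomes (a / b + c).is_integer().
def combined_is_integer(a: float, b: float, c: float) -> bool:
    return (a / b + c).is_integer()


if __name__ == "__main__":
    print(collate_pairs([("x", 0), ("y", 1)]))
    print(ExampleArgs())
    print(combined_is_integer(1.0, 2.0, 3.0))
```

Running `ruff format` over a source tree (or the `arc pyfmt` wrapper mentioned in the summary) is what produces diffs of this shape.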