diff --git a/examples/albef/data/retrieval_datamodule.py b/examples/albef/data/retrieval_datamodule.py
index ded29c53d..1db090a99 100644
--- a/examples/albef/data/retrieval_datamodule.py
+++ b/examples/albef/data/retrieval_datamodule.py
@@ -161,7 +161,7 @@ def text_dataloader(
 
 
 def retrieval_train_collate_fn(
-    batch: List[Tuple[Tensor, Tensor, int]]
+    batch: List[Tuple[Tensor, Tensor, int]],
 ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
     image_list = []
     text_list = []
diff --git a/examples/albef/data/vqa_datamodules.py b/examples/albef/data/vqa_datamodules.py
index 27164ce95..f8461ab5d 100644
--- a/examples/albef/data/vqa_datamodules.py
+++ b/examples/albef/data/vqa_datamodules.py
@@ -157,7 +157,7 @@ def test_dataloader(
 
 
 def vqa_train_collate_fn(
-    batch: List[Tuple[Tensor, Tensor, List[Tensor], List[float]]]
+    batch: List[Tuple[Tensor, Tensor, List[Tensor], List[float]]],
 ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, List[int]]:
     image_list = []
     question_list = []
@@ -188,7 +188,7 @@ def vqa_train_collate_fn(
 
 
 def vqa_test_collate_fn(
-    batch: List[Tuple[Tensor, Tensor, int]]
+    batch: List[Tuple[Tensor, Tensor, int]],
 ) -> Tuple[Tensor, Tensor, Tensor, List[int]]:
     image_list, question_list, question_ids = [], [], []
     for image, question, question_id in batch:
diff --git a/examples/mdetr/data/dataset.py b/examples/mdetr/data/dataset.py
index 7fee64e9c..597b4d85f 100644
--- a/examples/mdetr/data/dataset.py
+++ b/examples/mdetr/data/dataset.py
@@ -240,7 +240,6 @@ def collate_fn(tokenizer, batch):
 
 
 def build_flickr(image_set, tokenizer, transform, args):
-
     img_dir = Path(args.flickr_img_path) / f"{image_set}"
 
     if args.GT_type == "merged":
diff --git a/examples/mdetr/data/flickr_eval.py b/examples/mdetr/data/flickr_eval.py
index 59d3d35e5..73d5d091d 100644
--- a/examples/mdetr/data/flickr_eval.py
+++ b/examples/mdetr/data/flickr_eval.py
@@ -4,7 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" Evaluator for Flickr30k """
+"""Evaluator for Flickr30k"""
+
 import xml.etree.ElementTree as Et
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence, Union
@@ -184,7 +185,6 @@ def __init__(
         iou_thresh: float = 0.5,
         merge_boxes: bool = False,
     ):
-
         assert subset in ["train", "test", "val"], f"Wrong flickr subset {subset}"
 
         self.topk = topk
diff --git a/examples/mdetr/data/postprocessors.py b/examples/mdetr/data/postprocessors.py
index 477c93dcc..c55786184 100644
--- a/examples/mdetr/data/postprocessors.py
+++ b/examples/mdetr/data/postprocessors.py
@@ -49,7 +49,6 @@ def __call__(
         positive_map: Tensor,
         phrases_per_sample: List[int],
     ) -> List[List[List[float]]]:
-
         assert output_logits.size(0) == target_sizes.size(
             0
         ), "Logits and target sizes should both have first dim = batch_size"
diff --git a/examples/mdetr/loss.py b/examples/mdetr/loss.py
index 63834b6b3..19212c2e8 100644
--- a/examples/mdetr/loss.py
+++ b/examples/mdetr/loss.py
@@ -67,7 +67,7 @@ def contrastive_alignment_loss(
     neg_term = negative_logits.logsumexp(2)
     nb_pos = positive_map.sum(2) + 1e-6
     box_to_token_loss = (
-        ((pos_term / nb_pos + neg_term)).masked_fill(~boxes_with_pos, 0).sum()
+        (pos_term / nb_pos + neg_term).masked_fill(~boxes_with_pos, 0).sum()
     )
 
     # Calculate the contrastive loss for all tokens
@@ -76,7 +76,7 @@ def contrastive_alignment_loss(
     neg_term = negative_logits.logsumexp(1)
     nb_pos = positive_map.sum(1) + 1e-6
     tokens_to_boxes_loss = (
-        ((pos_term / nb_pos + neg_term)).masked_fill(~tokens_with_pos, 0).sum()
+        (pos_term / nb_pos + neg_term).masked_fill(~tokens_with_pos, 0).sum()
     )
     tot_loss = (box_to_token_loss + tokens_to_boxes_loss) / 2
 
@@ -227,7 +227,6 @@ def forward(
         vqa_masks: Optional[Dict[str, Tensor]] = None,
         weight_dict: Optional[Dict[str, float]] = None,
     ) -> Dict[str, Tensor]:
-
         target_boxes = [t["boxes"] for t in targets]
         target_tokens = [t["tokens_positive"] for t in targets]
         n_target_boxes = [len(t) for t in target_boxes]
@@ -292,7 +291,6 @@ def build_mdetr_loss(
     no_object_weight: float = 0.1,
     temperature: Optional[float] = None,
 ) -> MDETRLoss:
-
     soft_token_loss = partial(
         soft_token_prediction_loss, no_object_weight=no_object_weight
     )
diff --git a/examples/mdetr/matcher.py b/examples/mdetr/matcher.py
index 3837a8548..b843d3244 100644
--- a/examples/mdetr/matcher.py
+++ b/examples/mdetr/matcher.py
@@ -73,7 +73,6 @@ def forward(
         target_boxes_per_sample: List[Tensor],
         positive_map: Tensor,
     ) -> List[Tuple[Tensor, Tensor]]:
-
         bs, num_queries = pred_logits.shape[:2]
         target_boxes = torch.cat(target_boxes_per_sample)
         # We flatten to compute the cost matrices in a batch
diff --git a/examples/mdetr/optimizer.py b/examples/mdetr/optimizer.py
index 056974ea0..2604961d0 100644
--- a/examples/mdetr/optimizer.py
+++ b/examples/mdetr/optimizer.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 """Collections of utilities related to optimization."""
+
 from bisect import bisect_right
 
 import torch
diff --git a/examples/mdetr/phrase_grounding.py b/examples/mdetr/phrase_grounding.py
index 0b8ac0d7f..75567f4c8 100644
--- a/examples/mdetr/phrase_grounding.py
+++ b/examples/mdetr/phrase_grounding.py
@@ -31,7 +31,6 @@ def evaluate(
     evaluator,
     device,
 ):
-
     model.eval()
     metric_logger = MetricLogger(delimiter="  ")
 
diff --git a/examples/mdetr/utils/args_parse.py b/examples/mdetr/utils/args_parse.py
index 6af64c070..bbe80ae28 100644
--- a/examples/mdetr/utils/args_parse.py
+++ b/examples/mdetr/utils/args_parse.py
@@ -8,7 +8,6 @@
 
 
 def get_args_parser():
-
     parser = argparse.ArgumentParser("MDETR", add_help=False)
     parser.add_argument("--dataset_config", default=None, required=True)
     # Transformer
diff --git a/examples/mdetr/utils/metrics.py b/examples/mdetr/utils/metrics.py
index 4fb3b7e21..77263045c 100644
--- a/examples/mdetr/utils/metrics.py
+++ b/examples/mdetr/utils/metrics.py
@@ -7,6 +7,7 @@
 """
 Various utilities related to track and report metrics
 """
+
 import datetime
 import time
 from collections import defaultdict, deque
diff --git a/examples/mdetr/vqa_eval.py b/examples/mdetr/vqa_eval.py
index 0bc35d55c..7424e7790 100644
--- a/examples/mdetr/vqa_eval.py
+++ b/examples/mdetr/vqa_eval.py
@@ -32,7 +32,6 @@ def evaluate(
     device,
     weight_dict,
 ):
-
     model.eval()
     metric_logger = MetricLogger(delimiter="  ")
     header = "Test:"
diff --git a/examples/mugen/data/mugen_dataset.py b/examples/mugen/data/mugen_dataset.py
index 932ab295c..7b138e22d 100644
--- a/examples/mugen/data/mugen_dataset.py
+++ b/examples/mugen/data/mugen_dataset.py
@@ -38,9 +38,7 @@ class MUGENDatasetArgs:
         True  # render smap for mugen (and shield) as bounding boxes
     )
     bbox_smap_for_monsters: bool = True  # render smap for monsters as bounding boxes
-    use_manual_annotation: bool = (
-        False  # if True will only use videos with manual annotation and skip those without
-    )
+    use_manual_annotation: bool = False  # if True will only use videos with manual annotation and skip those without
     use_auto_annotation: bool = (
         True  # if True will only use videos with auto annotation and skip those without
     )
diff --git a/examples/mugen/retrieval/definitions.py b/examples/mugen/retrieval/definitions.py
index 1232e82fc..569e48f0d 100644
--- a/examples/mugen/retrieval/definitions.py
+++ b/examples/mugen/retrieval/definitions.py
@@ -81,9 +81,7 @@ class EvaluationArgs:
     datamodule_args: DataModuleArgs = DataModuleArgs()
     lightningmodule_args: LightningModuleArgs = LightningModuleArgs()
     videoclip_args: VideoCLIPArgs = VideoCLIPArgs()
-    checkpoint_path: str = (
-        "https://pytorch.s3.amazonaws.com/models/multimodal/mugen/videoclip_lightning_mugen.pt"
-    )
+    checkpoint_path: str = "https://pytorch.s3.amazonaws.com/models/multimodal/mugen/videoclip_lightning_mugen.pt"
     accelerator: str = "auto"
 
 
diff --git a/examples/omnivore/data/data_builder.py b/examples/omnivore/data/data_builder.py
index fedfc2252..afbe2d8aa 100644
--- a/examples/omnivore/data/data_builder.py
+++ b/examples/omnivore/data/data_builder.py
@@ -76,9 +76,7 @@ def construct_data_loader(dataset, sampler, num_workers, mode, args, drop_last=F
         mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms)
         # Since not all dataset return tuple of same length, we take the
         # first two elements for mixupcutmix during training
-        collate_fn = lambda batch: mixupcutmix(
-            *(default_collate(batch)[:2])
-        )  # noqa: E731
+        collate_fn = lambda batch: mixupcutmix(*(default_collate(batch)[:2]))  # noqa: E731
 
     data_loader = torch.utils.data.DataLoader(
         dataset,
diff --git a/examples/omnivore/data/presets.py b/examples/omnivore/data/presets.py
index ccf7b7e54..4bf9fb982 100644
--- a/examples/omnivore/data/presets.py
+++ b/examples/omnivore/data/presets.py
@@ -58,7 +58,6 @@ def __init__(
         std=(0.229, 0.224, 0.225),
         interpolation=InterpolationMode.BICUBIC,
     ):
-
         self.transforms = T.Compose(
             [
                 T.Resize(resize_size, interpolation=interpolation),
@@ -187,7 +186,6 @@ def __init__(
         std=(0.229, 0.224, 0.225, 0.0295),
         interpolation=InterpolationMode.BILINEAR,
     ):
-
         self.transforms = T.Compose(
             [
                 CT.DepthNorm(max_depth=max_depth, clamp_max_before_scale=True),
diff --git a/examples/omnivore/train.py b/examples/omnivore/train.py
index cbd7f56dc..785201405 100644
--- a/examples/omnivore/train.py
+++ b/examples/omnivore/train.py
@@ -38,10 +38,10 @@ def _chunk_forward_backward(
     args,
     scaler=None,
 ):
-
-    chunk_image, chunk_target = image[chunk_start:chunk_end, ...].to(device), target[
-        chunk_start:chunk_end, ...
-    ].to(device)
+    chunk_image, chunk_target = (
+        image[chunk_start:chunk_end, ...].to(device),
+        target[chunk_start:chunk_end, ...].to(device),
+    )
     with torch.cuda.amp.autocast(enabled=scaler is not None):
         chunk_output = model(chunk_image, input_type)
 
diff --git a/tests/models/test_late_fusion.py b/tests/models/test_late_fusion.py
index 516da1207..e7316b722 100644
--- a/tests/models/test_late_fusion.py
+++ b/tests/models/test_late_fusion.py
@@ -81,7 +81,6 @@ def modalities_3(self):
         }
 
     def test_forward(self, late_fusion, modalities_1):
-
         actual = late_fusion(modalities_1)
         expected = torch.Tensor(
             [[1, 0, 0.25, 0.75, 3, 1, 0.8, 0.9], [0, 1, 0.6, 0.4, 0.7, 2, 0.6, 0]]
@@ -90,7 +89,6 @@ def test_forward(self, late_fusion, modalities_1):
         assert_expected(actual, expected)
 
     def test_script(self, late_fusion, modalities_2):
-
         scripted_late_fusion = torch.jit.script(late_fusion)
         actual = scripted_late_fusion(modalities_2)
         expected = torch.Tensor([[7, 0, 0.65, 8, 9, 0.8], [88, 5, 0.3, 0.74, 2, 0]])
diff --git a/tests/models/test_video_gpt.py b/tests/models/test_video_gpt.py
index 091655a0c..245ae9ee2 100644
--- a/tests/models/test_video_gpt.py
+++ b/tests/models/test_video_gpt.py
@@ -47,7 +47,6 @@ def set_seed():
 
 
 class TestVideoGPT:
-
     _model_name = "video_gpt"
 
     @pytest.fixture
diff --git a/torchmultimodal/models/clip/image_encoder.py b/torchmultimodal/models/clip/image_encoder.py
index deb560db4..bfc4af954 100644
--- a/torchmultimodal/models/clip/image_encoder.py
+++ b/torchmultimodal/models/clip/image_encoder.py
@@ -80,7 +80,6 @@ def __init__(
         self.projection = nn.Parameter(scale * torch.randn(width, embedding_dim))
 
     def forward(self, x: Tensor) -> Tensor:
-
         if x.size(2) != self.image_size or x.size(3) != self.image_size:
             raise ValueError(
                 f"Expected input with width and height as {self.image_size}, found {x.size(2)} by {x.size(3)} "
diff --git a/torchmultimodal/models/clip/model.py b/torchmultimodal/models/clip/model.py
index f2718755f..64922fa0b 100644
--- a/torchmultimodal/models/clip/model.py
+++ b/torchmultimodal/models/clip/model.py
@@ -68,7 +68,6 @@ def forward(
         features_a: torch.Tensor,
         features_b: torch.Tensor,
     ) -> CLIPOutput:
-
         embeddings_a = self.encoder_a(features_a)
         embeddings_b = self.encoder_b(features_b)
         embeddings_a = F.normalize(embeddings_a)
diff --git a/torchmultimodal/models/coca/coca_model.py b/torchmultimodal/models/coca/coca_model.py
index f1e42ae9a..0566f3702 100644
--- a/torchmultimodal/models/coca/coca_model.py
+++ b/torchmultimodal/models/coca/coca_model.py
@@ -493,7 +493,6 @@ def __init__(
     def forward(
         self, images: Tensor, texts: Tensor, text_padding_mask: Optional[Tensor] = None
     ) -> Dict[str, Tensor]:
-
         model_out = self.model(images, texts, text_padding_mask)
         mm_out = model_out.multimodal_embeddings
 
diff --git a/torchmultimodal/models/flava/image_encoder.py b/torchmultimodal/models/flava/image_encoder.py
index b2c9c6190..2e9371655 100644
--- a/torchmultimodal/models/flava/image_encoder.py
+++ b/torchmultimodal/models/flava/image_encoder.py
@@ -247,7 +247,6 @@ def flava_image_encoder(
     patch_size: int = 16,
     num_channels: int = 3,
 ) -> ImageTransformer:
-
     embeddings = ImageEmbeddings(
         image_size=image_size,
         patch_size=patch_size,
diff --git a/torchmultimodal/models/flava/model.py b/torchmultimodal/models/flava/model.py
index 46bc28bd0..50e840947 100644
--- a/torchmultimodal/models/flava/model.py
+++ b/torchmultimodal/models/flava/model.py
@@ -555,7 +555,6 @@ def flava_model_for_classification(
     pretrained: bool = True,
     **flava_model_kwargs: Any,
 ) -> FLAVAForClassification:
-
     classifier = MLP(
         in_dim=classifier_in_dim,
         out_dim=num_classes,
diff --git a/torchmultimodal/models/flava/text_encoder.py b/torchmultimodal/models/flava/text_encoder.py
index b5e0d2727..17f7cb1f9 100644
--- a/torchmultimodal/models/flava/text_encoder.py
+++ b/torchmultimodal/models/flava/text_encoder.py
@@ -35,7 +35,6 @@ def flava_text_encoder(
     # TextEncoder params
     initializer_range: float = 0.02,
 ) -> BERTTextEncoder:
-
     embeddings = BERTTextEmbeddings(
         hidden_size=hidden_size,
         vocab_size=vocab_size,
diff --git a/torchmultimodal/models/flava/transformer.py b/torchmultimodal/models/flava/transformer.py
index 9092913f4..363db750c 100644
--- a/torchmultimodal/models/flava/transformer.py
+++ b/torchmultimodal/models/flava/transformer.py
@@ -261,7 +261,6 @@ def forward(
         return_attn_weights: bool = False,
         return_hidden_states: bool = False,
     ) -> TransformerOutput:
-
         all_hidden_states = [] if return_hidden_states else None
         all_self_attentions = [] if return_attn_weights else None
 
diff --git a/torchmultimodal/models/mdetr/model.py b/torchmultimodal/models/mdetr/model.py
index d4d2b17df..4eaf32ddc 100644
--- a/torchmultimodal/models/mdetr/model.py
+++ b/torchmultimodal/models/mdetr/model.py
@@ -123,7 +123,6 @@ def _pad_text(
         return padded_text, mask
 
     def forward(self, images: List[Tensor], text: List[Tensor]) -> MDETRModelOutput:
-
         images, image_mask = self._pad_images(images)
         text, text_attention_mask = self._pad_text(text)
         encoded_text = self.text_encoder(text, text_attention_mask)
@@ -392,7 +391,6 @@ def forward(
         images: List[Tensor],
         text: List[Tensor],
     ) -> MDETRPhraseGroundingOutput:
-
         model_output = self.model(images, text)
 
         final_hidden_state = model_output.transformer_output.decoder_hidden_states[-1]
diff --git a/torchmultimodal/models/mdetr/transformer.py b/torchmultimodal/models/mdetr/transformer.py
index e5551261a..e166244bb 100644
--- a/torchmultimodal/models/mdetr/transformer.py
+++ b/torchmultimodal/models/mdetr/transformer.py
@@ -166,7 +166,6 @@ def forward(
         src_key_padding_mask: Optional[Tensor] = None,
         pos: Optional[Tensor] = None,
     ) -> Tensor:
-
         output = src
 
         for layer in self.layers:
diff --git a/torchmultimodal/modules/encoders/vision_transformer.py b/torchmultimodal/modules/encoders/vision_transformer.py
index 4e170ee6e..eb58da917 100644
--- a/torchmultimodal/modules/encoders/vision_transformer.py
+++ b/torchmultimodal/modules/encoders/vision_transformer.py
@@ -60,7 +60,6 @@ def forward(
         image_patches_mask: Optional[Tensor] = None,
         attention_mask: Optional[Tensor] = None,
     ) -> TransformerOutput:
-
         embedding_output = self.embeddings(
             images, image_patches_mask=image_patches_mask
         ).embeddings
diff --git a/torchmultimodal/modules/fusions/deepset_fusion.py b/torchmultimodal/modules/fusions/deepset_fusion.py
index ae8eda077..a1f3415e9 100644
--- a/torchmultimodal/modules/fusions/deepset_fusion.py
+++ b/torchmultimodal/modules/fusions/deepset_fusion.py
@@ -84,7 +84,6 @@ def __init__(
         self.mlp = mlp
 
     def forward(self, embeddings: Dict[str, Tensor]) -> Tensor:
-
         projections = {}
         for channel, projection in self.projections.items():
             projections[channel] = projection(embeddings[channel])
diff --git a/torchmultimodal/modules/layers/transformer.py b/torchmultimodal/modules/layers/transformer.py
index 530a3b0e0..ec4e5d762 100644
--- a/torchmultimodal/modules/layers/transformer.py
+++ b/torchmultimodal/modules/layers/transformer.py
@@ -366,9 +366,7 @@ def _cross_attention_block(
         encoder_hidden_states: Tensor,
         cross_attention_mask: Optional[Tensor] = None,
     ) -> Tensor:
-        assert (
-            self.cross_attention is not None
-        ), """
+        assert self.cross_attention is not None, """
         Cannot use cross-attention unless self.cross_attention and
         self.cross_attention_dropout are defined.
         """
@@ -399,7 +397,6 @@ def _forward_prenorm(
         past_key_value: Optional[Tuple[Tensor, Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[Tensor, Optional[Tuple[Tensor, Tensor]]]:
-
         # Self-attention
         self_attn_input = self.attention_layernorm(hidden_states)
         attn_output, present_key_value = self._self_attention_block(
@@ -440,7 +437,6 @@ def _forward_postnorm(
         past_key_value: Optional[Tuple[Tensor, Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[Tensor, Optional[Tuple[Tensor, Tensor]]]:
-
         # Self-attention
         attn_output, present_key_value = self._self_attention_block(
             hidden_states,
diff --git a/torchmultimodal/modules/losses/blip2_losses.py b/torchmultimodal/modules/losses/blip2_losses.py
index 6e66d85d3..108119d7f 100644
--- a/torchmultimodal/modules/losses/blip2_losses.py
+++ b/torchmultimodal/modules/losses/blip2_losses.py
@@ -21,6 +21,7 @@
 @dataclass
 class Blip2Stage1Losses(OrderedDict):
     "Blip-2 stage 1 losses"
+
     image_text_contrastive_loss: torch.Tensor
     image_text_matching_loss: torch.Tensor
     image_captioning_loss: torch.Tensor
diff --git a/torchmultimodal/modules/losses/contrastive_loss_with_temperature.py b/torchmultimodal/modules/losses/contrastive_loss_with_temperature.py
index 155445a0d..88b0f7ee7 100644
--- a/torchmultimodal/modules/losses/contrastive_loss_with_temperature.py
+++ b/torchmultimodal/modules/losses/contrastive_loss_with_temperature.py
@@ -190,7 +190,6 @@ def forward(
         cross_entropy_kwargs: Optional[Dict[str, Any]] = None,
         mask: Optional[Tensor] = None,
     ) -> Tensor:
-
         self.logit_scale.data.clamp_(self.logit_scale_min, self.logit_scale_max)
         return contrastive_loss_with_temperature(
             embeddings_a=embeddings_a,
diff --git a/torchmultimodal/modules/losses/flava.py b/torchmultimodal/modules/losses/flava.py
index b3c5a5367..8580e9aca 100644
--- a/torchmultimodal/modules/losses/flava.py
+++ b/torchmultimodal/modules/losses/flava.py
@@ -264,7 +264,6 @@ def forward(
         text_sequence: Tensor,
         mask: Tensor,
     ) -> FLAVAGlobalContrastiveLossOutput:
-
         text_embedding = nn.functional.normalize(text_sequence, dim=-1)
         image_embedding = nn.functional.normalize(
             image_sequence,
diff --git a/torchmultimodal/modules/losses/mdetr.py b/torchmultimodal/modules/losses/mdetr.py
index f07fd1a1f..8b00a7458 100644
--- a/torchmultimodal/modules/losses/mdetr.py
+++ b/torchmultimodal/modules/losses/mdetr.py
@@ -13,7 +13,7 @@
 
 
 def _get_src_permutation_idx(
-    indices: List[Tuple[Tensor, Tensor]]
+    indices: List[Tuple[Tensor, Tensor]],
 ) -> Tuple[Tensor, Tensor]:
     """
     Given a list of matched (src, tgt) indices, concatenate the src indices and
diff --git a/torchmultimodal/transforms/clip_transform.py b/torchmultimodal/transforms/clip_transform.py
index 0f1c49764..1b0f41724 100644
--- a/torchmultimodal/transforms/clip_transform.py
+++ b/torchmultimodal/transforms/clip_transform.py
@@ -267,7 +267,6 @@ def __init__(
         text_bpe_merges_path: str = CLIP_DEFAULT_VOCAB_BPE_PATH,
         num_merges: Optional[int] = 48894,
     ) -> None:
-
         super().__init__()
         local_merges_path = _PATH_MANAGER.get_local_path(text_bpe_merges_path)
         tokenizer = CLIPBPETransform(
@@ -399,7 +398,6 @@ def __init__(
         text_bpe_merges_path: str = CLIP_DEFAULT_VOCAB_BPE_PATH,
         num_merges: Optional[int] = 48894,
     ) -> None:
-
         super().__init__()
         self.image_transform = CLIPImageTransform(
             image_size, image_interpolation, image_mean, image_std, is_train
diff --git a/torchmultimodal/transforms/flava_transform.py b/torchmultimodal/transforms/flava_transform.py
index 584121f87..b05bedf64 100644
--- a/torchmultimodal/transforms/flava_transform.py
+++ b/torchmultimodal/transforms/flava_transform.py
@@ -116,7 +116,6 @@ def __init__(
         second_interpolation: transforms.InterpolationMode = transforms.InterpolationMode.LANCZOS,
         **kwargs: Any,
     ) -> None:
-
         if not isinstance(size, (list, tuple)):
             size = (size, size)
 
diff --git a/torchmultimodal/transforms/mae_transform.py b/torchmultimodal/transforms/mae_transform.py
index b1c83a4f8..2f75624b6 100644
--- a/torchmultimodal/transforms/mae_transform.py
+++ b/torchmultimodal/transforms/mae_transform.py
@@ -100,7 +100,6 @@ def __init__(
         mean: Tuple[float, float, float] = (0.485, 0.456, 0.406),
         std: Tuple[float, float, float] = (0.229, 0.224, 0.225),
     ) -> None:
-
         img_transforms: List[Callable] = [
             transforms.RandomResizedCrop(
                 input_size, scale=scale, interpolation=interpolation