From c7d3fc3490841bee84609121e8f58f7946da763e Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 16 Aug 2023 16:49:13 -0400 Subject: [PATCH 001/189] Initial commit of PatchTST model classes Co-authored-by: Phanwadee Sinthong Co-authored-by: Nam Nguyen Co-authored-by: Vijay Ekambaram Co-authored-by: Ngoc Diep Do <55230119+diepi@users.noreply.github.com> Co-authored-by: Wesley Gifford <79663411+wgifford@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/patchtst.md | 50 ++ src/transformers/__init__.py | 14 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 11 + src/transformers/models/auto/modeling_auto.py | 4 + src/transformers/models/patchtst/__init__.py | 60 ++ .../models/patchtst/configuration_patchtst.py | 241 +++++++ .../models/patchtst/modeling_patchtst.py | 596 ++++++++++++++++++ tests/models/patchtst/__init__.py | 0 .../models/patchtst/test_modeling_patchtst.py | 512 +++++++++++++++ 11 files changed, 1491 insertions(+) create mode 100644 docs/source/en/model_doc/patchtst.md create mode 100644 src/transformers/models/patchtst/__init__.py create mode 100644 src/transformers/models/patchtst/configuration_patchtst.py create mode 100755 src/transformers/models/patchtst/modeling_patchtst.py create mode 100644 tests/models/patchtst/__init__.py create mode 100644 tests/models/patchtst/test_modeling_patchtst.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index fd55a47cd80543..823f49e6c92609 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -712,6 +712,8 @@ title: Autoformer - local: model_doc/informer title: Informer + - local: model_doc/patchtst + title: PatchTST - local: model_doc/time_series_transformer title: Time Series Transformer title: Time series models diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md new file mode 100644 index 00000000000000..14523d65c70f3d --- /dev/null +++ b/docs/source/en/model_doc/patchtst.md @@ -0,0 +1,50 @@ + + +# PatchTST + +## Overview + +The PatchTST model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
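+
+A minimal, illustrative sketch of the intended usage, based on the classes introduced in this commit. At this stage `PatchTSTModel.forward` consumes an already-patched tensor of shape `[batch_size, num_channels, num_patches, patch_length]` and returns the encoder hidden states; the shapes below assume the default configuration, and the snippet should be read as a sketch of the eventual API rather than a guaranteed-working example.
+
+```python
+import torch
+
+from transformers import PatchTSTConfig, PatchTSTModel
+
+# Default configuration: context_length=32, patch_length=8, stride=8, d_model=128
+configuration = PatchTSTConfig()
+model = PatchTSTModel(configuration)
+
+# Number of patches implied by the configuration
+num_patches = (
+    max(configuration.context_length, configuration.patch_length) - configuration.patch_length
+) // configuration.stride + 1
+
+# Dummy pre-patched input: [batch_size, num_channels, num_patches, patch_length]
+patched_inputs = torch.randn(2, 1, num_patches, configuration.patch_length)
+outputs = model(patched_inputs)
+
+# Encoder output: [batch_size, num_channels, num_patches, d_model]
+last_hidden_state = outputs.last_hidden_state
+```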
+ + +## PatchTSTConfig + +[[autodoc]] PatchTSTConfig + + +## PatchTSTModel + +[[autodoc]] PatchTSTModel + - forward + + +## PatchTSTForPrediction + +[[autodoc]] PatchTSTForPrediction + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b978757d1fe12f..051d4ef647f59c 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -372,6 +372,7 @@ ], "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"], "models.informer": ["INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "InformerConfig"], + "models.patchtst": ["PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP", "PatchTSTConfig"], "models.instructblip": [ "INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "InstructBlipConfig", @@ -1989,6 +1990,13 @@ "InformerPreTrainedModel", ] ) + _import_structure["models.patchtst"].extend( + [ + "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", + "PatchTSTModel", + "PatchTSTPreTrainedModel", + ] + ) _import_structure["models.instructblip"].extend( [ "INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4462,6 +4470,7 @@ ) from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig from .models.informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig + from .models.patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, InstructBlipConfig, @@ -5847,6 +5856,11 @@ InformerModel, InformerPreTrainedModel, ) + from .models.patchtst import ( + PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, + PatchTSTModel, + PatchTSTPreTrainedModel, + ) from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST, InstructBlipForConditionalGeneration, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 3241a412572deb..3b958ac5c1df40 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -105,6 +105,7 @@ idefics, imagegpt, informer, + patchtst, instructblip, jukebox, layoutlm, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 0a3effd7955e1b..ac524d6882ad82 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -113,6 +113,8 @@ ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), + ("patchtst", "PatchTSTConfig"), + ("patchtst", "PatchTSTConfig"), ("instructblip", "InstructBlipConfig"), ("jukebox", "JukeboxConfig"), ("layoutlm", "LayoutLMConfig"), @@ -198,6 +200,8 @@ ("table-transformer", "TableTransformerConfig"), ("tapas", "TapasConfig"), ("time_series_transformer", "TimeSeriesTransformerConfig"), + ("patchtst", "PatchTSTConfig"), + ("patchtst", "PatchTSTConfig"), ("timesformer", "TimesformerConfig"), ("timm_backbone", "TimmBackboneConfig"), ("trajectory_transformer", "TrajectoryTransformerConfig"), @@ -319,6 +323,8 @@ ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("instructblip", "INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("jukebox", "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -397,6 +403,8 @@ ("table-transformer", "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), 
("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("tvlt", "TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -527,6 +535,7 @@ ("idefics", "IDEFICS"), ("imagegpt", "ImageGPT"), ("informer", "Informer"), + ("patchtst", "PatchTST"), ("instructblip", "InstructBLIP"), ("jukebox", "Jukebox"), ("layoutlm", "LayoutLM"), @@ -623,6 +632,8 @@ ("tapas", "TAPAS"), ("tapex", "TAPEX"), ("time_series_transformer", "Time Series Transformer"), + ("patchtst", "patchtst"), + ("patchtst", "PatchTST"), ("timesformer", "TimeSformer"), ("timm_backbone", "TimmBackbone"), ("trajectory_transformer", "Trajectory Transformer"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2ab504e2f23fdc..75cf77cc73a809 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -110,6 +110,8 @@ ("idefics", "IdeficsModel"), ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), + ("patchtst", "PatchTSTModel"), + ("patchtst", "PatchTSTModel"), ("jukebox", "JukeboxModel"), ("layoutlm", "LayoutLMModel"), ("layoutlmv2", "LayoutLMv2Model"), @@ -187,6 +189,8 @@ ("table-transformer", "TableTransformerModel"), ("tapas", "TapasModel"), ("time_series_transformer", "TimeSeriesTransformerModel"), + ("patchtst", "PatchTSTModel"), + ("patchtst", "PatchTSTModel"), ("timesformer", "TimesformerModel"), ("timm_backbone", "TimmBackbone"), ("trajectory_transformer", "TrajectoryTransformerModel"), diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py new file mode 100644 index 00000000000000..e633177a381952 --- /dev/null +++ b/src/transformers/models/patchtst/__init__.py @@ -0,0 +1,60 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_patchtst": [ + "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP", + "PatchTSTConfig", + ], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_patchtst"] = [ + "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", + "PatchTSTForPrediction", + "PatchTSTModel", + "PatchTSTPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_patchtst import ( + PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, + PatchTSTForPrediction, + PatchTSTModel, + PatchTSTPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py new file mode 100644 index 00000000000000..29759fb4bfcc6a --- /dev/null +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -0,0 +1,241 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PatchTST model configuration""" + +from typing import List, Optional, Union + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "ibm/patchtst-base": "https://huggingface.co/ibm/patchtst-base/resolve/main/config.json", +} + + +class PatchTSTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`PatchTSTModel`]. It is used to instantiate an + PatchTST model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + prediction_length (`int`): + The prediction length for the decoder. In other words, the prediction horizon of the model. This value is + typically dictated by the dataset and we recommend to set it appropriately. + context_length (`int`, *optional*, defaults to `prediction_length`): + The context length for the encoder. If `None`, the context length will be the same as the + `prediction_length`. + input_size (`int`, *optional*, defaults to 1): + The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of + multivariate targets. 
+ scaling (`string` or `bool`, *optional* defaults to `"mean"`): + Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the + scaler is set to "mean". + num_time_features (`int`, *optional*, defaults to 0): + The number of time features in the input time series. + num_dynamic_real_features (`int`, *optional*, defaults to 0): + The number of dynamic real valued features. + num_static_categorical_features (`int`, *optional*, defaults to 0): + The number of static categorical features. + num_static_real_features (`int`, *optional*, defaults to 0): + The number of static real valued features. + cardinality (`list[int]`, *optional*): + The cardinality (number of different values) for each of the static categorical features. Should be a list + of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if + `num_static_categorical_features` is > 0. + embedding_dimension (`list[int]`, *optional*): + The dimension of the embedding for each of the static categorical features. Should be a list of integers, + having the same length as `num_static_categorical_features`. Cannot be `None` if + `num_static_categorical_features` is > 0. + d_model (`int`, *optional*, defaults to 64): + Dimensionality of the transformer layers. + encoder_layers (`int`, *optional*, defaults to 2): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 2): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 32): + Dimension of the "intermediate" (often named feed-forward) layer in encoder. + decoder_ffn_dim (`int`, *optional*, defaults to 32): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and + `"relu"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the encoder, and decoder. + encoder_layerdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention and fully connected layers for each encoder layer. + decoder_layerdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention and fully connected layers for each decoder layer. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability used between the two layers of the feed-forward networks. + num_parallel_samples (`int`, *optional*, defaults to 100): + The number of samples to generate in parallel for each time step of inference. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal weight initialization distribution. + use_cache (`bool`, *optional*, defaults to `True`): + Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. + attention_type (`str`, *optional*, defaults to "prob"): + Attention used in encoder. 
This can be set to "prob" (PatchTST's ProbAttention) or "full" (vanilla + transformer's canonical self-attention). + sampling_factor (`int`, *optional*, defaults to 5): + ProbSparse sampling factor (only makes affect when `attention_type`="prob"). It is used to control the + reduced query matrix (Q_reduce) input length. + distil (`bool`, *optional*, defaults to `True`): + Whether to use distilling in encoder. + + Example: + + ```python + >>> from transformers import PatchTSTConfig, PatchTSTModel + + >>> # Initializing an PatchTST configuration with 12 time steps for prediction + >>> configuration = PatchTSTConfig(prediction_length=12) + + >>> # Randomly initializing a model (with random weights) from the configuration + >>> model = PatchTSTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "patchtst" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "num_hidden_layers": "encoder_layers", + } + + def __init__( + self, + input_size: int = 1, + context_length: int = 32, + patch_length: int = 8, + stride: int = 8, + encoder_layers: int = 3, + d_model: int = 128, + encoder_attention_heads: int = 16, + shared_embedding: bool = True, + channel_attention: bool = False, + encoder_ffn_dim: int = 256, + norm: str = "BatchNorm", + attention_dropout: float = 0.0, + dropout: float = 0.0, + positional_dropout: float = 0.0, + dropout_path: float = 0.0, + ff_dropout: float = 0.0, + bias: bool = True, + activation_function: str = "gelu", + pre_norm: bool = False, + store_attn: bool = False, + positional_encoding: str = "sincos", + learn_pe: bool = False, + use_cls_token: bool = False, + patch_last: bool = True, + individual: bool = False, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = True, + d_size: str = "4D", + cv_channel_indices: list = None, + mask_value=0, + pooling: str = 'mean', + num_classes: int = 1, + head_dropout: float = 0.0, + proj_dropout: float = 0.0, + qkv_bias: bool = True, + num_dynamic_real_features: int = 0, + num_static_real_features: int = 0, + num_static_categorical_features: int = 0, + num_time_features: int = 0, + is_encoder_decoder: bool = False, + encoder_layerdrop: float = 0.1, + + # PatchTST arguments + attention_type: str = "prob", + sampling_factor: int = 5, + distil: bool = True, + **kwargs, + ): + + # time series specific configuration + self.context_length = context_length + self.input_size = input_size + self.num_time_features = num_time_features + self.num_dynamic_real_features = num_dynamic_real_features + self.num_static_real_features = num_static_real_features + self.num_static_categorical_features = num_static_categorical_features + + # Transformer architecture configuration + self.d_model = d_model + self.encoder_attention_heads = encoder_attention_heads + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.dropout = dropout + self.attention_dropout = attention_dropout + self.encoder_layerdrop = encoder_layerdrop + self.shared_embedding = shared_embedding + self.channel_attention = channel_attention + self.norm = norm + self.positional_dropout = positional_dropout + self.dropout_path = dropout_path + self.ff_dropout = ff_dropout + self.bias = bias + self.activation_function = activation_function + self.pre_norm = pre_norm + self.store_attention = store_attn + self.positional_encoding = positional_encoding + 
self.learn_pe = learn_pe + self.use_cls_token = use_cls_token + self.patch_last = patch_last + self.individual = individual + + # PatchTST + self.patch_length = patch_length + self.stride = stride + self.attention_type = attention_type + self.sampling_factor = sampling_factor + self.distil = distil + + # Masking + self.mask_type = mask_type + self.mask_ratio = mask_ratio + self.mask_patches = mask_patches + self.mask_patch_ratios = mask_patch_ratios + self.channel_consistent_masking = channel_consistent_masking + self.d_size = d_size + self.cv_channel_indices = cv_channel_indices + self.mask_value = mask_value + + # Classification + self.pooling = pooling + self.num_classes = num_classes + self.head_dropout = head_dropout + self.proj_dropout = proj_dropout + self.qkv_bias = qkv_bias + + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py new file mode 100755 index 00000000000000..3e861d8b9ac553 --- /dev/null +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -0,0 +1,596 @@ +# coding=utf-8 +# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch PatchTST model.""" + +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +import math +from ...modeling_utils import PreTrainedModel +from ...utils import add_start_docstrings, logging +from ...modeling_outputs import BaseModelOutputWithNoAttention +from .configuration_patchtst import PatchTSTConfig +from torch.nn.modules.activation import MultiheadAttention + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "PatchTSTConfig" + + +PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "ibm/patchtst-base", + # See all PatchTST models at https://huggingface.co/models?filter=patchtst +] + + +class PatchTSTAttention(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + + self.self_attn = MultiheadAttention(embed_dim=config.d_model, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + bias=config.bias, + add_bias_kv=True, + add_zero_attn=False, + batch_first=True + ) + + def forward(self, src: torch.Tensor) -> torch.Tensor: + """ + src: Tensor [bs x q_len x d_model] + """ + src, _ = self.self_attn(src, src, src, need_weights=False) + return src + + +def get_activation_fn(activation): + if callable(activation): return activation() + elif activation.lower() == "relu": return nn.ReLU() + elif activation.lower() == "gelu": return nn.GELU() + raise ValueError(f'{activation} is not available. 
You can use "relu", "gelu", or a callable') + + +class Transpose(nn.Module): + def __init__(self, *dims, contiguous=False): + super().__init__() + self.dims, self.contiguous = dims, contiguous + + def forward(self, x): + if self.contiguous: return x.transpose(*self.dims).contiguous() + else: return x.transpose(*self.dims) + + +def positional_encoding(pe, learn_pe, q_len, d_model): + # Positional encoding + if pe == None: + w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe + nn.init.uniform_(w_pos, -0.02, 0.02) + learn_pe = False + elif pe == 'zero': + w_pos = torch.empty((q_len, 1)) + nn.init.uniform_(w_pos, -0.02, 0.02) + elif pe == 'zeros': + w_pos = torch.empty((q_len, d_model)) + nn.init.uniform_(w_pos, -0.02, 0.02) + elif pe == 'normal' or pe == 'gauss': + w_pos = torch.zeros((q_len, 1)) + torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) + elif pe == 'uniform': + w_pos = torch.zeros((q_len, 1)) + nn.init.uniform_(w_pos, a=0.0, b=0.1) + elif pe == 'lin1d': w_pos = coord1d_pos_encoding(q_len, exponential=False, normalize=True) + elif pe == 'exp1d': w_pos = coord1d_pos_encoding(q_len, exponential=True, normalize=True) + elif pe == 'lin2d': w_pos = coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True) + elif pe == 'exp2d': w_pos = coord2d_pos_encoding(q_len, d_model, exponential=True, normalize=True) + elif pe == 'sincos': + pos_enc = torch.zeros(q_len, d_model) + position = torch.arange(0, q_len).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) + pos_enc[:, 0::2] = torch.sin(position * div_term) + pos_enc[:, 1::2] = torch.cos(position * div_term) + pos_enc = pos_enc - pos_enc.mean() + pos_enc = pos_enc / (pos_enc.std() * 10) + w_pos = pos_enc + else: raise ValueError(f"{pe} is not a valid pe (positional encoder. 
Available types: 'gauss'=='normal', \ + 'zeros', 'zero', uniform', 'lin1d', 'exp1d', 'lin2d', 'exp2d', 'sincos', None.)") + return nn.Parameter(w_pos, requires_grad=learn_pe) + + +def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps=1e-3, verbose=False): + x = .5 if exponential else 1 + i = 0 + for i in range(100): + cpe = 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - 1 + # pv(f'{i:4.0f} {x:5.3f} {cpe.mean():+6.3f}', verbose) + if abs(cpe.mean()) <= eps: break + elif cpe.mean() > eps: x += .001 + else: x -= .001 + i += 1 + if normalize: + cpe = cpe - cpe.mean() + cpe = cpe / (cpe.std() * 10) + return cpe + + +def coord1d_pos_encoding(q_len, exponential=False, normalize=True): + cpe = (2 * (torch.linspace(0, 1, q_len).reshape(-1, 1)**(.5 if exponential else 1)) - 1) + if normalize: + cpe = cpe - cpe.mean() + cpe = cpe / (cpe.std() * 10) + return cpe + + +class TSTEncoderLayer(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.pre_norm = config.pre_norm + + assert not config.d_model % config.encoder_attention_heads, f"d_model ({config.d_model}) must be divisible by n_heads ({config.encoder_attention_heads})" + + # Multi-Head attention + self.self_attn = PatchTSTAttention(config) + + # Add & Norm of the sublayer 1 + self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + if "batch" in config.norm.lower(): + self.norm_sublayer1 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + else: + self.norm_sublayer1 = nn.LayerNorm(config.d_model) + + # Position-wise Feed-Forward + self.ff = nn.Sequential( + nn.Linear(config.d_model, config.encoder_ffn_dim, bias=config.bias), + get_activation_fn(config.activation_function), + nn.Dropout(config.ff_dropout) if config.ff_dropout > 0 else nn.Identity(), + nn.Linear(config.encoder_ffn_dim, config.d_model, bias=config.bias), + ) + + # Add & Norm of sublayer 2 + self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + if "batch" in config.norm.lower(): + self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + else: + self.norm_sublayer2 = nn.LayerNorm(config.d_model) + + def forward(self, src: torch.Tensor): + """ + src: tensor [bs x seq_len x d_model] + Return: + Tensor [bs x seq_len x d_model] + """ + # First sublayer: mixing across time + if self.pre_norm: + ## Norm and Multi-Head attention and Add residual connection + src = src + self.dropout_path1( + self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout + else: + ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT + src = self.norm_sublayer1(src + self.dropout_path1(self.self_attn(src))) + + # Second sublayer: mixing across hidden dimension + if self.pre_norm: + ## Norm and Position-wise Feed-Forward and Add residual connection + src = src + self.dropout_path2( + self.ff(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + else: + ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT + src = self.norm_sublayer2( + src + self.dropout_path2(self.ff(src))) # Add: residual connection with residual dropout + + return src + + +class TSTEncoder(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + + self.layers = nn.ModuleList( + [ + 
TSTEncoderLayer(config) + for i in range(config.encoder_layers) + ] + ) + + def forward(self, src: torch.Tensor, + output_hidden_states: Optional[bool] = False, + output_attention: Optional[bool] = False + ) -> torch.Tensor: + """ + src: tensor [bs x seq_len x d_model] + Return: + Tensor [bs x seq_len x d_model] + """ + all_hidden_states = [] + for mod in self.layers: + if output_hidden_states: + src = mod(src) + all_hidden_states.append(src) + if output_hidden_states: return src, all_hidden_states + return src + + +class PatchTSTPreTrainedModel(PreTrainedModel): + config_class = PatchTSTConfig + base_model_prefix = "model" + main_input_name = "past_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize weights""" + if self.config.use_cls_token: + torch.nn.init.normal_(self.config.cls_token, std=.02) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (PatchTSTEncoder)): + module.gradient_checkpointing = value + + +class PatchTSTEncoder(PatchTSTPreTrainedModel): + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + # self.n_vars = c_in + self.num_patch = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1 + self.d_model = config.d_model + self.shared_embedding = config.shared_embedding + self.use_cls_token = config.use_cls_token + + # Added params for patching + self.patch_last = config.patch_last + self.mask_ratio = config.mask_ratio + + # Input encoding: projection of feature vectors onto a d-dim vector space + if not self.shared_embedding: + self.W_P = nn.ModuleList() + for _ in range(config.input_size): + self.W_P.append(nn.Linear(config.patch_length, self.d_model)) + else: + self.W_P = nn.Linear(config.patch_length, config.d_model) + + # Positional encoding + if self.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) + self.W_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch + 1, config.d_model) + else: + self.W_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch, config.d_model) + + # Positional dropout + self.dropout = nn.Dropout(config.pos_dropout) if config.pos_dropout > 0 else nn.Identity() + + # Encoder + self.encoder = TSTEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + x: tensor [bs x nvars x num_patch x patch_len] #[bs x num_patch x nvars x patch_len] + return: + tensor [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + """ + + # bs, num_patch, n_vars, patch_len = x.shape + bs, n_vars, num_patch, patch_len = x.shape + # Input encoding + if not self.shared_embedding: + x_out = [] + for i in range(n_vars): + z = self.W_P[i](x[:, i, :, :]) + x_out.append(z) + x = torch.stack(x_out, dim=1) + else: + x = self.W_P(x) # x: [bs x nvars x num_patch x d_model] + + # x: [bs x nvars x num_patch x d_model] -> [bs * nvars x num_patch x d_model] + x = x.view(bs * n_vars, num_patch, self.d_model) # x: [bs * nvars x num_patch x d_model] + + if self.use_cls_token: + # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') + x = self.dropout(x + self.W_pos[1:, :]) # x: [bs * nvars x num_patch x d_model] + # append cls token + cls_token = self.cls_token + self.W_pos[:1, :] # cls_token: [1 x 1 x d_model] + cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples + x = torch.cat((cls_tokens, x), 
dim=1) # x: [bs * nvars x (num_patch+1) x d_model] + else: + # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') + x = self.dropout(x + self.W_pos) # x: [bs * nvars x num_patch x d_model] + + # Encoder + x = self.encoder(x) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token + x = torch.reshape(x, (bs, n_vars, -1, self.d_model)) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + return x + + +PATCHTST_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`PatchTSTConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +PATCHTST_INPUTS_DOCSTRING = r""" + Args: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`): + Past values of the time series, that serve as context in order to predict the future. The sequence size of + this tensor must be larger than the `context_length` of the model, since the model will use the larger size + to construct lag features, i.e. additional values from the past which are added in order to serve as "extra + context". + + The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no + `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest + look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of + the past. + + The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as + `static_categorical_features`, `static_real_features`, `past_time_features` and lags). + + Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`. + + For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of + variates in the time series per time step. + past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`): + Required time features, which the model internally will add to `past_values`. These could be things like + "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These + could also be so-called "age" features, which basically help the model know "at which point in life" a + time-series is. Age features have small values for distant past time steps and increase monotonically the + more we approach the current time step. Holiday features are also a good example of time features. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where + the position encodings are learned from scratch internally as parameters of the model, the Time Series + Transformer requires to provide additional time features. 
The Time Series Transformer only learns + additional embeddings for `static_categorical_features`. + + Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features + must but known at prediction time. + + The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in + `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): + Optional static categorical features for which the model will learn an embedding, which it will add to the + values of the time series. + + Static categorical features are features which have the same value for all time steps (static over time). + + A typical example of a static categorical feature is a time series ID. + static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*): + Optional static real features which the model will add to the values of the time series. + + Static real features are features which have the same value for all time steps (static over time). + + A typical example of a static real feature is promotion information. + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*): + Future values of the time series, that serve as labels for the model. The `future_values` is what the + Transformer needs during training to learn to output, given the `past_values`. + + The sequence length here is equal to `prediction_length`. + + See the demo notebook and code snippets for details. + + Optionally, during training any missing values need to be replaced with zeros and indicated via the + `future_observed_mask`. + + For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of + variates in the time series per time step. + future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`): + Required time features for the prediction window, which the model internally will add to `future_values`. + These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as + Fourier features). These could also be so-called "age" features, which basically help the model know "at + which point in life" a time-series is. Age features have small values for distant past time steps and + increase monotonically the more we approach the current time step. Holiday features are also a good example + of time features. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where + the position encodings are learned from scratch internally as parameters of the model, the Time Series + Transformer requires to provide additional time features. The Time Series Transformer only learns + additional embeddings for `static_categorical_features`. + + Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features + must but known at prediction time. 
+ + The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. + future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*): + Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + This mask is used to filter out missing values for the final loss calculation. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to + make sure the model can only look at previous inputs in order to predict the future. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class PatchTSTEncoder(PatchTSTPreTrainedModel): + """ + PatchTST encoder consisting of *config.encoder_layers* self attention layers with distillation layers. Each + attention layer is an [`PatchTSTEncoderLayer`]. + + Args: + config: PatchTSTConfig + """ + + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + # self.n_vars = c_in + self.num_patch = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1 + self.d_model = config.d_model + self.shared_embedding = config.shared_embedding + self.use_cls_token = config.use_cls_token + + # Added params for patching + self.patch_last = config.patch_last + self.mask_ratio = config.mask_ratio + + # Input encoding: projection of feature vectors onto a d-dim vector space + if not self.shared_embedding: + self.w_p = nn.ModuleList() + for _ in range(config.input_size): + self.w_p.append(nn.Linear(config.patch_length, self.d_model)) + else: + self.w_p = nn.Linear(config.patch_length, config.d_model) + + # Positional encoding + if self.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) + self.w_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch + 1, config.d_model) + else: + self.w_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch, config.d_model) + + # Positional dropout + self.dropout = nn.Dropout(config.pos_dropout) if config.pos_dropout > 0 else nn.Identity() + + # Encoder + self.encoder = TSTEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + x: tensor [bs x nvars x num_patch x patch_len] #[bs x num_patch x nvars x patch_len] + return: + tensor [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + """ + + # bs, num_patch, n_vars, patch_len = x.shape + bs, n_vars, num_patch, patch_len = x.shape + # Input encoding + if not self.shared_embedding: + x_out = [] + for i in range(n_vars): + z = self.w_p[i](x[:, i, :, :]) + x_out.append(z) + x = torch.stack(x_out, dim=1) + else: + x = self.w_p(x) # x: [bs x nvars x num_patch x d_model] + + # x: [bs x nvars x num_patch x d_model] -> [bs * nvars x num_patch x d_model] + x = x.view(bs * n_vars, num_patch, self.d_model) # x: [bs * nvars x num_patch x d_model] + + if self.use_cls_token: + # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') + x = self.dropout(x + self.w_pos[1:, :]) # x: [bs * nvars x num_patch x d_model] + # append cls token + cls_token = 
self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x d_model] + cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples + x = torch.cat((cls_tokens, x), dim=1) # x: [bs * nvars x (num_patch+1) x d_model] + else: + # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') + x = self.dropout(x + self.w_pos) # x: [bs * nvars x num_patch x d_model] + + # Encoder + x = self.encoder( + x) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token + x = torch.reshape(x, (bs, n_vars, -1, + self.d_model)) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + return x + + +@add_start_docstrings( + "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", + PATCHTST_START_DOCSTRING, +) +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST +class PatchTSTModel(PatchTSTPreTrainedModel): + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + self.encoder = PatchTSTEncoder(config) + + def forward(self, x: torch.Tensor): + encoder_output = self.encoder(x) + return BaseModelOutputWithNoAttention( + last_hidden_state=encoder_output, + hidden_states=None + ) + + diff --git a/tests/models/patchtst/__init__.py b/tests/models/patchtst/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py new file mode 100644 index 00000000000000..cf8060a284f232 --- /dev/null +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -0,0 +1,512 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch PatchTST model. 
""" + +import inspect +import tempfile +import unittest + +import numpy as np +from huggingface_hub import hf_hub_download + +from transformers import is_torch_available +from transformers.testing_utils import is_flaky, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +TOLERANCE = 1e-4 + +if is_torch_available(): + import torch + + from transformers import PatchTSTConfig, PatchTSTForPrediction, PatchTSTModel + from transformers.models.patchtst.modeling_patchtst import PatchTSTDecoder, PatchTSTEncoder + + +@require_torch +class PatchTSTModelTester: + def __init__( + self, + parent, + batch_size=13, + prediction_length=7, + context_length=14, + cardinality=19, + embedding_dimension=5, + num_time_features=4, + is_training=True, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + lags_sequence=[1, 2, 3, 4, 5], + sampling_factor=10, + distil=False, + ): + self.parent = parent + self.batch_size = batch_size + self.prediction_length = prediction_length + self.context_length = context_length + self.cardinality = cardinality + self.num_time_features = num_time_features + self.lags_sequence = lags_sequence + self.embedding_dimension = embedding_dimension + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + + self.encoder_seq_length = min( + sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length + ) + self.decoder_seq_length = min( + sampling_factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length + ) + self.sampling_factor = sampling_factor + self.distil = distil + + def get_config(self): + return PatchTSTConfig( + prediction_length=self.prediction_length, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + context_length=self.context_length, + lags_sequence=self.lags_sequence, + num_time_features=self.num_time_features, + num_static_categorical_features=1, + num_static_real_features=1, + cardinality=[self.cardinality], + embedding_dimension=[self.embedding_dimension], + sampling_factor=self.sampling_factor, + distil=self.distil, + ) + + def prepare_patchtst_inputs_dict(self, config): + _past_length = config.context_length + max(config.lags_sequence) + + static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0]) + static_real_features = floats_tensor([self.batch_size, 1]) + + past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features]) + past_values = floats_tensor([self.batch_size, _past_length]) + past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5 + + # decoder inputs + future_time_features = floats_tensor([self.batch_size, 
config.prediction_length, config.num_time_features]) + future_values = floats_tensor([self.batch_size, config.prediction_length]) + + inputs_dict = { + "past_values": past_values, + "static_categorical_features": static_categorical_features, + "static_real_features": static_real_features, + "past_time_features": past_time_features, + "past_observed_mask": past_observed_mask, + "future_time_features": future_time_features, + "future_values": future_values, + } + return inputs_dict + + def prepare_config_and_inputs(self): + config = self.get_config() + inputs_dict = self.prepare_patchtst_inputs_dict(config) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = PatchTSTModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = PatchTSTEncoder.from_pretrained(tmpdirname).to(torch_device) + + transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) + enc_input = transformer_inputs[:, : config.context_length, ...] + dec_input = transformer_inputs[:, config.context_length :, ...] + + encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = PatchTSTDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + inputs_embeds=dec_input, + encoder_hidden_states=encoder_last_hidden_state, + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (PatchTSTModel, PatchTSTForPrediction) if is_torch_available() else () + all_generative_model_classes = (PatchTSTForPrediction,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_missing_keys = False + test_torchscript = False + test_inputs_embeds = False + test_model_common_attributes = False + + def setUp(self): + self.model_tester = PatchTSTModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=PatchTSTConfig, + has_text_modality=False, + prediction_length=self.model_tester.prediction_length, + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, _ = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, 
model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.context_length + if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: + seq_length = seq_length * self.model_tester.chunk_length + else: + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "prediction_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # Ignore since we have no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + def test_model_outputs_equivalence(self): + pass + + def test_determinism(self): + pass + + # # Input is 'static_categorical_features' not 'input_ids' + def test_model_main_input_name(self): + model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) + # The main input is the name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = [ + "past_values", + "past_time_features", + "past_observed_mask", + "static_categorical_features", + "static_real_features", + "future_values", + "future_time_features", + ] + + expected_arg_names.extend( + [ + "future_observed_mask", + "decoder_attention_mask", + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + "encoder_outputs", + "past_key_values", + "output_hidden_states", + "output_attentions", + "use_cache", + "return_dict", + ] + if "future_observed_mask" in arg_names + else [ + "decoder_attention_mask", + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + "encoder_outputs", + "past_key_values", + "output_hidden_states", + "output_attentions", + "use_cache", + "return_dict", + ] + ) + + self.assertListEqual(arg_names[: len(expected_arg_names)], 
expected_arg_names) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + context_length = getattr(self.model_tester, "context_length", seq_len) + prediction_length = getattr(self.model_tester, "prediction_length", seq_len) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, context_length], + ) + out_len = len(outputs) + + correct_outlen = 7 + + if "last_hidden_state" in outputs: + correct_outlen += 1 + + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + if "loss" in outputs: + correct_outlen += 1 + + if "params" in outputs: + correct_outlen += 1 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_seq_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 2, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, context_length], + ) + + @is_flaky() + def test_retain_grad_hidden_states_attentions(self): + super().test_retain_grad_hidden_states_attentions() + + +def prepare_batch(filename="train-batch.pt"): + file = 
hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") + batch = torch.load(file, map_location=torch_device) + return batch + + +@require_torch +@slow +class PatchTSTModelIntegrationTests(unittest.TestCase): + def test_inference_no_head(self): + model = PatchTSTModel.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) + batch = prepare_batch() + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"], + past_time_features=batch["past_time_features"], + past_observed_mask=batch["past_observed_mask"], + static_categorical_features=batch["static_categorical_features"], + future_values=batch["future_values"], + future_time_features=batch["future_time_features"], + ).last_hidden_state + expected_shape = torch.Size((64, model.config.context_length, model.config.d_model)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]], + device=torch_device, + ) + self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_inference_head(self): + model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) + batch = prepare_batch("val-batch.pt") + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"], + past_time_features=batch["past_time_features"], + past_observed_mask=batch["past_observed_mask"], + static_categorical_features=batch["static_categorical_features"], + future_time_features=batch["future_time_features"], + ).encoder_last_hidden_state + + # encoder distils the context length to 1/8th of the original length + expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], device=torch_device + ) + self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_seq_to_seq_generation(self): + model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) + batch = prepare_batch("val-batch.pt") + + torch.manual_seed(0) + with torch.no_grad(): + outputs = model.generate( + static_categorical_features=batch["static_categorical_features"], + past_time_features=batch["past_time_features"], + past_values=batch["past_values"], + future_time_features=batch["future_time_features"], + past_observed_mask=batch["past_observed_mask"], + ) + expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) + self.assertEqual(outputs.sequences.shape, expected_shape) + + expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) + mean_prediction = outputs.sequences.mean(dim=1) + self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) From 97628bab3853e8182ec2ff4a602cb5202d869c59 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 16 Aug 2023 19:06:51 -0400 Subject: [PATCH 002/189] Add PatchTSTForPretraining --- src/transformers/models/patchtst/__init__.py | 4 +- .../models/patchtst/modeling_patchtst.py | 268 +++++++++++++++++- 2 files changed, 263 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 
e633177a381952..73333c3fee067a 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -32,7 +32,7 @@ else: _import_structure["modeling_patchtst"] = [ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", - "PatchTSTForPrediction", + "PatchTSTForPretraining", "PatchTSTModel", "PatchTSTPreTrainedModel", ] @@ -49,7 +49,7 @@ else: from .modeling_patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, - PatchTSTForPrediction, + PatchTSTForPretraining, PatchTSTModel, PatchTSTPreTrainedModel, ) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3e861d8b9ac553..6e72fb057bd845 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -25,7 +25,7 @@ from ...modeling_outputs import BaseModelOutputWithNoAttention from .configuration_patchtst import PatchTSTConfig from torch.nn.modules.activation import MultiheadAttention - +from ...utils import ModelOutput logger = logging.get_logger(__name__) @@ -265,12 +265,12 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if self.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.W_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch + 1, config.d_model) + self.W_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model) else: - self.W_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch, config.d_model) + self.W_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch, config.d_model) # Positional dropout - self.dropout = nn.Dropout(config.pos_dropout) if config.pos_dropout > 0 else nn.Identity() + self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() # Encoder self.encoder = TSTEncoder(config) @@ -521,12 +521,12 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if self.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.w_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch + 1, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model) else: - self.w_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch, config.d_model) # Positional dropout - self.dropout = nn.Dropout(config.pos_dropout) if config.pos_dropout > 0 else nn.Identity() + self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() # Encoder self.encoder = TSTEncoder(config) @@ -594,3 +594,257 @@ def forward(self, x: torch.Tensor): ) +class PretrainHead(nn.Module): + def __init__(self, config): + super().__init__() + self.dropout = nn.Dropout(config.dropout) + self.linear = nn.Linear(config.d_model, config.patch_length) + self.use_cls_token = config.use_cls_token + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + x: tensor [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: tensor [bs x nvars x num_patch x patch_len] + """ + x = self.linear(self.dropout(x)) # [bs x nvars x num_patch x patch_len] + if self.use_cls_token: x = x[:, :, 1:, :] # remove the first cls token + return x + + +def cv_random_masking(xb: 
torch.Tensor, + mask_ratio: float, + cv_channel_indices: list = None, + channel_consistent_masking: bool = True, + d_size="4D", + mask_value=0): + """cv_random_masking: Mask the input considering the control variables. + + Args: + xb (Tensor): Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] + mask_ratio (float): Mask ratio. + cv_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". + mask_value (int, optional): Value to use for masking. Defaults to 0. + + Returns: + Tensor: xb_mask, masked input, same shape as input + Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x tsg2 x c x n] + """ + if d_size == "4D": + bs, nvars, L, D = xb.shape + + len_keep = int(L * (1 - mask_ratio)) + + if d_size == "4D": + if channel_consistent_masking: + noise = torch.rand(bs, 1, L, device=xb.device) # noise in [0, 1], bs x 1 x L + noise = noise.repeat(1, nvars, 1) # bs x nvars x L + else: + noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L + + mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] + mask[:, :, :len_keep] = 0 + + # sort noise for each sample + ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + mask = torch.gather(mask, dim=-1, index=ids_restore) + + if d_size == "4D": + mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + if cv_channel_indices is not None: + mask[:, cv_channel_indices, :, :] = 0 + + xb_mask = xb.masked_fill(mask.bool(), mask_value) + return xb_mask, mask[..., 0] + + +class PatchMasking(nn.Module): + def __init__(self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = True, + d_size: str = "4D", + cv_channel_indices: list = None, + mask_value=0, ): + """PatchMasking: Class to random or forcast masking. + + Args: + mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. + mask_ratio (float, optional): Mask ratio. + mask_patches (list, optional): List of patch lengths to mask in the end of the data. + mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. + if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. + cv_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". + mask_value (int, optional): Value to use for masking. Defaults to 0. 
+ """ + + self.mask_ratio = mask_ratio + self.channel_consistent_masking = channel_consistent_masking + self.d_size = d_size + self.mask_type = mask_type + self.mask_patches = mask_patches + self.mask_patch_ratios = mask_patch_ratios + self.cv_channel_indices = cv_channel_indices + self.mask_value = mask_value + if self.cv_channel_indices is not None: + self.cv_channel_indices.sort() + + super().__init__() + + def forward(self, x: torch.Tensor): + + """ + Input: + x: patched input + 4D: [bs x n_vars x num_patch x patch_len] + + Output: + x_mask: Masked patched input + 4D: [bs x n_vars x num_patch x patch_len] + mask: bool tensor indicating True on masked points + 4D: [bs x n_vars x num_patch] + """ + + if self.mask_type == "random": + x_mask, mask = cv_random_masking(xb=x, + mask_ratio=self.mask_ratio, + cv_channel_indices=self.cv_channel_indices, + channel_consistent_masking=self.channel_consistent_masking, + d_size=self.d_size, + mask_value=self.mask_value) + + else: + raise Exception("Invalid mask type") + + mask = mask.bool() # mask: [bs x n_vars x num_patch] + + return x_mask, mask + + +class Patch(nn.Module): + """ + A class to patchify the time series sequence into different patches + """ + def __init__(self, + seq_len: int, + patch_len: int, + stride: int, + padding: bool = False # TODO: use this to set whether we want to pad zeros to the sequence + ): + super().__init__() + + assert (seq_len > patch_len), f'Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})' + + self.seq_len = seq_len + self.patch_len = patch_len + self.stride = stride + + # get the number of patches + self.num_patch = (max(seq_len, patch_len) - patch_len) // stride + 1 + tgt_len = patch_len + stride * (self.num_patch - 1) + self.s_begin = seq_len - tgt_len + + def forward(self, x: torch.Tensor): + """ + + Args: + x (torch.Tensor, required): Input of shape [bs x ... x seq_len x n_vars] + Returns: + z: output tensor data [bs x ... x n_vars x num_patch x patch_len] + """ + seq_len = x.shape[-2] + assert (seq_len == self.seq_len), f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." + + # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] + z = x.transpose(0, -2)[self.s_begin:] # z: [tgt_len x ... x bs x n_vars] + z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution + z = z.unfold(dimension=-2, size=self.patch_len, step=self.stride) # xb: [bs x ... x num_patch x n_vars x patch_len] + z = z.transpose(-2, -3).contiguous() # xb: [bs x ... x n_vars x num_patch x patch_len] + return z + + +class PatchTSTForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class PatchTSTForPretraining(PatchTSTPreTrainedModel): + # PatchTSTModel + Pretraining Head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.patching = Patch(config.context_length, + patch_len=config.patch_length, + stride=config.stride) + self.masking = PatchMasking(mask_type=config.mask_type, + mask_ratio=config.mask_ratio, + mask_patches=config.mask_patches, + mask_patch_ratios=config.mask_patch_ratios, + channel_consistent_masking=config.channel_consistent_masking, + d_size=config.d_size, + cv_channel_indices=config.cv_channel_indices, + mask_value=config.mask_value) + self.model = PatchTSTModel(config) + self.head = PretrainHead(config) + self.loss = torch.nn.MSELoss(reduction='mean') + + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None + ) -> PatchTSTForPreTrainingOutput: + """ + past_values (x): tensor [bs x n_vars x num_patch x patch_len] + future_values (y): labels + """ + + # x: [bs x n_vars x num_patch x patch_len] for pretrain + + patched_x = self.patching(past_values) + masked_x, masked = self.masking(patched_x) + model_output = self.model(masked_x) # x: [bs x nvars x num_patch x d_model] + # or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] + + loss_val = self.loss(x_hat, patched_x) + return PatchTSTForPreTrainingOutput( + loss=loss_val, + prediction_logits=x_hat, + ) + + + From 10b75170f56041f42403b5ce93886cbd4e8a1034 Mon Sep 17 00:00:00 2001 From: "Wesley M. 
Gifford" Date: Mon, 21 Aug 2023 08:15:15 -0400 Subject: [PATCH 003/189] update to include classification Co-authored-by: Phanwadee Sinthong Co-authored-by: Nam Nguyen Co-authored-by: Vijay Ekambaram Co-authored-by: Ngoc Diep Do <55230119+diepi@users.noreply.github.com> Co-authored-by: Wesley Gifford <79663411+wgifford@users.noreply.github.com> --- .../models/auto/configuration_auto.py | 14 +- src/transformers/models/auto/modeling_auto.py | 10 + .../models/patchtst/modeling_patchtst.py | 350 ++++++++++++------ 3 files changed, 250 insertions(+), 124 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ac524d6882ad82..f3d32bc388b9f5 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -113,8 +113,6 @@ ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), - ("patchtst", "PatchTSTConfig"), - ("patchtst", "PatchTSTConfig"), ("instructblip", "InstructBlipConfig"), ("jukebox", "JukeboxConfig"), ("layoutlm", "LayoutLMConfig"), @@ -159,6 +157,7 @@ ("openai-gpt", "OpenAIGPTConfig"), ("opt", "OPTConfig"), ("owlvit", "OwlViTConfig"), + ("patchtst", "PatchTSTConfig"), ("pegasus", "PegasusConfig"), ("pegasus_x", "PegasusXConfig"), ("perceiver", "PerceiverConfig"), @@ -200,8 +199,6 @@ ("table-transformer", "TableTransformerConfig"), ("tapas", "TapasConfig"), ("time_series_transformer", "TimeSeriesTransformerConfig"), - ("patchtst", "PatchTSTConfig"), - ("patchtst", "PatchTSTConfig"), ("timesformer", "TimesformerConfig"), ("timm_backbone", "TimmBackboneConfig"), ("trajectory_transformer", "TrajectoryTransformerConfig"), @@ -323,8 +320,6 @@ ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("instructblip", "INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("jukebox", "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -365,6 +360,7 @@ ("openai-gpt", "OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("opt", "OPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("owlvit", "OWLVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("pegasus", "PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("pegasus_x", "PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("perceiver", "PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -403,8 +399,6 @@ ("table-transformer", "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("tvlt", "TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -535,7 +529,6 @@ ("idefics", "IDEFICS"), ("imagegpt", "ImageGPT"), ("informer", "Informer"), - ("patchtst", "PatchTST"), ("instructblip", "InstructBLIP"), ("jukebox", "Jukebox"), ("layoutlm", "LayoutLM"), @@ -588,6 +581,7 @@ ("openai-gpt", "OpenAI GPT"), ("opt", "OPT"), ("owlvit", "OWL-ViT"), + ("patchtst", "PatchTST"), ("pegasus", "Pegasus"), ("pegasus_x", "PEGASUS-X"), ("perceiver", "Perceiver"), @@ -632,8 +626,6 @@ 
("tapas", "TAPAS"), ("tapex", "TAPEX"), ("time_series_transformer", "Time Series Transformer"), - ("patchtst", "patchtst"), - ("patchtst", "PatchTST"), ("timesformer", "TimeSformer"), ("timm_backbone", "TimmBackbone"), ("trajectory_transformer", "Trajectory Transformer"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 75cf77cc73a809..64ccf4061b28aa 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1111,6 +1111,12 @@ ] ) +MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("PatchTST", "PatchTSTForClassification"), + ] +) + MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES) MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES) @@ -1196,6 +1202,10 @@ MODEL_FOR_TEXT_ENCODING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES) +MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES +) + class AutoModelForMaskGeneration(_BaseAutoModelClass): _model_mapping = MODEL_FOR_MASK_GENERATION_MAPPING diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6e72fb057bd845..f661008f022c02 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -42,14 +42,15 @@ class PatchTSTAttention(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.self_attn = MultiheadAttention(embed_dim=config.d_model, - num_heads=config.encoder_attention_heads, - dropout=config.attention_dropout, - bias=config.bias, - add_bias_kv=True, - add_zero_attn=False, - batch_first=True - ) + self.self_attn = MultiheadAttention( + embed_dim=config.d_model, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + bias=config.bias, + add_bias_kv=True, + add_zero_attn=False, + batch_first=True, + ) def forward(self, src: torch.Tensor) -> torch.Tensor: """ @@ -60,9 +61,12 @@ def forward(self, src: torch.Tensor) -> torch.Tensor: def get_activation_fn(activation): - if callable(activation): return activation() - elif activation.lower() == "relu": return nn.ReLU() - elif activation.lower() == "gelu": return nn.GELU() + if callable(activation): + return activation() + elif activation.lower() == "relu": + return nn.ReLU() + elif activation.lower() == "gelu": + return nn.GELU() raise ValueError(f'{activation} is not available. 
You can use "relu", "gelu", or a callable') @@ -72,33 +76,39 @@ def __init__(self, *dims, contiguous=False): self.dims, self.contiguous = dims, contiguous def forward(self, x): - if self.contiguous: return x.transpose(*self.dims).contiguous() - else: return x.transpose(*self.dims) + if self.contiguous: + return x.transpose(*self.dims).contiguous() + else: + return x.transpose(*self.dims) def positional_encoding(pe, learn_pe, q_len, d_model): # Positional encoding if pe == None: - w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe + w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False - elif pe == 'zero': + elif pe == "zero": w_pos = torch.empty((q_len, 1)) nn.init.uniform_(w_pos, -0.02, 0.02) - elif pe == 'zeros': + elif pe == "zeros": w_pos = torch.empty((q_len, d_model)) nn.init.uniform_(w_pos, -0.02, 0.02) - elif pe == 'normal' or pe == 'gauss': + elif pe == "normal" or pe == "gauss": w_pos = torch.zeros((q_len, 1)) torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) - elif pe == 'uniform': + elif pe == "uniform": w_pos = torch.zeros((q_len, 1)) nn.init.uniform_(w_pos, a=0.0, b=0.1) - elif pe == 'lin1d': w_pos = coord1d_pos_encoding(q_len, exponential=False, normalize=True) - elif pe == 'exp1d': w_pos = coord1d_pos_encoding(q_len, exponential=True, normalize=True) - elif pe == 'lin2d': w_pos = coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True) - elif pe == 'exp2d': w_pos = coord2d_pos_encoding(q_len, d_model, exponential=True, normalize=True) - elif pe == 'sincos': + elif pe == "lin1d": + w_pos = coord1d_pos_encoding(q_len, exponential=False, normalize=True) + elif pe == "exp1d": + w_pos = coord1d_pos_encoding(q_len, exponential=True, normalize=True) + elif pe == "lin2d": + w_pos = coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True) + elif pe == "exp2d": + w_pos = coord2d_pos_encoding(q_len, d_model, exponential=True, normalize=True) + elif pe == "sincos": pos_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) @@ -107,20 +117,29 @@ def positional_encoding(pe, learn_pe, q_len, d_model): pos_enc = pos_enc - pos_enc.mean() pos_enc = pos_enc / (pos_enc.std() * 10) w_pos = pos_enc - else: raise ValueError(f"{pe} is not a valid pe (positional encoder. Available types: 'gauss'=='normal', \ - 'zeros', 'zero', uniform', 'lin1d', 'exp1d', 'lin2d', 'exp2d', 'sincos', None.)") + else: + raise ValueError( + f"{pe} is not a valid pe (positional encoder. 
Available types: 'gauss'=='normal', \ + 'zeros', 'zero', uniform', 'lin1d', 'exp1d', 'lin2d', 'exp2d', 'sincos', None.)" + ) return nn.Parameter(w_pos, requires_grad=learn_pe) def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps=1e-3, verbose=False): - x = .5 if exponential else 1 + x = 0.5 if exponential else 1 i = 0 for i in range(100): - cpe = 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - 1 + cpe = ( + 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) + - 1 + ) # pv(f'{i:4.0f} {x:5.3f} {cpe.mean():+6.3f}', verbose) - if abs(cpe.mean()) <= eps: break - elif cpe.mean() > eps: x += .001 - else: x -= .001 + if abs(cpe.mean()) <= eps: + break + elif cpe.mean() > eps: + x += 0.001 + else: + x -= 0.001 i += 1 if normalize: cpe = cpe - cpe.mean() @@ -129,7 +148,7 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= def coord1d_pos_encoding(q_len, exponential=False, normalize=True): - cpe = (2 * (torch.linspace(0, 1, q_len).reshape(-1, 1)**(.5 if exponential else 1)) - 1) + cpe = 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** (0.5 if exponential else 1)) - 1 if normalize: cpe = cpe - cpe.mean() cpe = cpe / (cpe.std() * 10) @@ -141,7 +160,9 @@ def __init__(self, config: PatchTSTConfig): super().__init__() self.pre_norm = config.pre_norm - assert not config.d_model % config.encoder_attention_heads, f"d_model ({config.d_model}) must be divisible by n_heads ({config.encoder_attention_heads})" + assert ( + not config.d_model % config.encoder_attention_heads + ), f"d_model ({config.d_model}) must be divisible by n_heads ({config.encoder_attention_heads})" # Multi-Head attention self.self_attn = PatchTSTAttention(config) @@ -178,7 +199,8 @@ def forward(self, src: torch.Tensor): if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer1(src)) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1(src + self.dropout_path1(self.self_attn(src))) @@ -187,11 +209,13 @@ def forward(self, src: torch.Tensor): if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path2( - self.ff(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer2(src)) + ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer2( - src + self.dropout_path2(self.ff(src))) # Add: residual connection with residual dropout + src + self.dropout_path2(self.ff(src)) + ) # Add: residual connection with residual dropout return src @@ -200,17 +224,11 @@ class TSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList( - [ - TSTEncoderLayer(config) - for i in range(config.encoder_layers) - ] - ) + self.layers = nn.ModuleList([TSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, src: torch.Tensor, - output_hidden_states: Optional[bool] = False, - output_attention: Optional[bool] = False - ) -> torch.Tensor: + def forward( + self, src: torch.Tensor, 
output_hidden_states: Optional[bool] = False, output_attention: Optional[bool] = False + ) -> torch.Tensor: """ src: tensor [bs x seq_len x d_model] Return: @@ -221,9 +239,10 @@ def forward(self, src: torch.Tensor, if output_hidden_states: src = mod(src) all_hidden_states.append(src) - if output_hidden_states: return src, all_hidden_states + if output_hidden_states: + return src, all_hidden_states return src - + class PatchTSTPreTrainedModel(PreTrainedModel): config_class = PatchTSTConfig @@ -234,12 +253,12 @@ class PatchTSTPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize weights""" if self.config.use_cls_token: - torch.nn.init.normal_(self.config.cls_token, std=.02) + torch.nn.init.normal_(self.config.cls_token, std=0.02) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (PatchTSTEncoder)): module.gradient_checkpointing = value - + class PatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): @@ -265,9 +284,13 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if self.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.W_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model) + self.W_pos = positional_encoding( + config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model + ) else: - self.W_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch, config.d_model) + self.W_pos = positional_encoding( + config.positional_encoding, config.learn_pe, self.num_patch, config.d_model + ) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -313,8 +336,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.dropout(x + self.W_pos) # x: [bs * nvars x num_patch x d_model] # Encoder - x = self.encoder(x) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token - x = torch.reshape(x, (bs, n_vars, -1, self.d_model)) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x = self.encoder( + x + ) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token + x = torch.reshape( + x, (bs, n_vars, -1, self.d_model) + ) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token return x @@ -521,9 +548,13 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if self.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model + ) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch, config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, self.num_patch, config.d_model + ) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -570,9 +601,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Encoder x = self.encoder( - x) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token - x = torch.reshape(x, (bs, n_vars, -1, - self.d_model)) # x: [bs x 
nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x + ) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token + x = torch.reshape( + x, (bs, n_vars, -1, self.d_model) + ) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token return x @@ -588,10 +621,7 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x: torch.Tensor): encoder_output = self.encoder(x) - return BaseModelOutputWithNoAttention( - last_hidden_state=encoder_output, - hidden_states=None - ) + return BaseModelOutputWithNoAttention(last_hidden_state=encoder_output, hidden_states=None) class PretrainHead(nn.Module): @@ -608,16 +638,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: output: tensor [bs x nvars x num_patch x patch_len] """ x = self.linear(self.dropout(x)) # [bs x nvars x num_patch x patch_len] - if self.use_cls_token: x = x[:, :, 1:, :] # remove the first cls token + if self.use_cls_token: + x = x[:, :, 1:, :] # remove the first cls token return x -def cv_random_masking(xb: torch.Tensor, - mask_ratio: float, - cv_channel_indices: list = None, - channel_consistent_masking: bool = True, - d_size="4D", - mask_value=0): +def cv_random_masking( + xb: torch.Tensor, + mask_ratio: float, + cv_channel_indices: list = None, + channel_consistent_masking: bool = True, + d_size="4D", + mask_value=0, +): """cv_random_masking: Mask the input considering the control variables. Args: @@ -662,15 +695,17 @@ def cv_random_masking(xb: torch.Tensor, class PatchMasking(nn.Module): - def __init__(self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = True, - d_size: str = "4D", - cv_channel_indices: list = None, - mask_value=0, ): + def __init__( + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = True, + d_size: str = "4D", + cv_channel_indices: list = None, + mask_value=0, + ): """PatchMasking: Class to random or forcast masking. 
Args: @@ -699,7 +734,6 @@ def __init__(self, super().__init__() def forward(self, x: torch.Tensor): - """ Input: x: patched input @@ -713,12 +747,14 @@ def forward(self, x: torch.Tensor): """ if self.mask_type == "random": - x_mask, mask = cv_random_masking(xb=x, - mask_ratio=self.mask_ratio, - cv_channel_indices=self.cv_channel_indices, - channel_consistent_masking=self.channel_consistent_masking, - d_size=self.d_size, - mask_value=self.mask_value) + x_mask, mask = cv_random_masking( + xb=x, + mask_ratio=self.mask_ratio, + cv_channel_indices=self.cv_channel_indices, + channel_consistent_masking=self.channel_consistent_masking, + d_size=self.d_size, + mask_value=self.mask_value, + ) else: raise Exception("Invalid mask type") @@ -732,15 +768,19 @@ class Patch(nn.Module): """ A class to patchify the time series sequence into different patches """ - def __init__(self, - seq_len: int, - patch_len: int, - stride: int, - padding: bool = False # TODO: use this to set whether we want to pad zeros to the sequence - ): + + def __init__( + self, + seq_len: int, + patch_len: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + ): super().__init__() - assert (seq_len > patch_len), f'Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})' + assert ( + seq_len > patch_len + ), f"Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})" self.seq_len = seq_len self.patch_len = patch_len @@ -760,12 +800,14 @@ def forward(self, x: torch.Tensor): z: output tensor data [bs x ... x n_vars x num_patch x patch_len] """ seq_len = x.shape[-2] - assert (seq_len == self.seq_len), f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." + assert seq_len == self.seq_len, f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] - z = x.transpose(0, -2)[self.s_begin:] # z: [tgt_len x ... x bs x n_vars] - z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution - z = z.unfold(dimension=-2, size=self.patch_len, step=self.stride) # xb: [bs x ... x num_patch x n_vars x patch_len] + z = x.transpose(0, -2)[self.s_begin :] # z: [tgt_len x ... x bs x n_vars] + z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution + z = z.unfold( + dimension=-2, size=self.patch_len, step=self.stride + ) # xb: [bs x ... x num_patch x n_vars x patch_len] z = z.transpose(-2, -3).contiguous() # xb: [bs x ... 
x n_vars x num_patch x patch_len] return z @@ -808,25 +850,24 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, - patch_len=config.patch_length, - stride=config.stride) - self.masking = PatchMasking(mask_type=config.mask_type, - mask_ratio=config.mask_ratio, - mask_patches=config.mask_patches, - mask_patch_ratios=config.mask_patch_ratios, - channel_consistent_masking=config.channel_consistent_masking, - d_size=config.d_size, - cv_channel_indices=config.cv_channel_indices, - mask_value=config.mask_value) + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + self.masking = PatchMasking( + mask_type=config.mask_type, + mask_ratio=config.mask_ratio, + mask_patches=config.mask_patches, + mask_patch_ratios=config.mask_patch_ratios, + channel_consistent_masking=config.channel_consistent_masking, + d_size=config.d_size, + cv_channel_indices=config.cv_channel_indices, + mask_value=config.mask_value, + ) self.model = PatchTSTModel(config) self.head = PretrainHead(config) - self.loss = torch.nn.MSELoss(reduction='mean') + self.loss = torch.nn.MSELoss(reduction="mean") - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None - ) -> PatchTSTForPreTrainingOutput: + def forward( + self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None + ) -> PatchTSTForPreTrainingOutput: """ past_values (x): tensor [bs x n_vars x num_patch x patch_len] future_values (y): labels @@ -838,7 +879,7 @@ def forward(self, masked_x, masked = self.masking(patched_x) model_output = self.model(masked_x) # x: [bs x nvars x num_patch x d_model] # or [bs x nvars x (num_patch+1) x d_model] if use cls_token - x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] + x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] loss_val = self.loss(x_hat, patched_x) return PatchTSTForPreTrainingOutput( @@ -847,4 +888,87 @@ def forward(self, ) +class PatchTSTForClassification(PatchTSTPretrainedModel): + # PatchTST model + classification head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.patching = Patch(config.seq_len, patch_len=config.patch_len, stride=config.stride) + + self.model = PatchTSTModel(config) + self.head = ClassificationHead(config) + self.loss = nn.CrossEntropyLoss() + + def forward(self, x, y=None): + patched_x = self.patching(x) + model_output = self.model(patched_x) + y_hat = self.head(model_output[0]) + + loss_val = None + if y is not None: + loss_val = self.loss(y_hat, y) + return PatchTSTForClassificationOutput( + loss=loss_val, + prediction_logits=y_hat, + ) + + +class ClassificationHead(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.use_cls_token = config.use_cls_token + self.pooling = config.pooling + self.flatten = nn.Flatten(start_dim=1) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + self.linear = nn.Linear(config.n_vars * config.d_model, config.n_classes) + + def forward(self, x): + """ + x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x n_classes] + """ + if self.use_cls_token: + x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model + elif self.pooling == "mean": + x = x.mean(dim=2) # x: [bs x nvars x d_model] + elif self.pooling == 
"max": + x = x.max(dim=2) # x: [bs x nvars x d_model] + else: + raise Exception(f"pooling operator {self.pooling} is not implemented yet") + x = self.flatten(x) # x: bs x nvars * d_model + y = self.linear(self.dropout(x)) # y: bs x n_classes + return y + + +class PatchTSTForClassificationOutput(ModelOutput): + """ + Output type of [`PatchTSTForClassification`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None From 1935eef97d52eb98caadd90828124e41126d8001 Mon Sep 17 00:00:00 2001 From: "Wesley M. 
Gifford" Date: Mon, 21 Aug 2023 09:23:39 -0400 Subject: [PATCH 004/189] clean up auto files --- src/transformers/models/auto/modeling_auto.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 64ccf4061b28aa..c3f1929190e3e5 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -111,7 +111,6 @@ ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), ("patchtst", "PatchTSTModel"), - ("patchtst", "PatchTSTModel"), ("jukebox", "JukeboxModel"), ("layoutlm", "LayoutLMModel"), ("layoutlmv2", "LayoutLMv2Model"), @@ -189,8 +188,6 @@ ("table-transformer", "TableTransformerModel"), ("tapas", "TapasModel"), ("time_series_transformer", "TimeSeriesTransformerModel"), - ("patchtst", "PatchTSTModel"), - ("patchtst", "PatchTSTModel"), ("timesformer", "TimesformerModel"), ("timm_backbone", "TimmBackbone"), ("trajectory_transformer", "TrajectoryTransformerModel"), From c6195cb3d839a11e2854a110dc5d0530ae215833 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 22 Aug 2023 10:04:07 -0400 Subject: [PATCH 005/189] Add PatchTSTForPrediction --- .../models/patchtst/configuration_patchtst.py | 10 +- .../models/patchtst/modeling_patchtst.py | 148 ++++++++++++++++-- 2 files changed, 140 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 29759fb4bfcc6a..7f6fc611d0d282 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -173,6 +173,7 @@ def __init__( num_time_features: int = 0, is_encoder_decoder: bool = False, encoder_layerdrop: float = 0.1, + prediction_length: int = 24, # PatchTST arguments attention_type: str = "prob", @@ -183,7 +184,7 @@ def __init__( # time series specific configuration self.context_length = context_length - self.input_size = input_size + self.input_size = input_size # n_vars self.num_time_features = num_time_features self.num_dynamic_real_features = num_dynamic_real_features self.num_static_real_features = num_static_real_features @@ -216,6 +217,7 @@ def __init__( # PatchTST self.patch_length = patch_length self.stride = stride + self.num_patch = self._num_patches() self.attention_type = attention_type self.sampling_factor = sampling_factor self.distil = distil @@ -237,5 +239,11 @@ def __init__( self.proj_dropout = proj_dropout self.qkv_bias = qkv_bias + # Forcasting + self.prediction_length = prediction_length + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + def _num_patches(self): + return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f661008f022c02..26899814cd6e49 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -20,12 +20,12 @@ import torch from torch import nn import math -from ...modeling_utils import PreTrainedModel -from ...utils import add_start_docstrings, logging -from ...modeling_outputs import BaseModelOutputWithNoAttention -from .configuration_patchtst import PatchTSTConfig +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, logging +from transformers.modeling_outputs import 
BaseModelOutputWithNoAttention +from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig from torch.nn.modules.activation import MultiheadAttention -from ...utils import ModelOutput +from transformers.utils import ModelOutput logger = logging.get_logger(__name__) @@ -869,13 +869,10 @@ def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None ) -> PatchTSTForPreTrainingOutput: """ - past_values (x): tensor [bs x n_vars x num_patch x patch_len] + past_values (x): tensor [bs x seq_len x n_vars ] future_values (y): labels """ - - # x: [bs x n_vars x num_patch x patch_len] for pretrain - - patched_x = self.patching(past_values) + patched_x = self.patching(past_values) # patched_x: [bs x n_vars x num_patch x patch_len] for pretrain masked_x, masked = self.masking(patched_x) model_output = self.model(masked_x) # x: [bs x nvars x num_patch x d_model] # or [bs x nvars x (num_patch+1) x d_model] if use cls_token @@ -888,25 +885,25 @@ def forward( ) -class PatchTSTForClassification(PatchTSTPretrainedModel): +class PatchTSTForClassification(PatchTSTPreTrainedModel): # PatchTST model + classification head def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.seq_len, patch_len=config.patch_len, stride=config.stride) + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) self.model = PatchTSTModel(config) self.head = ClassificationHead(config) self.loss = nn.CrossEntropyLoss() - def forward(self, x, y=None): - patched_x = self.patching(x) + def forward(self, past_values, future_values=None): + patched_x = self.patching(past_values) model_output = self.model(patched_x) y_hat = self.head(model_output[0]) loss_val = None - if y is not None: - loss_val = self.loss(y_hat, y) + if future_values is not None: + loss_val = self.loss(y_hat, future_values) return PatchTSTForClassificationOutput( loss=loss_val, prediction_logits=y_hat, @@ -920,7 +917,7 @@ def __init__(self, config: PatchTSTConfig): self.pooling = config.pooling self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - self.linear = nn.Linear(config.n_vars * config.d_model, config.n_classes) + self.linear = nn.Linear(config.input_size * config.d_model, config.num_classes) def forward(self, x): """ @@ -972,3 +969,120 @@ class PatchTSTForClassificationOutput(ModelOutput): seq_relationship_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class PredictionHead(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.individual = config.individual + self.n_vars = config.input_size + self.use_cls_token = config.use_cls_token + self.pooling = config.pooling + head_dimension = config.d_model if config.pooling else config.d_model * config.num_patch + + if self.individual: + self.linears = nn.ModuleList() + self.dropouts = nn.ModuleList() + self.flattens = nn.ModuleList() + for i in range(self.n_vars): + self.flattens.append(nn.Flatten(start_dim=2)) + self.linears.append(nn.Linear(head_dimension, config.prediction_length)) + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + ) + else: + self.flatten = nn.Flatten(start_dim=2) + self.linear = nn.Linear(head_dimension, config.prediction_length) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 
else nn.Identity() + + def forward(self, x: torch.Tensor): + """ + x: [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x forecast_len x nvars] + """ + + if self.use_cls_token: + y = x[:, :, 0, :] # y: [bs x nvars x d_model] + else: + if self.pooling == 'mean': + y = x.mean(dim=2) # y: [bs x nvars x d_model] + elif self.pooling == 'max': + y = x.max(dim=2) # y: [bs x nvars x d_model] + else: + y = x # y: [bs x nvars x num_patch x d_model] + + if self.individual: + x_out = [] + for i in range(self.n_vars): + z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patch)] or [bs x d_model)] + z = self.linears[i](z) # z: [bs x forecast_len] + z = self.dropouts[i](z) + x_out.append(z) + x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] + else: + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patch)] or [bs x nvars x d_model)] + z = self.dropout(z) + x = self.linear(z) # x: [bs x nvars x forecast_len] + + x = x.transpose(2, 1) # [bs x forecast_len x nvars] + + return x + + +class PatchTSTForPredictionOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class PatchTSTForPrediction(PatchTSTPreTrainedModel): + # PatchTST model + classification head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + + self.model = PatchTSTModel(config) + self.head = PredictionHead(config) + self.loss = nn.MSELoss(reduction='mean') + + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor]): + patched_x = self.patching(past_values) + model_output = self.model(patched_x) + y_hat = self.head(model_output[0]) + + loss_val = None + if future_values is not None: + loss_val = self.loss(y_hat, future_values) + return PatchTSTForPredictionOutput( + loss=loss_val, + prediction_logits=y_hat, + ) + From 2d4b02cfe5fcaf5b5f70d5ccb1833d454851ac33 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 22 Aug 2023 10:26:15 -0400 Subject: [PATCH 006/189] Fix relative import --- src/transformers/models/patchtst/modeling_patchtst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 26899814cd6e49..2ada2d8aaaf39c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -20,12 +20,12 @@ import torch from torch import nn import math -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, logging -from transformers.modeling_outputs import BaseModelOutputWithNoAttention -from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +from ...modeling_utils import PreTrainedModel +from ...utils import add_start_docstrings, logging +from ...modeling_outputs import BaseModelOutputWithNoAttention +from .configuration_patchtst import PatchTSTConfig from torch.nn.modules.activation import MultiheadAttention -from transformers.utils import ModelOutput +from ...utils import ModelOutput logger = logging.get_logger(__name__) From ee8c8726a3121ba3dfe8fc21cc68f50b9ce48277 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 22 Aug 2023 17:42:45 -0400 Subject: [PATCH 007/189] Replace original PatchTSTEncoder with ChannelAttentionPatchTSTEncoder --- src/transformers/models/auto/modeling_auto.py | 1 - .../models/patchtst/modeling_patchtst.py | 270 ++++++------------ 2 files changed, 86 insertions(+), 185 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index c3f1929190e3e5..b7cf99b0e0e4ae 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -110,7 +110,6 @@ ("idefics", "IdeficsModel"), ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), - ("patchtst", "PatchTSTModel"), ("jukebox", "JukeboxModel"), ("layoutlm", "LayoutLMModel"), ("layoutlmv2", "LayoutLMv2Model"), diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 2ada2d8aaaf39c..b735c2cb57540c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -14,12 
+14,11 @@ # limitations under the License. """ PyTorch PatchTST model.""" -from typing import List, Optional, Tuple, Union - -import numpy as np +from typing import Optional, Tuple import torch from torch import nn import math + from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, logging from ...modeling_outputs import BaseModelOutputWithNoAttention @@ -155,14 +154,30 @@ def coord1d_pos_encoding(q_len, exponential=False, normalize=True): return cpe -class TSTEncoderLayer(nn.Module): +class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.pre_norm = config.pre_norm - assert ( - not config.d_model % config.encoder_attention_heads - ), f"d_model ({config.d_model}) must be divisible by n_heads ({config.encoder_attention_heads})" + self.layers = nn.ModuleList( + [ + ChannelAttentionTSTEncoderLayer(config) + for i in range(config.encoder_layers) + ] + ) + + def forward(self, src: torch.Tensor): + """ + src: tensor [bs x nvars x seq_len x d_model] + Return: + Tensor [bs x nvars x seq_len x d_model] + """ + for mod in self.layers: src = mod(src) + return src + + +class ChannelAttentionTSTEncoderLayer(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() # Multi-Head attention self.self_attn = PatchTSTAttention(config) @@ -174,6 +189,13 @@ def __init__(self, config: PatchTSTConfig): else: self.norm_sublayer1 = nn.LayerNorm(config.d_model) + # Add & Norm of the sublayer 2 + self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + if "batch" in config.norm.lower(): + self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + else: + self.norm_sublayer2 = nn.LayerNorm(config.d_model) + # Position-wise Feed-Forward self.ff = nn.Sequential( nn.Linear(config.d_model, config.encoder_ffn_dim, bias=config.bias), @@ -182,68 +204,58 @@ def __init__(self, config: PatchTSTConfig): nn.Linear(config.encoder_ffn_dim, config.d_model, bias=config.bias), ) - # Add & Norm of sublayer 2 - self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + # Add & Norm of sublayer 3 + self.dropout_path3 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + self.norm_sublayer3 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) else: - self.norm_sublayer2 = nn.LayerNorm(config.d_model) + self.norm_sublayer3 = nn.LayerNorm(config.d_model) + + self.pre_norm = config.pre_norm + self.store_attn = config.store_attention def forward(self, src: torch.Tensor): """ - src: tensor [bs x seq_len x d_model] + src: tensor [bs x nvars x seq_len x d_model] Return: - Tensor [bs x seq_len x d_model] + Tensor [bs x nvars x seq_len x d_model] """ - # First sublayer: mixing across time + bs, n_vars, seq_len, d_model = src.shape + + # First sublayer: attention across time + src = src.view(bs*n_vars, seq_len, d_model) # src: [(bs*nvars) x seq_len x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src)) - ) # Add: residual connection with residual dropout + src = src + self.dropout_path1(self.self_attn(self.norm_sublayer1(src)) ) # Add: residual connection with residual dropout else: ## Multi-Head 
attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer1(src + self.dropout_path1(self.self_attn(src))) + src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src) ) ) # src: [(bs*nvars) x seq_len x d_model] + src = src.reshape(bs, n_vars, seq_len, d_model) # [bs x nvars x seq_len x d_model] - # Second sublayer: mixing across hidden dimension + # second sublayer: attention across variable at any given time + # [bs x nvars x seq_len x d_model] -> [bs x seq_len x nvars x d_model] -> [(bs*seq_len) x nvars x d_model] + src = src.transpose(2, 1).contiguous().view(bs*seq_len, n_vars, d_model) # [(bs*seq_len) x nvars x d_model] + if self.pre_norm: + ## Norm and Multi-Head attention and Add residual connection + src = src + self.dropout_path2(self.self_attn(self.norm_sublayer2(src)) ) # Add: residual connection with residual dropout + else: + ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT + src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src) ) ) # src: [(bs*seq_len) x nvars x d_model] + src = src.reshape(bs, seq_len, n_vars, d_model).transpose(1,2).contiguous() # src: [bs x nvars x seq_len x d_model] + + # Third sublayer: mixing across hidden + src = src.view(bs*n_vars, seq_len, d_model) # src: [(bs*nvars) x seq_len x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection - src = src + self.dropout_path2( - self.ff(self.norm_sublayer2(src)) - ) # Add: residual connection with residual dropout + src = src + self.dropout_path3(self.ff( self.norm_sublayer3(src) )) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer2( - src + self.dropout_path2(self.ff(src)) - ) # Add: residual connection with residual dropout + src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout + src = src.reshape(bs, n_vars, seq_len, d_model) # [bs x nvars x seq_len x d_model] return src -class TSTEncoder(nn.Module): - def __init__(self, config: PatchTSTConfig): - super().__init__() - - self.layers = nn.ModuleList([TSTEncoderLayer(config) for i in range(config.encoder_layers)]) - - def forward( - self, src: torch.Tensor, output_hidden_states: Optional[bool] = False, output_attention: Optional[bool] = False - ) -> torch.Tensor: - """ - src: tensor [bs x seq_len x d_model] - Return: - Tensor [bs x seq_len x d_model] - """ - all_hidden_states = [] - for mod in self.layers: - if output_hidden_states: - src = mod(src) - all_hidden_states.append(src) - if output_hidden_states: - return src, all_hidden_states - return src - - class PatchTSTPreTrainedModel(PreTrainedModel): config_class = PatchTSTConfig base_model_prefix = "model" @@ -256,92 +268,75 @@ def _init_weights(self, module): torch.nn.init.normal_(self.config.cls_token, std=0.02) def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (PatchTSTEncoder)): + if isinstance(module, (ChannelAttentionPatchTSTEncoder)): module.gradient_checkpointing = value -class PatchTSTEncoder(PatchTSTPreTrainedModel): +class ChannelAttentionPatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - # self.n_vars = c_in - self.num_patch = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1 + self.n_vars = 
config.input_size + self.num_patch = config.num_patch + self.patch_length = config.patch_length self.d_model = config.d_model self.shared_embedding = config.shared_embedding self.use_cls_token = config.use_cls_token - # Added params for patching - self.patch_last = config.patch_last - self.mask_ratio = config.mask_ratio - # Input encoding: projection of feature vectors onto a d-dim vector space - if not self.shared_embedding: - self.W_P = nn.ModuleList() - for _ in range(config.input_size): - self.W_P.append(nn.Linear(config.patch_length, self.d_model)) + if not config.shared_embedding: + self.w_p = nn.ModuleList() + for _ in range(self.n_vars): + self.w_p.append(nn.Linear(config.patch_length, config.d_model)) else: - self.W_P = nn.Linear(config.patch_length, config.d_model) + self.w_p = nn.Linear(config.patch_length, config.d_model) # Positional encoding - if self.use_cls_token: - self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.W_pos = positional_encoding( - config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model - ) + if config.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patch + 1, config.d_model) else: - self.W_pos = positional_encoding( - config.positional_encoding, config.learn_pe, self.num_patch, config.d_model - ) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patch, config.d_model) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() # Encoder - self.encoder = TSTEncoder(config) + self.encoder = ChannelAttentionTSTEncoder(config) # Initialize weights and apply final processing self.post_init() def forward(self, x: torch.Tensor) -> torch.Tensor: """ - x: tensor [bs x nvars x num_patch x patch_len] #[bs x num_patch x nvars x patch_len] + x: tensor [bs x nvars x num_patch x patch_len] return: tensor [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token """ - # bs, num_patch, n_vars, patch_len = x.shape bs, n_vars, num_patch, patch_len = x.shape # Input encoding if not self.shared_embedding: x_out = [] for i in range(n_vars): - z = self.W_P[i](x[:, i, :, :]) + z = self.w_p[i](x[:, i, :, :]) x_out.append(z) x = torch.stack(x_out, dim=1) else: - x = self.W_P(x) # x: [bs x nvars x num_patch x d_model] - - # x: [bs x nvars x num_patch x d_model] -> [bs * nvars x num_patch x d_model] - x = x.view(bs * n_vars, num_patch, self.d_model) # x: [bs * nvars x num_patch x d_model] + x = self.w_p(x) # x: [bs x nvars x num_patch x d_model] if self.use_cls_token: - # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') - x = self.dropout(x + self.W_pos[1:, :]) # x: [bs * nvars x num_patch x d_model] + x = self.dropout(x + self.w_pos[1:, :]) # x: [bs x nvars x num_patch x d_model] # append cls token - cls_token = self.cls_token + self.W_pos[:1, :] # cls_token: [1 x 1 x d_model] + cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples - x = torch.cat((cls_tokens, x), dim=1) # x: [bs * nvars x (num_patch+1) x d_model] + x = torch.cat((cls_tokens, x), dim=1) # x: [bs x nvars x (num_patch+1) x d_model] else: - # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') - x = self.dropout(x + self.W_pos) # x: [bs * nvars x num_patch x d_model] + x 
= self.dropout(x + self.w_pos) # x: [bs x nvars x num_patch x d_model] # Encoder x = self.encoder( - x - ) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token - x = torch.reshape( - x, (bs, n_vars, -1, self.d_model) - ) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token return x @@ -516,99 +511,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """ -class PatchTSTEncoder(PatchTSTPreTrainedModel): - """ - PatchTST encoder consisting of *config.encoder_layers* self attention layers with distillation layers. Each - attention layer is an [`PatchTSTEncoderLayer`]. - - Args: - config: PatchTSTConfig - """ - - def __init__(self, config: PatchTSTConfig): - super().__init__(config) - # self.n_vars = c_in - self.num_patch = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1 - self.d_model = config.d_model - self.shared_embedding = config.shared_embedding - self.use_cls_token = config.use_cls_token - - # Added params for patching - self.patch_last = config.patch_last - self.mask_ratio = config.mask_ratio - - # Input encoding: projection of feature vectors onto a d-dim vector space - if not self.shared_embedding: - self.w_p = nn.ModuleList() - for _ in range(config.input_size): - self.w_p.append(nn.Linear(config.patch_length, self.d_model)) - else: - self.w_p = nn.Linear(config.patch_length, config.d_model) - - # Positional encoding - if self.use_cls_token: - self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.w_pos = positional_encoding( - config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model - ) - else: - self.w_pos = positional_encoding( - config.positional_encoding, config.learn_pe, self.num_patch, config.d_model - ) - - # Positional dropout - self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() - - # Encoder - self.encoder = TSTEncoder(config) - - # Initialize weights and apply final processing - self.post_init() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - x: tensor [bs x nvars x num_patch x patch_len] #[bs x num_patch x nvars x patch_len] - return: - tensor [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token - """ - - # bs, num_patch, n_vars, patch_len = x.shape - bs, n_vars, num_patch, patch_len = x.shape - # Input encoding - if not self.shared_embedding: - x_out = [] - for i in range(n_vars): - z = self.w_p[i](x[:, i, :, :]) - x_out.append(z) - x = torch.stack(x_out, dim=1) - else: - x = self.w_p(x) # x: [bs x nvars x num_patch x d_model] - - # x: [bs x nvars x num_patch x d_model] -> [bs * nvars x num_patch x d_model] - x = x.view(bs * n_vars, num_patch, self.d_model) # x: [bs * nvars x num_patch x d_model] - - if self.use_cls_token: - # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') - x = self.dropout(x + self.w_pos[1:, :]) # x: [bs * nvars x num_patch x d_model] - # append cls token - cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x d_model] - cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples - x = torch.cat((cls_tokens, x), dim=1) # x: [bs * nvars x (num_patch+1) x d_model] - else: - # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') - x = self.dropout(x + self.w_pos) # x: [bs * nvars x num_patch x 
d_model] - - # Encoder - x = self.encoder( - x - ) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token - x = torch.reshape( - x, (bs, n_vars, -1, self.d_model) - ) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - return x - - @add_start_docstrings( "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", PATCHTST_START_DOCSTRING, @@ -617,7 +519,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.encoder = PatchTSTEncoder(config) + self.encoder = ChannelAttentionPatchTSTEncoder(config) def forward(self, x: torch.Tensor): encoder_output = self.encoder(x) From c657ae8524b794a1613b8ac3beb3e259107cddfe Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 25 Aug 2023 18:42:40 +0700 Subject: [PATCH 008/189] temporary adding absolute path + add PatchTSTForForecasting class --- src/transformers/models/patchtst/__init__.py | 2 +- .../models/patchtst/configuration_patchtst.py | 4 +- .../models/patchtst/modeling_patchtst.py | 155 ++++++++++++++++-- 3 files changed, 141 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 73333c3fee067a..88ed72154b826c 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 7f6fc611d0d282..782a45fede9e8b 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -16,8 +16,8 @@ from typing import List, Optional, Union -from ...configuration_utils import PretrainedConfig -from ...utils import logging +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging logger = logging.get_logger(__name__) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b735c2cb57540c..bafd9c3e85b39b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -19,12 +19,14 @@ from torch import nn import math -from ...modeling_utils import PreTrainedModel -from ...utils import add_start_docstrings, logging -from ...modeling_outputs import BaseModelOutputWithNoAttention -from .configuration_patchtst import PatchTSTConfig +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, logging +from transformers.modeling_outputs import BaseModelOutputWithNoAttention +from transformers.utils import ModelOutput + from torch.nn.modules.activation import MultiheadAttention -from ...utils import ModelOutput + +from .configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) @@ -768,7 +770,8 @@ def __init__(self, config: PatchTSTConfig): self.loss = torch.nn.MSELoss(reduction="mean") def forward( - self, past_values: torch.Tensor, future_values: 
Optional[torch.Tensor] = None + self, past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None ) -> PatchTSTForPreTrainingOutput: """ past_values (x): tensor [bs x seq_len x n_vars ] @@ -902,7 +905,6 @@ def forward(self, x: torch.Tensor): or [bs x nvars x (num_patch+1) x d_model] if use cls_token output: [bs x forecast_len x nvars] """ - if self.use_cls_token: y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: @@ -937,13 +939,11 @@ class PatchTSTForPredictionOutput(ModelOutput): Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). + MSE loss. + + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -958,8 +958,7 @@ class PatchTSTForPredictionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None + prediction_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -985,6 +984,128 @@ def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tenso loss_val = self.loss(y_hat, future_values) return PatchTSTForPredictionOutput( loss=loss_val, - prediction_logits=y_hat, + prediction_outputs=y_hat, ) + +class PatchTSTForForecastingOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Forecasting outputs of the time series modeling heads. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[torch.FloatTensor] = None + forecast_outputs: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class ForecastHead(nn.Module): + def __init__(self, + individual: bool, + n_vars: int, + d_model: int, + num_patch: int, + forecast_len: int, + head_dropout: float = 0., + use_cls_token: bool = False, + pooling: str = None, + ): + super().__init__() + + self.individual = individual + self.n_vars = n_vars + self.use_cls_token = use_cls_token + self.pooling = pooling + head_dim = d_model if pooling else d_model * num_patch + + if self.individual: + self.linears = nn.ModuleList() + self.dropouts = nn.ModuleList() + self.flattens = nn.ModuleList() + for i in range(self.n_vars): + self.flattens.append(nn.Flatten(start_dim=2)) + self.linears.append(nn.Linear(head_dim, forecast_len)) + self.dropouts.append(nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() + ) + else: + self.flatten = nn.Flatten(start_dim=2) + self.linear = nn.Linear(head_dim, forecast_len) + self.dropout = nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() + + def forward(self, x): + """ + x: [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x forecast_len x nvars] + """ + + if self.use_cls_token: + y = x[:, :, 0, :] # y: [bs x nvars x d_model] + else: + if self.pooling == 'mean': + y = x.mean(dim=2) # y: [bs x nvars x d_model] + elif self.pooling == 'max': + y = x.max(dim=2) # y: [bs x nvars x d_model] + else: + y = x # y: [bs x nvars x num_patch x d_model] + + if self.individual: + x_out = [] + for i in range(self.n_vars): + z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patch)] or [bs x d_model)] + z = self.linears[i](z) # z: [bs x forecast_len] + z = self.dropouts[i](z) + x_out.append(z) + x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] + else: + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patch)] or [bs x nvars x d_model)] + z = self.dropout(z) + x = self.linear(z) # x: [bs x nvars x forecast_len] + + x = x.transpose(2, 1) # [bs x forecast_len x nvars] + + return x + + +class PatchTSTForForecasting(PatchTSTPreTrainedModel): + # PatchTST model + classification head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + + self.model = PatchTSTModel(config) + self.head = ForecastHead(config) + self.loss = nn.MSELoss(reduction='mean') + + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor]): + patched_x = self.patching(past_values) + model_output = self.model(patched_x) + y_hat = self.head(model_output[0]) + + loss_val = None + if future_values is not None: + loss_val = self.loss(y_hat, future_values) + return PatchTSTForForecastingOutput( + loss=loss_val, + forecast_outputs=y_hat, + ) From fa72e8a310e216fabec87b9734df80eed32b5b7b Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Fri, 25 Aug 2023 12:39:46 -0400 Subject: [PATCH 009/189] Update base PatchTSTModel + Unittest --- .../models/patchtst/configuration_patchtst.py | 8 +- .../models/patchtst/modeling_patchtst.py | 265 ++++++-- .../models/patchtst/test_modeling_patchtst.py | 594 +++++++++--------- 3 files changed, 497 insertions(+), 370 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 
782a45fede9e8b..dae40eee1ee12c 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -56,10 +56,6 @@ class PatchTSTConfig(PretrainedConfig): The number of static categorical features. num_static_real_features (`int`, *optional*, defaults to 0): The number of static real valued features. - cardinality (`list[int]`, *optional*): - The cardinality (number of different values) for each of the static categorical features. Should be a list - of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if - `num_static_categorical_features` is > 0. embedding_dimension (`list[int]`, *optional*): The dimension of the embedding for each of the static categorical features. Should be a list of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if @@ -154,6 +150,8 @@ def __init__( use_cls_token: bool = False, patch_last: bool = True, individual: bool = False, + seed_number= None, + mask_input: Optional[bool] = None, mask_type: str = "random", mask_ratio=0.5, mask_patches: list = [2, 3], @@ -223,6 +221,8 @@ def __init__( self.distil = distil # Masking + self.seed_number = seed_number + self.mask_input = mask_input self.mask_type = mask_type self.mask_ratio = mask_ratio self.mask_patches = mask_patches diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index bafd9c3e85b39b..1fbd5f204fa7fa 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -18,15 +18,15 @@ import torch from torch import nn import math +import random +import numpy as np from transformers.modeling_utils import PreTrainedModel from transformers.utils import add_start_docstrings, logging from transformers.modeling_outputs import BaseModelOutputWithNoAttention from transformers.utils import ModelOutput - from torch.nn.modules.activation import MultiheadAttention - -from .configuration_patchtst import PatchTSTConfig +from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) @@ -167,14 +167,20 @@ def __init__(self, config: PatchTSTConfig): ] ) - def forward(self, src: torch.Tensor): + def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ src: tensor [bs x nvars x seq_len x d_model] Return: Tensor [bs x nvars x seq_len x d_model] """ - for mod in self.layers: src = mod(src) - return src + all_hidden_states = [] + for mod in self.layers: + if output_hidden_states: + src = mod(src) + all_hidden_states.append(src) + if output_hidden_states: + return src, all_hidden_states + return src, None class ChannelAttentionTSTEncoderLayer(nn.Module): @@ -283,6 +289,7 @@ def __init__(self, config: PatchTSTConfig): self.d_model = config.d_model self.shared_embedding = config.shared_embedding self.use_cls_token = config.use_cls_token + self.gradient_checkpointing = False # Input encoding: projection of feature vectors onto a d-dim vector space if not config.shared_embedding: @@ -308,7 +315,7 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> torch.Tensor: """ x: tensor [bs x nvars x num_patch x patch_len] return: @@ -316,30 +323,38 @@ def forward(self, x: torch.Tensor) 
-> torch.Tensor: or [bs x nvars x (num_patch+1) x d_model] if use cls_token """ # bs, num_patch, n_vars, patch_len = x.shape - bs, n_vars, num_patch, patch_len = x.shape + bs, n_vars, num_patch, patch_len = past_values.shape + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) # Input encoding if not self.shared_embedding: x_out = [] for i in range(n_vars): - z = self.w_p[i](x[:, i, :, :]) + z = self.w_p[i](past_values[:, i, :, :]) x_out.append(z) - x = torch.stack(x_out, dim=1) + past_values = torch.stack(x_out, dim=1) else: - x = self.w_p(x) # x: [bs x nvars x num_patch x d_model] + past_values = self.w_p(past_values) # x: [bs x nvars x num_patch x d_model] if self.use_cls_token: - x = self.dropout(x + self.w_pos[1:, :]) # x: [bs x nvars x num_patch x d_model] + past_values = self.dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patch x d_model] # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] - cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples - x = torch.cat((cls_tokens, x), dim=1) # x: [bs x nvars x (num_patch+1) x d_model] + cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples + past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patch+1) x d_model] else: - x = self.dropout(x + self.w_pos) # x: [bs x nvars x num_patch x d_model] + past_values = self.dropout(past_values + self.w_pos) # x: [bs x nvars x num_patch x d_model] # Encoder - x = self.encoder( - x) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - return x + past_values, hidden_states = self.encoder( + past_values, output_hidden_states) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + # return past_values + # return past_values, hidden_states + return BaseModelOutputWithNoAttention( + last_hidden_state=past_values, hidden_states=hidden_states + ) PATCHTST_START_DOCSTRING = r""" @@ -521,11 +536,56 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + if config.mask_input: + self.masking = PatchMasking( + mask_type=config.mask_type, + mask_ratio=config.mask_ratio, + mask_patches=config.mask_patches, + mask_patch_ratios=config.mask_patch_ratios, + channel_consistent_masking=config.channel_consistent_masking, + d_size=config.d_size, + cv_channel_indices=config.cv_channel_indices, + mask_value=config.mask_value, + seed_number=config.seed_number + ) + else: + self.masking = nn.Identity() self.encoder = ChannelAttentionPatchTSTEncoder(config) - def forward(self, x: torch.Tensor): - encoder_output = self.encoder(x) - return BaseModelOutputWithNoAttention(last_hidden_state=encoder_output, hidden_states=None) + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor]=None, + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patch x patch_len] for pretrain + masked_values = self.masking(patched_values) + encoder_output = 
self.encoder(masked_values, output_hidden_states=output_hidden_states) + return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, + hidden_states=encoder_output.hidden_states, + patched_input=patched_values) + + +class PatchTSTModelOutputWithNoAttention(ModelOutput): + """ + Base class for model's outputs, with potential hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + patched_input + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + patched_input: torch.FloatTensor = None class PretrainHead(nn.Module): @@ -598,6 +658,13 @@ def cv_random_masking( return xb_mask, mask[..., 0] +def set_seed(x=42): + random.seed(x) + np.random.seed(x) + torch.manual_seed(x) + if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + + class PatchMasking(nn.Module): def __init__( self, @@ -609,6 +676,7 @@ def __init__( d_size: str = "4D", cv_channel_indices: list = None, mask_value=0, + seed_number: Optional[int] = None ): """PatchMasking: Class to random or forcast masking. @@ -623,7 +691,8 @@ def __init__( d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". mask_value (int, optional): Value to use for masking. Defaults to 0. """ - + if seed_number: + set_seed(seed_number) self.mask_ratio = mask_ratio self.channel_consistent_masking = channel_consistent_masking self.d_size = d_size @@ -665,7 +734,7 @@ def forward(self, x: torch.Tensor): mask = mask.bool() # mask: [bs x n_vars x num_patch] - return x_mask, mask + return x_mask #, mask class Patch(nn.Module): @@ -699,7 +768,7 @@ def forward(self, x: torch.Tensor): """ Args: - x (torch.Tensor, required): Input of shape [bs x ... x seq_len x n_vars] + x (torch.Tensor, required): Input of shape [bs x seq_len x n_vars] Returns: z: output tensor data [bs x ... 
x n_vars x num_patch x patch_len] """ @@ -754,39 +823,28 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) - self.masking = PatchMasking( - mask_type=config.mask_type, - mask_ratio=config.mask_ratio, - mask_patches=config.mask_patches, - mask_patch_ratios=config.mask_patch_ratios, - channel_consistent_masking=config.channel_consistent_masking, - d_size=config.d_size, - cv_channel_indices=config.cv_channel_indices, - mask_value=config.mask_value, - ) + config.mask_input = True self.model = PatchTSTModel(config) self.head = PretrainHead(config) self.loss = torch.nn.MSELoss(reduction="mean") def forward( - self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None + self, past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None ) -> PatchTSTForPreTrainingOutput: """ past_values (x): tensor [bs x seq_len x n_vars ] future_values (y): labels """ - patched_x = self.patching(past_values) # patched_x: [bs x n_vars x num_patch x patch_len] for pretrain - masked_x, masked = self.masking(patched_x) - model_output = self.model(masked_x) # x: [bs x nvars x num_patch x d_model] - # or [bs x nvars x (num_patch+1) x d_model] if use cls_token + model_output = self.model(past_values) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] - loss_val = self.loss(x_hat, patched_x) + loss_val = self.loss(x_hat, model_output.patched_input) return PatchTSTForPreTrainingOutput( loss=loss_val, prediction_logits=x_hat, + hidden_states=model_output.hidden_states ) @@ -795,15 +853,12 @@ class PatchTSTForClassification(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) - self.model = PatchTSTModel(config) self.head = ClassificationHead(config) self.loss = nn.CrossEntropyLoss() - def forward(self, past_values, future_values=None): - patched_x = self.patching(past_values) - model_output = self.model(patched_x) + def forward(self, past_values, future_values=None, output_hidden_states: Optional[bool] = None): + model_output = self.model(past_values) y_hat = self.head(model_output[0]) loss_val = None @@ -812,6 +867,7 @@ def forward(self, past_values, future_values=None): return PatchTSTForClassificationOutput( loss=loss_val, prediction_logits=y_hat, + hidden_states=model_output.hidden_states ) @@ -968,16 +1024,19 @@ class PatchTSTForPrediction(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) - self.model = PatchTSTModel(config) self.head = PredictionHead(config) self.loss = nn.MSELoss(reduction='mean') - def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor]): - patched_x = self.patching(past_values) - model_output = self.model(patched_x) - y_hat = self.head(model_output[0]) + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 
+ ) + model_output = self.model(past_values, output_hidden_states=output_hidden_states) + y_hat = self.head(model_output.last_hidden_state) loss_val = None if future_values is not None: @@ -985,6 +1044,7 @@ def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tenso return PatchTSTForPredictionOutput( loss=loss_val, prediction_outputs=y_hat, + hidden_states=model_output.hidden_states ) @@ -1097,9 +1157,14 @@ def __init__(self, config: PatchTSTConfig): self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction='mean') - def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor]): - patched_x = self.patching(past_values) - model_output = self.model(patched_x) + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output[0]) loss_val = None @@ -1108,4 +1173,92 @@ def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tenso return PatchTSTForForecastingOutput( loss=loss_val, forecast_outputs=y_hat, + hidden_states=model_output.hidden_states ) + + +if __name__ == "__main__": + + from transformers import Trainer, TrainingArguments + from torch.utils.data import Dataset + from transformers import AutoModel, AutoConfig + import numpy as np + + class AssetDataset(Dataset): + def __init__(self, x, y, seq_len=10, pred_len=10, is_pred=False): + self.seq_len = seq_len + self.x = x + self.y = y + self.is_pred = is_pred + self.pred_len = pred_len + + def __getitem__(self, index): + s_begin = index + s_end = s_begin + self.seq_len + r_begin = s_end - 1 + r_end = s_end + self.pred_len + + seq_x = self.x[s_begin:s_end] + seq_y = np.array(self.y[r_begin]) + if self.is_pred: + seq_y = self.x[s_end:r_end] + + return {'past_values': seq_x, 'future_values': seq_y} + + def __len__(self): + if self.is_pred: + return len(self.x) - self.seq_len - self.pred_len + 1 + return len(self.x) - self.seq_len + 1 + + n_classes = 3 + bs = 200 + n_features = 20 + pred_len = 7 + x = torch.randn(bs, n_features) + y = torch.randint(low=0, high=n_classes, size=(bs, 1))[:, 0] + valid_asset_ds = train_asset_ds = AssetDataset(x, y, seq_len=10, pred_len=pred_len, is_pred=False) + config = PatchTSTConfig( + input_size=n_features, + num_classes=n_classes, + context_length=10, + patch_length=5, + stride=5, + batch_size=50, + standardscale=None, # 'bysample' + context_points=10, + encoder_layers=12, + encoder_attention_heads=8, + d_model=256, + encoder_ffn_dim=1024, + dropout=0.2, + fc_dropout=0, + r=0.4, + prediction_length=pred_len, + ) + # model = PatchTSTForPretraining(config) + # model = PatchTSTForPrediction(config) + model = PatchTSTForClassification(config) + training_args = TrainingArguments( + output_dir='./save_model/', + num_train_epochs=1, + per_device_train_batch_size=5, + per_device_eval_batch_size=5, + report_to=[], + save_strategy='no', + remove_unused_columns=False, + no_cuda=True + ) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_asset_ds, + eval_dataset=valid_asset_ds + ) + trainer.train() + trainer.save_model('./save_model') + # AutoConfig.register("patchtst", PatchTSTConfig) + AutoModel.register(PatchTSTConfig, PatchTSTForClassification) + config = AutoConfig.from_pretrained('./save_model') + model = 
AutoModel.from_pretrained('./save_model', config=config) + print(model) + diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index cf8060a284f232..34caf2d0253442 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -25,7 +25,7 @@ from transformers.testing_utils import is_flaky, require_torch, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, _config_zero_init from ...test_pipeline_mixin import PipelineTesterMixin @@ -34,8 +34,8 @@ if is_torch_available(): import torch - from transformers import PatchTSTConfig, PatchTSTForPrediction, PatchTSTModel - from transformers.models.patchtst.modeling_patchtst import PatchTSTDecoder, PatchTSTEncoder + from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig + from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTModel, ChannelAttentionPatchTSTEncoder @require_torch @@ -46,9 +46,10 @@ def __init__( batch_size=13, prediction_length=7, context_length=14, - cardinality=19, - embedding_dimension=5, - num_time_features=4, + patch_length=5, + stride=5, + input_size=1, + num_time_features=1, is_training=True, hidden_size=16, num_hidden_layers=2, @@ -60,15 +61,17 @@ def __init__( lags_sequence=[1, 2, 3, 4, 5], sampling_factor=10, distil=False, + seed_number=42 ): self.parent = parent self.batch_size = batch_size self.prediction_length = prediction_length self.context_length = context_length - self.cardinality = cardinality + self.patch_length = patch_length + self.stride = stride + self.input_size = input_size self.num_time_features = num_time_features self.lags_sequence = lags_sequence - self.embedding_dimension = embedding_dimension self.is_training = is_training self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -81,56 +84,42 @@ def __init__( self.encoder_seq_length = min( sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length ) - self.decoder_seq_length = min( - sampling_factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length - ) + self.seed_number = seed_number self.sampling_factor = sampling_factor self.distil = distil + self.num_patch = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 def get_config(self): return PatchTSTConfig( prediction_length=self.prediction_length, + patch_length=self.patch_length, + stride=self.stride, + input_size=self.input_size, d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, dropout=self.hidden_dropout_prob, attention_dropout=self.attention_probs_dropout_prob, context_length=self.context_length, - lags_sequence=self.lags_sequence, - num_time_features=self.num_time_features, - num_static_categorical_features=1, - num_static_real_features=1, - cardinality=[self.cardinality], - embedding_dimension=[self.embedding_dimension], - sampling_factor=self.sampling_factor, - distil=self.distil, + activation_function=self.hidden_act, + seed_number=self.seed_number ) def prepare_patchtst_inputs_dict(self, 
config): - _past_length = config.context_length + max(config.lags_sequence) - - static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0]) - static_real_features = floats_tensor([self.batch_size, 1]) + _past_length = config.context_length + # bs, n_vars, num_patch, patch_len - past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features]) - past_values = floats_tensor([self.batch_size, _past_length]) - past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5 + # [bs x seq_len x n_vars] + past_values = floats_tensor([self.batch_size, _past_length, self.input_size]) + # past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5 - # decoder inputs - future_time_features = floats_tensor([self.batch_size, config.prediction_length, config.num_time_features]) - future_values = floats_tensor([self.batch_size, config.prediction_length]) + future_values = floats_tensor([self.batch_size, config.prediction_length, self.input_size]) inputs_dict = { "past_values": past_values, - "static_categorical_features": static_categorical_features, - "static_real_features": static_real_features, - "past_time_features": past_time_features, - "past_observed_mask": past_observed_mask, - "future_time_features": future_time_features, + # "past_observed_mask": past_observed_mask, + # "future_time_features": future_time_features, "future_values": future_values, } return inputs_dict @@ -144,44 +133,32 @@ def prepare_config_and_inputs_for_common(self): config, inputs_dict = self.prepare_config_and_inputs() return config, inputs_dict - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = PatchTSTModel(config=config).to(torch_device).eval() - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = PatchTSTEncoder.from_pretrained(tmpdirname).to(torch_device) - - transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) - enc_input = transformer_inputs[:, : config.context_length, ...] - dec_input = transformer_inputs[:, config.context_length :, ...] 
- - encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = PatchTSTDecoder.from_pretrained(tmpdirname).to(torch_device) - - last_hidden_state_2 = decoder( - inputs_embeds=dec_input, - encoder_hidden_states=encoder_last_hidden_state, - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + # def check_encoder_model_standalone(self, config, inputs_dict): + # model = PatchTSTModel(config=config).to(torch_device).eval() + # outputs = model(**inputs_dict) + # + # encoder_last_hidden_state = outputs.encoder_last_hidden_state + # + # with tempfile.TemporaryDirectory() as tmpdirname: + # encoder = model.get_encoder() + # encoder.save_pretrained(tmpdirname) + # encoder = ChannelAttentionPatchTSTEncoder.from_pretrained(tmpdirname).to(torch_device) + # + # transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) + # # [bs x seq_len x n_vars] => bs, num_patch, n_vars, patch_len = x.shape + # enc_input = transformer_inputs[:, : config.context_length, ...] + # + # encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] + # + # self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (PatchTSTModel, PatchTSTForPrediction) if is_torch_available() else () - all_generative_model_classes = (PatchTSTForPrediction,) if is_torch_available() else () - is_encoder_decoder = True + all_model_classes = (PatchTSTModel, PatchTSTForPrediction, PatchTSTForPretraining) if is_torch_available() else () + all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForPretraining) if is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} + is_encoder_decoder = False test_pruning = False test_head_masking = False test_missing_keys = False @@ -189,6 +166,13 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_inputs_embeds = False test_model_common_attributes = False + + test_resize_embeddings = True + test_resize_position_embeddings = False + test_mismatched_shapes = True + test_model_parallel = False + has_attentions = False + def setUp(self): self.model_tester = PatchTSTModelTester(self) self.config_tester = ConfigTester( @@ -211,10 +195,10 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - + # def test_encoder_model_standalone(self): + # config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + # self.model_tester.check_encoder_model_standalone(*config_and_inputs) +# def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -227,39 +211,22 @@ def check_hidden_states_output(inputs_dict, config, model_class): hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states 
expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers ) self.assertEqual(len(hidden_states), expected_num_layers) - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.context_length - if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: - seq_length = seq_length * self.model_tester.chunk_length - else: - seq_length = self.model_tester.seq_length - + num_patch = self.model_tester.num_patch self.assertListEqual( list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], + [num_patch, self.model_tester.hidden_size], ) - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "prediction_length", seq_len) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True + print('model_class: ', model_class) + check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config @@ -267,23 +234,30 @@ def check_hidden_states_output(inputs_dict, config, model_class): config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) - - # Ignore since we have no tokens embeddings +# +# # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass def test_model_outputs_equivalence(self): pass - +# def test_determinism(self): pass - # # Input is 'static_categorical_features' not 'input_ids' - def test_model_main_input_name(self): - model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1] - self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name) + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -296,217 +270,217 @@ def test_forward_signature(self): expected_arg_names = [ "past_values", - "past_time_features", - "past_observed_mask", - "static_categorical_features", - "static_real_features", + # "past_time_features", + # "past_observed_mask", + # "static_categorical_features", + # "static_real_features", "future_values", - "future_time_features", + # "future_time_features", ] expected_arg_names.extend( [ - "future_observed_mask", - "decoder_attention_mask", - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - "encoder_outputs", - "past_key_values", + # "future_observed_mask", + # 
"decoder_attention_mask", + # "head_mask", + # "decoder_head_mask", + # "cross_attn_head_mask", + # "encoder_outputs", + # "past_key_values", "output_hidden_states", - "output_attentions", - "use_cache", - "return_dict", - ] - if "future_observed_mask" in arg_names - else [ - "decoder_attention_mask", - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - "encoder_outputs", - "past_key_values", - "output_hidden_states", - "output_attentions", - "use_cache", - "return_dict", + # "output_attentions", + # "use_cache", + # "return_dict", ] + # if "future_observed_mask" in arg_names + # else [ + # "decoder_attention_mask", + # "head_mask", + # "decoder_head_mask", + # "cross_attn_head_mask", + # "encoder_outputs", + # "past_key_values", + # "output_hidden_states", + # "output_attentions", + # "use_cache", + # "return_dict", + # ] ) self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - context_length = getattr(self.model_tester, "context_length", seq_len) - prediction_length = getattr(self.model_tester, "prediction_length", seq_len) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, context_length], - ) - out_len = len(outputs) - - correct_outlen = 7 - - if "last_hidden_state" in outputs: - correct_outlen += 1 - - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - if "loss" in outputs: - correct_outlen += 1 - - if "params" in outputs: - correct_outlen += 1 - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - 
encoder_seq_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 2, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, context_length], - ) - - @is_flaky() - def test_retain_grad_hidden_states_attentions(self): - super().test_retain_grad_hidden_states_attentions() - - -def prepare_batch(filename="train-batch.pt"): - file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") - batch = torch.load(file, map_location=torch_device) - return batch - - -@require_torch -@slow -class PatchTSTModelIntegrationTests(unittest.TestCase): - def test_inference_no_head(self): - model = PatchTSTModel.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) - batch = prepare_batch() - - torch.manual_seed(0) - with torch.no_grad(): - output = model( - past_values=batch["past_values"], - past_time_features=batch["past_time_features"], - past_observed_mask=batch["past_observed_mask"], - static_categorical_features=batch["static_categorical_features"], - future_values=batch["future_values"], - future_time_features=batch["future_time_features"], - ).last_hidden_state - expected_shape = torch.Size((64, model.config.context_length, model.config.d_model)) - self.assertEqual(output.shape, expected_shape) - - expected_slice = torch.tensor( - [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]], - device=torch_device, - ) - self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_inference_head(self): - model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) - batch = prepare_batch("val-batch.pt") - - torch.manual_seed(0) - with torch.no_grad(): - output = model( - past_values=batch["past_values"], - past_time_features=batch["past_time_features"], - past_observed_mask=batch["past_observed_mask"], - static_categorical_features=batch["static_categorical_features"], - future_time_features=batch["future_time_features"], - ).encoder_last_hidden_state - - # encoder distils the context length to 1/8th of the original length - expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model)) - self.assertEqual(output.shape, expected_shape) - - expected_slice = torch.tensor( - [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], device=torch_device - ) - self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_seq_to_seq_generation(self): - model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) - batch = prepare_batch("val-batch.pt") - - torch.manual_seed(0) - with torch.no_grad(): - outputs = model.generate( - static_categorical_features=batch["static_categorical_features"], - past_time_features=batch["past_time_features"], - past_values=batch["past_values"], - future_time_features=batch["future_time_features"], - 
past_observed_mask=batch["past_observed_mask"], - ) - expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) - self.assertEqual(outputs.sequences.shape, expected_shape) - - expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) - mean_prediction = outputs.sequences.mean(dim=1) - self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) +# +# def test_attention_outputs(self): +# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() +# config.return_dict = True +# +# seq_len = getattr(self.model_tester, "seq_length", None) +# decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) +# encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) +# context_length = getattr(self.model_tester, "context_length", seq_len) +# prediction_length = getattr(self.model_tester, "prediction_length", seq_len) +# +# for model_class in self.all_model_classes: +# inputs_dict["output_attentions"] = True +# inputs_dict["output_hidden_states"] = False +# config.return_dict = True +# model = model_class(config) +# model.to(torch_device) +# model.eval() +# with torch.no_grad(): +# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) +# attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions +# self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) +# +# # check that output_attentions also work using config +# del inputs_dict["output_attentions"] +# config.output_attentions = True +# model = model_class(config) +# model.to(torch_device) +# model.eval() +# with torch.no_grad(): +# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) +# attentions = outputs.encoder_attentions +# self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) +# +# self.assertListEqual( +# list(attentions[0].shape[-3:]), +# [self.model_tester.num_attention_heads, encoder_seq_length, context_length], +# ) +# out_len = len(outputs) +# +# correct_outlen = 7 +# +# if "last_hidden_state" in outputs: +# correct_outlen += 1 +# +# if "past_key_values" in outputs: +# correct_outlen += 1 # past_key_values have been returned +# +# if "loss" in outputs: +# correct_outlen += 1 +# +# if "params" in outputs: +# correct_outlen += 1 +# +# self.assertEqual(out_len, correct_outlen) +# +# # decoder attentions +# decoder_attentions = outputs.decoder_attentions +# self.assertIsInstance(decoder_attentions, (list, tuple)) +# self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) +# self.assertListEqual( +# list(decoder_attentions[0].shape[-3:]), +# [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length], +# ) +# +# # cross attentions +# cross_attentions = outputs.cross_attentions +# self.assertIsInstance(cross_attentions, (list, tuple)) +# self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) +# self.assertListEqual( +# list(cross_attentions[0].shape[-3:]), +# [ +# self.model_tester.num_attention_heads, +# decoder_seq_length, +# encoder_seq_length, +# ], +# ) +# +# # Check attention is always last and order is fine +# inputs_dict["output_attentions"] = True +# inputs_dict["output_hidden_states"] = True +# model = model_class(config) +# model.to(torch_device) +# model.eval() +# with torch.no_grad(): +# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) +# +# self.assertEqual(out_len + 2, len(outputs)) +# +# 
self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions +# +# self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) +# self.assertListEqual( +# list(self_attentions[0].shape[-3:]), +# [self.model_tester.num_attention_heads, encoder_seq_length, context_length], +# ) +# +# @is_flaky() +# def test_retain_grad_hidden_states_attentions(self): +# super().test_retain_grad_hidden_states_attentions() +# +# +# def prepare_batch(filename="train-batch.pt"): +# file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") +# batch = torch.load(file, map_location=torch_device) +# return batch +# +# +# @require_torch +# @slow +# class PatchTSTModelIntegrationTests(unittest.TestCase): +# def test_inference_no_head(self): +# model = PatchTSTModel.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) +# batch = prepare_batch() +# +# torch.manual_seed(0) +# with torch.no_grad(): +# output = model( +# past_values=batch["past_values"], +# past_time_features=batch["past_time_features"], +# past_observed_mask=batch["past_observed_mask"], +# static_categorical_features=batch["static_categorical_features"], +# future_values=batch["future_values"], +# future_time_features=batch["future_time_features"], +# ).last_hidden_state +# expected_shape = torch.Size((64, model.config.context_length, model.config.d_model)) +# self.assertEqual(output.shape, expected_shape) +# +# expected_slice = torch.tensor( +# [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]], +# device=torch_device, +# ) +# self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) +# +# def test_inference_head(self): +# model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) +# batch = prepare_batch("val-batch.pt") +# +# torch.manual_seed(0) +# with torch.no_grad(): +# output = model( +# past_values=batch["past_values"], +# past_time_features=batch["past_time_features"], +# past_observed_mask=batch["past_observed_mask"], +# static_categorical_features=batch["static_categorical_features"], +# future_time_features=batch["future_time_features"], +# ).encoder_last_hidden_state +# +# # encoder distils the context length to 1/8th of the original length +# expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model)) +# self.assertEqual(output.shape, expected_shape) +# +# expected_slice = torch.tensor( +# [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], device=torch_device +# ) +# self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) +# +# def test_seq_to_seq_generation(self): +# model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) +# batch = prepare_batch("val-batch.pt") +# +# torch.manual_seed(0) +# with torch.no_grad(): +# outputs = model.generate( +# static_categorical_features=batch["static_categorical_features"], +# past_time_features=batch["past_time_features"], +# past_values=batch["past_values"], +# future_time_features=batch["future_time_features"], +# past_observed_mask=batch["past_observed_mask"], +# ) +# expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) +# self.assertEqual(outputs.sequences.shape, expected_shape) +# +# expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) +# mean_prediction = 
outputs.sequences.mean(dim=1) +# self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) From 8b1310ecec2ae78445c3a6de3967476ec3c2c15d Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Fri, 25 Aug 2023 12:48:40 -0400 Subject: [PATCH 010/189] Update ForecastHead to use the config class --- .../models/patchtst/modeling_patchtst.py | 118 ++---------------- .../models/patchtst/test_modeling_patchtst.py | 6 +- 2 files changed, 13 insertions(+), 111 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 1fbd5f204fa7fa..a4fcd4a85af215 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1079,23 +1079,14 @@ class PatchTSTForForecastingOutput(ModelOutput): class ForecastHead(nn.Module): - def __init__(self, - individual: bool, - n_vars: int, - d_model: int, - num_patch: int, - forecast_len: int, - head_dropout: float = 0., - use_cls_token: bool = False, - pooling: str = None, - ): + def __init__(self, config: PatchTSTConfig): super().__init__() - self.individual = individual - self.n_vars = n_vars - self.use_cls_token = use_cls_token - self.pooling = pooling - head_dim = d_model if pooling else d_model * num_patch + self.individual = config.individual + self.n_vars = config.input_size + self.use_cls_token = config.use_cls_token + self.pooling = config.pooling + head_dim = config.d_model if self.pooling else config.d_model * config.num_patch if self.individual: self.linears = nn.ModuleList() @@ -1103,15 +1094,15 @@ def __init__(self, self.flattens = nn.ModuleList() for i in range(self.n_vars): self.flattens.append(nn.Flatten(start_dim=2)) - self.linears.append(nn.Linear(head_dim, forecast_len)) + self.linears.append(nn.Linear(head_dim, config.prediction_length)) self.dropouts.append(nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() ) else: self.flatten = nn.Flatten(start_dim=2) - self.linear = nn.Linear(head_dim, forecast_len) - self.dropout = nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() + self.linear = nn.Linear(head_dim, config.prediction_length) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - def forward(self, x): + def forward(self, x: torch.Tensor): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token @@ -1150,9 +1141,6 @@ class PatchTSTForForecasting(PatchTSTPreTrainedModel): # PatchTST model + classification head def __init__(self, config: PatchTSTConfig): super().__init__(config) - - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) - self.model = PatchTSTModel(config) self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction='mean') @@ -1176,89 +1164,3 @@ def forward(self, hidden_states=model_output.hidden_states ) - -if __name__ == "__main__": - - from transformers import Trainer, TrainingArguments - from torch.utils.data import Dataset - from transformers import AutoModel, AutoConfig - import numpy as np - - class AssetDataset(Dataset): - def __init__(self, x, y, seq_len=10, pred_len=10, is_pred=False): - self.seq_len = seq_len - self.x = x - self.y = y - self.is_pred = is_pred - self.pred_len = pred_len - - def __getitem__(self, index): - s_begin = index - s_end = s_begin + self.seq_len - r_begin = s_end - 1 - r_end = s_end + self.pred_len - - seq_x = self.x[s_begin:s_end] - seq_y = np.array(self.y[r_begin]) - if 
self.is_pred: - seq_y = self.x[s_end:r_end] - - return {'past_values': seq_x, 'future_values': seq_y} - - def __len__(self): - if self.is_pred: - return len(self.x) - self.seq_len - self.pred_len + 1 - return len(self.x) - self.seq_len + 1 - - n_classes = 3 - bs = 200 - n_features = 20 - pred_len = 7 - x = torch.randn(bs, n_features) - y = torch.randint(low=0, high=n_classes, size=(bs, 1))[:, 0] - valid_asset_ds = train_asset_ds = AssetDataset(x, y, seq_len=10, pred_len=pred_len, is_pred=False) - config = PatchTSTConfig( - input_size=n_features, - num_classes=n_classes, - context_length=10, - patch_length=5, - stride=5, - batch_size=50, - standardscale=None, # 'bysample' - context_points=10, - encoder_layers=12, - encoder_attention_heads=8, - d_model=256, - encoder_ffn_dim=1024, - dropout=0.2, - fc_dropout=0, - r=0.4, - prediction_length=pred_len, - ) - # model = PatchTSTForPretraining(config) - # model = PatchTSTForPrediction(config) - model = PatchTSTForClassification(config) - training_args = TrainingArguments( - output_dir='./save_model/', - num_train_epochs=1, - per_device_train_batch_size=5, - per_device_eval_batch_size=5, - report_to=[], - save_strategy='no', - remove_unused_columns=False, - no_cuda=True - ) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_asset_ds, - eval_dataset=valid_asset_ds - ) - trainer.train() - trainer.save_model('./save_model') - # AutoConfig.register("patchtst", PatchTSTConfig) - AutoModel.register(PatchTSTConfig, PatchTSTForClassification) - config = AutoConfig.from_pretrained('./save_model') - model = AutoModel.from_pretrained('./save_model', config=config) - print(model) - diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 34caf2d0253442..efca51b1b4f4db 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -35,7 +35,7 @@ import torch from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig - from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTModel, ChannelAttentionPatchTSTEncoder + from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTModel, ChannelAttentionPatchTSTEncoder @require_torch @@ -155,8 +155,8 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (PatchTSTModel, PatchTSTForPrediction, PatchTSTForPretraining) if is_torch_available() else () - all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForPretraining) if is_torch_available() else () + all_model_classes = (PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () + all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} is_encoder_decoder = False test_pruning = False From 7c09b86b058c356becf5a873736541083eb9096d Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 26 Aug 2023 23:14:33 +0700 Subject: [PATCH 011/189] edit cv_random_masking, add mask to model output --- .../models/patchtst/modeling_patchtst.py | 408 +++++++++--------- 1 file changed, 199 insertions(+), 209 deletions(-) diff --git 
a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a4fcd4a85af215..695c6da96ffec0 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -156,6 +156,176 @@ def coord1d_pos_encoding(q_len, exponential=False, normalize=True): return cpe +def set_seed(x=42): + random.seed(x) + np.random.seed(x) + torch.manual_seed(x) + if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + + +def random_masking( + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = True, + mask_value=0, +): + """random_masking: Mask the input considering the control variables. + + Args: + xb (Tensor): Input to mask [ bs x nvars x num_patch x patch_len] + mask_ratio (float): Mask ratio. + unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + mask_value (int, optional): Value to use for masking. Defaults to 0. + + Returns: + Tensor: xb_mask, masked input, same shape as input + Tensor: Mask tensor of shape [bs x c x n] + """ + bs, nvars, L, D = xb.shape + + len_keep = int(L * (1 - mask_ratio)) + + if channel_consistent_masking: + noise = torch.rand(bs, 1, L, device=xb.device) # noise in [0, 1], bs x 1 x L + noise = noise.repeat(1, nvars, 1) # bs x nvars x L + else: + noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L + + mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] + mask[:, :, :len_keep] = 0 + + # sort noise for each sample + ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + mask = torch.gather(mask, dim=-1, index=ids_restore) + + mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + if unmasked_channel_indices is not None: + mask[:, unmasked_channel_indices, :, :] = 0 + + xb_mask = xb.masked_fill(mask.bool(), mask_value) + return xb_mask, mask[..., 0] + + +class Patch(nn.Module): + """ + A class to patchify the time series sequence into different patches + """ + + def __init__( + self, + seq_len: int, + patch_len: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + ): + super().__init__() + + assert ( + seq_len > patch_len + ), f"Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})" + + self.seq_len = seq_len + self.patch_len = patch_len + self.stride = stride + + # get the number of patches + self.num_patch = (max(seq_len, patch_len) - patch_len) // stride + 1 + tgt_len = patch_len + stride * (self.num_patch - 1) + self.s_begin = seq_len - tgt_len + + def forward(self, x: torch.Tensor): + """ + + Args: + x (torch.Tensor, required): Input of shape [bs x seq_len x n_vars] + Returns: + z: output tensor data [bs x ... x n_vars x num_patch x patch_len] + """ + seq_len = x.shape[-2] + assert seq_len == self.seq_len, f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." + + # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] + z = x.transpose(0, -2)[self.s_begin :] # z: [tgt_len x ... 
x bs x n_vars] + z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution + z = z.unfold( + dimension=-2, size=self.patch_len, step=self.stride + ) # xb: [bs x ... x num_patch x n_vars x patch_len] + z = z.transpose(-2, -3).contiguous() # xb: [bs x ... x n_vars x num_patch x patch_len] + return z + + +class PatchMasking(nn.Module): + def __init__( + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None + ): + """PatchMasking: Class to random or forcast masking. + + Args: + mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. + mask_ratio (float, optional): Mask ratio. + mask_patches (list, optional): List of patch lengths to mask in the end of the data. + mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. + if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. + unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + mask_value (int, optional): Value to use for masking. Defaults to 0. + """ + if seed_number: + set_seed(seed_number) + self.mask_ratio = mask_ratio + self.channel_consistent_masking = channel_consistent_masking + self.mask_type = mask_type + self.mask_patches = mask_patches + self.mask_patch_ratios = mask_patch_ratios + self.unmasked_channel_indices = unmasked_channel_indices + self.mask_value = mask_value + if self.unmasked_channel_indices is not None: + self.unmasked_channel_indices.sort() + + super().__init__() + + def forward(self, x: torch.Tensor): + """ + Input: + x: patched input + 4D: [bs x n_vars x num_patch x patch_len] + + Output: + x_mask: Masked patched input + 4D: [bs x n_vars x num_patch x patch_len] + mask: bool tensor indicating True on masked points + 4D: [bs x n_vars x num_patch] + """ + + if self.mask_type == "random": + x_mask, mask = random_masking( + xb=x, + mask_ratio=self.mask_ratio, + unmasked_channel_indices=self.unmasked_channel_indices, + channel_consistent_masking=self.channel_consistent_masking, + mask_value=self.mask_value, + ) + + else: + raise Exception("Invalid mask type") + + mask = mask.bool() # mask: [bs x n_vars x num_patch] + + return x_mask, mask + + + class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -532,6 +702,29 @@ def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", PATCHTST_START_DOCSTRING, ) + + +class PatchTSTModelOutputWithNoAttention(ModelOutput): + """ + Base class for model's outputs, with potential hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. 
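# --------------------------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): the `patched_input` and `mask`
# fields carried by this output are what a masked-reconstruction objective consumes.
# All tensors and sizes below are toy stand-ins, not values produced by the model.
import torch

bs, nvars, num_patches, patch_length = 2, 3, 4, 8
patched_input = torch.randn(bs, nvars, num_patches, patch_length)    # stand-in for `patched_input`
reconstruction = torch.randn(bs, nvars, num_patches, patch_length)   # stand-in for a pretraining head output
mask = torch.rand(bs, nvars, num_patches) > 0.5                      # True where a patch was masked

per_patch_mse = ((reconstruction - patched_input) ** 2).mean(dim=-1)  # [bs x nvars x num_patches]
masked_loss = (per_patch_mse * mask).sum() / (mask.sum() + 1e-10)     # averaged over masked patches only
# --------------------------------------------------------------------------------------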
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + patched_input + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + patched_input: torch.FloatTensor = None + mask: torch.FloatTensor = None + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): @@ -544,8 +737,7 @@ def __init__(self, config: PatchTSTConfig): mask_patches=config.mask_patches, mask_patch_ratios=config.mask_patch_ratios, channel_consistent_masking=config.channel_consistent_masking, - d_size=config.d_size, - cv_channel_indices=config.cv_channel_indices, + unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, seed_number=config.seed_number ) @@ -561,31 +753,13 @@ def forward(self, output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patch x patch_len] for pretrain - masked_values = self.masking(patched_values) + masked_values, mask = self.masking(patched_values) encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, hidden_states=encoder_output.hidden_states, - patched_input=patched_values) - - -class PatchTSTModelOutputWithNoAttention(ModelOutput): - """ - Base class for model's outputs, with potential hidden states. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - patched_input - """ - - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - patched_input: torch.FloatTensor = None + patched_input=patched_values, + mask=mask + ) class PretrainHead(nn.Module): @@ -607,184 +781,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -def cv_random_masking( - xb: torch.Tensor, - mask_ratio: float, - cv_channel_indices: list = None, - channel_consistent_masking: bool = True, - d_size="4D", - mask_value=0, -): - """cv_random_masking: Mask the input considering the control variables. - - Args: - xb (Tensor): Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] - mask_ratio (float): Mask ratio. 
- cv_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. - d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". - mask_value (int, optional): Value to use for masking. Defaults to 0. - - Returns: - Tensor: xb_mask, masked input, same shape as input - Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x tsg2 x c x n] - """ - if d_size == "4D": - bs, nvars, L, D = xb.shape - - len_keep = int(L * (1 - mask_ratio)) - - if d_size == "4D": - if channel_consistent_masking: - noise = torch.rand(bs, 1, L, device=xb.device) # noise in [0, 1], bs x 1 x L - noise = noise.repeat(1, nvars, 1) # bs x nvars x L - else: - noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L - - mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] - mask[:, :, :len_keep] = 0 - - # sort noise for each sample - ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove - ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] - mask = torch.gather(mask, dim=-1, index=ids_restore) - - if d_size == "4D": - mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] - if cv_channel_indices is not None: - mask[:, cv_channel_indices, :, :] = 0 - - xb_mask = xb.masked_fill(mask.bool(), mask_value) - return xb_mask, mask[..., 0] - - -def set_seed(x=42): - random.seed(x) - np.random.seed(x) - torch.manual_seed(x) - if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) - - -class PatchMasking(nn.Module): - def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = True, - d_size: str = "4D", - cv_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None - ): - """PatchMasking: Class to random or forcast masking. - - Args: - mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. - mask_ratio (float, optional): Mask ratio. - mask_patches (list, optional): List of patch lengths to mask in the end of the data. - mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. - if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - cv_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. - d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". - mask_value (int, optional): Value to use for masking. Defaults to 0. 
- """ - if seed_number: - set_seed(seed_number) - self.mask_ratio = mask_ratio - self.channel_consistent_masking = channel_consistent_masking - self.d_size = d_size - self.mask_type = mask_type - self.mask_patches = mask_patches - self.mask_patch_ratios = mask_patch_ratios - self.cv_channel_indices = cv_channel_indices - self.mask_value = mask_value - if self.cv_channel_indices is not None: - self.cv_channel_indices.sort() - - super().__init__() - - def forward(self, x: torch.Tensor): - """ - Input: - x: patched input - 4D: [bs x n_vars x num_patch x patch_len] - - Output: - x_mask: Masked patched input - 4D: [bs x n_vars x num_patch x patch_len] - mask: bool tensor indicating True on masked points - 4D: [bs x n_vars x num_patch] - """ - - if self.mask_type == "random": - x_mask, mask = cv_random_masking( - xb=x, - mask_ratio=self.mask_ratio, - cv_channel_indices=self.cv_channel_indices, - channel_consistent_masking=self.channel_consistent_masking, - d_size=self.d_size, - mask_value=self.mask_value, - ) - - else: - raise Exception("Invalid mask type") - - mask = mask.bool() # mask: [bs x n_vars x num_patch] - - return x_mask #, mask - - -class Patch(nn.Module): - """ - A class to patchify the time series sequence into different patches - """ - - def __init__( - self, - seq_len: int, - patch_len: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence - ): - super().__init__() - - assert ( - seq_len > patch_len - ), f"Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})" - - self.seq_len = seq_len - self.patch_len = patch_len - self.stride = stride - - # get the number of patches - self.num_patch = (max(seq_len, patch_len) - patch_len) // stride + 1 - tgt_len = patch_len + stride * (self.num_patch - 1) - self.s_begin = seq_len - tgt_len - - def forward(self, x: torch.Tensor): - """ - - Args: - x (torch.Tensor, required): Input of shape [bs x seq_len x n_vars] - Returns: - z: output tensor data [bs x ... x n_vars x num_patch x patch_len] - """ - seq_len = x.shape[-2] - assert seq_len == self.seq_len, f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." - - # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] - z = x.transpose(0, -2)[self.s_begin :] # z: [tgt_len x ... x bs x n_vars] - z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution - z = z.unfold( - dimension=-2, size=self.patch_len, step=self.stride - ) # xb: [bs x ... x num_patch x n_vars x patch_len] - z = z.transpose(-2, -3).contiguous() # xb: [bs x ... x n_vars x num_patch x patch_len] - return z - - class PatchTSTForPreTrainingOutput(ModelOutput): """ Output type of [`BertForPreTraining`]. @@ -838,7 +834,7 @@ def forward( future_values (y): labels """ model_output = self.model(past_values) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] + x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] or [bs x nvars x (num_patch+1) x patch_len] if use cls_token loss_val = self.loss(x_hat, model_output.patched_input) return PatchTSTForPreTrainingOutput( @@ -909,9 +905,6 @@ class PatchTSTForClassificationOutput(ModelOutput): (classification) loss. 
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -927,7 +920,6 @@ class PatchTSTForClassificationOutput(ModelOutput): loss: Optional[torch.FloatTensor] = None prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -996,10 +988,8 @@ class PatchTSTForPredictionOutput(ModelOutput): Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. From 617db9ae49948be7baac3f699bc33ce4835736f1 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 26 Aug 2023 23:15:08 +0700 Subject: [PATCH 012/189] Update configuration_patchtst.py --- src/transformers/models/patchtst/configuration_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index dae40eee1ee12c..c1547601335353 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -158,7 +158,7 @@ def __init__( mask_patch_ratios: list = [1, 1], channel_consistent_masking: bool = True, d_size: str = "4D", - cv_channel_indices: list = None, + unmasked_channel_indices: list = None, mask_value=0, pooling: str = 'mean', num_classes: int = 1, @@ -229,7 +229,7 @@ def __init__( self.mask_patch_ratios = mask_patch_ratios self.channel_consistent_masking = channel_consistent_masking self.d_size = d_size - self.cv_channel_indices = cv_channel_indices + self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value # Classification From 484dc009323f2227edc7bef58030873520b138af Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 26 Aug 2023 23:57:19 +0700 Subject: [PATCH 013/189] add masked_loss to the pretraining --- .../models/patchtst/modeling_patchtst.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 695c6da96ffec0..6c4dcffdfaedaf 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -134,7 +134,7 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= 
2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - 1 ) - # pv(f'{i:4.0f} {x:5.3f} {cpe.mean():+6.3f}', verbose) + if abs(cpe.mean()) <= eps: break elif cpe.mean() > eps: @@ -789,11 +789,8 @@ class PatchTSTForPreTrainingOutput(ModelOutput): loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, nvars, num_patch, patch_len )`): + Prediction outputs of the modeling head. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -808,8 +805,7 @@ class PatchTSTForPreTrainingOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None + prediction_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -822,7 +818,7 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config) self.head = PretrainHead(config) - self.loss = torch.nn.MSELoss(reduction="mean") + self.loss = torch.nn.MSELoss(reduction=None) def forward( self, past_values: torch.Tensor, @@ -836,10 +832,13 @@ def forward( model_output = self.model(past_values) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] or [bs x nvars x (num_patch+1) x patch_len] if use cls_token + # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) + masked_loss = (loss_val * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) + return PatchTSTForPreTrainingOutput( - loss=loss_val, - prediction_logits=x_hat, + loss=masked_loss, + prediction_outputs=x_hat, hidden_states=model_output.hidden_states ) From b1ef4af8b63a7a1a84fcda6c1b354a57a7ac54df Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 27 Aug 2023 23:48:31 +0700 Subject: [PATCH 014/189] add PatchEmbeddings --- .../models/patchtst/modeling_patchtst.py | 258 ++++++++++++------ 1 file changed, 167 insertions(+), 91 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6c4dcffdfaedaf..df7bc3281a37ab 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -26,7 +26,8 @@ from transformers.modeling_outputs import BaseModelOutputWithNoAttention from transformers.utils import ModelOutput from torch.nn.modules.activation import MultiheadAttention -from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +# from 
transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +from patchtst.configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) @@ -173,7 +174,7 @@ def random_masking( """random_masking: Mask the input considering the control variables. Args: - xb (Tensor): Input to mask [ bs x nvars x num_patch x patch_len] + xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. @@ -201,7 +202,7 @@ def random_masking( ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] mask = torch.gather(mask, dim=-1, index=ids_restore) - mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -209,55 +210,139 @@ def random_masking( return xb_mask, mask[..., 0] -class Patch(nn.Module): +def compute_num_patches(sequence_length, patch_length, stride): + return (max(sequence_length, patch_length) - patch_length) // stride + 1 + + +class Patchify(nn.Module): """ A class to patchify the time series sequence into different patches + Args: + sequence_length (int, required): input sequence length + patch_length (int, required): patch length + stride (int, required): stride between patches + Returns: + z: output tensor data [bs x n_vars x num_patches x patch_length] """ def __init__( self, - seq_len: int, - patch_len: int, + sequence_length: int, + patch_length: int, stride: int, padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - seq_len > patch_len - ), f"Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})" + sequence_length > patch_length + ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" - self.seq_len = seq_len - self.patch_len = patch_len + self.sequence_length = sequence_length + self.patch_length = patch_length self.stride = stride # get the number of patches - self.num_patch = (max(seq_len, patch_len) - patch_len) // stride + 1 - tgt_len = patch_len + stride * (self.num_patch - 1) - self.s_begin = seq_len - tgt_len + self.num_patches = compute_num_patches(sequence_length, patch_length, stride) + new_sequence_length = patch_length + stride * (self.num_patches - 1) + self.s_begin = sequence_length - new_sequence_length - def forward(self, x: torch.Tensor): + def forward(self, past_values: torch.Tensor): """ + Args: + past_values (torch.Tensor, required): Input of shape [bs x sequence_length x n_vars] + Returns: + x: output tensor data [bs x n_vars x num_patches x patch_length] + """ + sequence_length = past_values.shape[-2] + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model ({self.sequence_length})." 
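# --------------------------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): the patching arithmetic used by
# `compute_num_patches` and `Patchify.forward`, applied to a toy tensor. Sizes are
# assumptions chosen only to make the shapes concrete.
import torch

bs, sequence_length, n_vars = 2, 32, 3
patch_length, stride = 8, 8
num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1   # -> 4

past_values = torch.randn(bs, sequence_length, n_vars)
s_begin = sequence_length - (patch_length + stride * (num_patches - 1))           # leading steps to drop
patches = past_values[:, s_begin:, :].unfold(dimension=-2, size=patch_length, step=stride)
patches = patches.transpose(-2, -3).contiguous()
print(patches.shape)   # torch.Size([2, 3, 4, 8]) == [bs, n_vars, num_patches, patch_length]
# --------------------------------------------------------------------------------------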
+ + x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = x.unfold( + dimension=-2, size=self.patch_length, step=self.stride + ) # x: [bs x num_patches x n_vars x patch_length] + x = x.transpose(-2, -3).contiguous() # xb: [bs x n_vars x num_patches x patch_length] + return x + + +class PatchEmbeddings(nn.Module): + """ + A class to patchify the time series sequence into different patches + Args: + sequence_length (int, required): input sequence length + patch_length (int, required): patch length + stride (int, required): stride between patches + Returns: + embeddings: output tensor data [bs x n_vars x num_patches x embed_dim] + """ + def __init__( + self, + sequence_length: int, + patch_length: int, + stride: int, + embed_dim: int + ): + super().__init__() + + assert ( + sequence_length > patch_length + ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" + + # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" + self.sequence_length = sequence_length + self.patch_length = patch_length + self.stride = stride + self.embed_dim = embed_dim + + # get the number of patches + self.num_patches = compute_num_patches(sequence_length, patch_length, stride) + new_sequence_length = patch_length + stride * (self.num_patches - 1) + self.s_begin = sequence_length - new_sequence_length + + # Embedding + self.projection = nn.Conv1d(in_channels=1, + out_channels=embed_dim, + kernel_size=patch_length, + stride=stride, + ) + + def forward(self, past_values: torch.Tensor): + """ Args: - x (torch.Tensor, required): Input of shape [bs x seq_len x n_vars] + past_values (torch.Tensor, required): Input of shape [bs x sequence_length x n_vars] Returns: - z: output tensor data [bs x ... x n_vars x num_patch x patch_len] + embeddings: output tensor data [bs x n_vars x num_patches x emb_dim] """ - seq_len = x.shape[-2] - assert seq_len == self.seq_len, f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." + bs, sequence_length, n_vars = past_values.shape + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." + + x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + # convert past_values to shape [bs*n_vars x 1 x sequence_length ] + x = x.transpose(1, 2).reshape(bs*n_vars, 1, -1).contiguous() + # projection + embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] + # reshape + embeddings = embeddings.transpose(1, 2).view(bs, n_vars, -1, self.embed_dim).contiguous() # embeddings: [bs x n_vars x num_patches x emb_dim] + # embeddings = embeddings.flatten(2).transpose(1, 2) + return embeddings - # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] - z = x.transpose(0, -2)[self.s_begin :] # z: [tgt_len x ... x bs x n_vars] - z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution - z = z.unfold( - dimension=-2, size=self.patch_len, step=self.stride - ) # xb: [bs x ... x num_patch x n_vars x patch_len] - z = z.transpose(-2, -3).contiguous() # xb: [bs x ... x n_vars x num_patch x patch_len] - return z class PatchMasking(nn.Module): + """ + PatchMasking: Class to random or forcast masking. + + Args: + mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. 
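# --------------------------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): how the per-(sample, channel)
# keep/mask pattern behind this masking module is built, mirroring the `random_masking`
# logic earlier in this file. Sizes are toy assumptions.
import torch

bs, nvars, num_patches = 2, 3, 8
mask_ratio = 0.5
len_keep = int(num_patches * (1 - mask_ratio))          # 4 patches kept per row

noise = torch.rand(bs, nvars, num_patches)              # channel_consistent_masking=False branch
mask = torch.ones(bs, nvars, num_patches)
mask[:, :, :len_keep] = 0                               # 0 = keep, 1 = mask
ids_shuffle = torch.argsort(noise, dim=-1)
ids_restore = torch.argsort(ids_shuffle, dim=-1)
mask = torch.gather(mask, dim=-1, index=ids_restore)    # randomly permute the keep/mask pattern
print(mask.sum(dim=-1))                                 # every row masks num_patches - len_keep = 4 patches
# --------------------------------------------------------------------------------------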
+ mask_ratio (float, optional): Mask ratio. + mask_patches (list, optional): List of patch lengths to mask in the end of the data. + mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. + if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. + unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + mask_value (int, optional): Value to use for masking. Defaults to 0. + """ def __init__( self, mask_type: str = "random", @@ -269,18 +354,7 @@ def __init__( mask_value=0, seed_number: Optional[int] = None ): - """PatchMasking: Class to random or forcast masking. - Args: - mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. - mask_ratio (float, optional): Mask ratio. - mask_patches (list, optional): List of patch lengths to mask in the end of the data. - mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. - if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. - mask_value (int, optional): Value to use for masking. Defaults to 0. - """ if seed_number: set_seed(seed_number) self.mask_ratio = mask_ratio @@ -299,11 +373,11 @@ def forward(self, x: torch.Tensor): """ Input: x: patched input - 4D: [bs x n_vars x num_patch x patch_len] + 4D: [bs x n_vars x num_patches x patch_length] Output: x_mask: Masked patched input - 4D: [bs x n_vars x num_patch x patch_len] + 4D: [bs x n_vars x num_patches x patch_length] mask: bool tensor indicating True on masked points 4D: [bs x n_vars x num_patch] """ @@ -339,9 +413,9 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x seq_len x d_model] + src: tensor [bs x nvars x sequence_length x d_model] Return: - Tensor [bs x nvars x seq_len x d_model] + Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] for mod in self.layers: @@ -394,42 +468,42 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x seq_len x d_model] + src: tensor [bs x nvars x sequence_length x d_model] Return: - Tensor [bs x nvars x seq_len x d_model] + Tensor [bs x nvars x sequence_length x d_model] """ - bs, n_vars, seq_len, d_model = src.shape + bs, n_vars, sequence_length, d_model = src.shape # First sublayer: attention across time - src = src.view(bs*n_vars, seq_len, d_model) # src: [(bs*nvars) x seq_len x d_model] + src = src.view(bs*n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1(self.self_attn(self.norm_sublayer1(src)) ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard 
Transformer from BERT - src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src) ) ) # src: [(bs*nvars) x seq_len x d_model] - src = src.reshape(bs, n_vars, seq_len, d_model) # [bs x nvars x seq_len x d_model] + src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src) ) ) # src: [(bs*nvars) x sequence_length x d_model] + src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time - # [bs x nvars x seq_len x d_model] -> [bs x seq_len x nvars x d_model] -> [(bs*seq_len) x nvars x d_model] - src = src.transpose(2, 1).contiguous().view(bs*seq_len, n_vars, d_model) # [(bs*seq_len) x nvars x d_model] + # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] + src = src.transpose(2, 1).contiguous().view(bs*sequence_length, n_vars, d_model) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2(self.self_attn(self.norm_sublayer2(src)) ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src) ) ) # src: [(bs*seq_len) x nvars x d_model] - src = src.reshape(bs, seq_len, n_vars, d_model).transpose(1,2).contiguous() # src: [bs x nvars x seq_len x d_model] + src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src) ) ) # src: [(bs*sequence_length) x nvars x d_model] + src = src.reshape(bs, sequence_length, n_vars, d_model).transpose(1,2).contiguous() # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view(bs*n_vars, seq_len, d_model) # src: [(bs*nvars) x seq_len x d_model] + src = src.view(bs*n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3(self.ff( self.norm_sublayer3(src) )) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout - src = src.reshape(bs, n_vars, seq_len, d_model) # [bs x nvars x seq_len x d_model] + src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -454,7 +528,7 @@ class ChannelAttentionPatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) self.n_vars = config.input_size - self.num_patch = config.num_patch + self.num_patches = config.num_patches self.patch_length = config.patch_length self.d_model = config.d_model self.shared_embedding = config.shared_embedding @@ -472,9 +546,9 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patch + 1, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patch, 
config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, config.d_model) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -487,13 +561,13 @@ def __init__(self, config: PatchTSTConfig): def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> torch.Tensor: """ - x: tensor [bs x nvars x num_patch x patch_len] + x: tensor [bs x nvars x num_patches x patch_length] return: - tensor [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + tensor [bs x nvars x num_patches x d_model] + or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ - # bs, num_patch, n_vars, patch_len = x.shape - bs, n_vars, num_patch, patch_len = past_values.shape + # bs, num_patches, n_vars, patch_length = x.shape + bs, n_vars, num_patches, patch_length = past_values.shape output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -506,24 +580,26 @@ def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool x_out.append(z) past_values = torch.stack(x_out, dim=1) else: - past_values = self.w_p(past_values) # x: [bs x nvars x num_patch x d_model] + past_values = self.w_p(past_values) # x: [bs x nvars x num_patches x d_model] if self.use_cls_token: - past_values = self.dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patch x d_model] + past_values = self.dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patches x d_model] # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patch+1) x d_model] + past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patches+1) x d_model] else: - past_values = self.dropout(past_values + self.w_pos) # x: [bs x nvars x num_patch x d_model] + past_values = self.dropout(past_values + self.w_pos) # x: [bs x nvars x num_patches x d_model] # Encoder past_values, hidden_states = self.encoder( - past_values, output_hidden_states) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - # return past_values + past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] + # or [bs x nvars x (num_patches+1) x d_model] if use cls_token + # return past_values, hidden_states return BaseModelOutputWithNoAttention( - last_hidden_state=past_values, hidden_states=hidden_states + last_hidden_state=past_values, + hidden_states=hidden_states ) @@ -729,7 +805,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) if config.mask_input: self.masking = PatchMasking( mask_type=config.mask_type, @@ -752,7 +828,7 @@ def forward(self, output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patch x 
patch_len] for pretrain + patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain masked_values, mask = self.masking(patched_values) encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, @@ -771,11 +847,11 @@ def __init__(self, config): def forward(self, x: torch.Tensor) -> torch.Tensor: """ - x: tensor [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token - output: tensor [bs x nvars x num_patch x patch_len] + x: tensor [bs x nvars x num_patches x d_model] + or [bs x nvars x (num_patches+1) x d_model] if use cls_token + output: tensor [bs x nvars x num_patches x patch_length] """ - x = self.linear(self.dropout(x)) # [bs x nvars x num_patch x patch_len] + x = self.linear(self.dropout(x)) # [bs x nvars x num_patches x patch_length] if self.use_cls_token: x = x[:, :, 1:, :] # remove the first cls token return x @@ -789,7 +865,7 @@ class PatchTSTForPreTrainingOutput(ModelOutput): loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, nvars, num_patch, patch_len )`): + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, nvars, num_patches, patch_length )`): Prediction outputs of the modeling head. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of @@ -826,11 +902,11 @@ def forward( output_hidden_states: Optional[bool] = None ) -> PatchTSTForPreTrainingOutput: """ - past_values (x): tensor [bs x seq_len x n_vars ] + past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels """ - model_output = self.model(past_values) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] or [bs x nvars x (num_patch+1) x patch_len] if use cls_token + model_output = self.model(past_values) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token + x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) @@ -877,7 +953,7 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x): """ - x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: [bs x n_classes] """ if self.use_cls_token: @@ -930,7 +1006,7 @@ def __init__(self, config: PatchTSTConfig): self.n_vars = config.input_size self.use_cls_token = config.use_cls_token self.pooling = config.pooling - head_dimension = config.d_model if config.pooling else config.d_model * config.num_patch + head_dimension = config.d_model if config.pooling else config.d_model * config.num_patches if self.individual: self.linears = nn.ModuleList() @@ -948,8 +1024,8 @@ def __init__(self, config: 
PatchTSTConfig): def forward(self, x: torch.Tensor): """ - x: [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x: [bs x nvars x num_patches x d_model] + or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: [bs x forecast_len x nvars] """ if self.use_cls_token: @@ -960,18 +1036,18 @@ def forward(self, x: torch.Tensor): elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: - y = x # y: [bs x nvars x num_patch x d_model] + y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] for i in range(self.n_vars): - z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patch)] or [bs x d_model)] + z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.linears[i](z) # z: [bs x forecast_len] z = self.dropouts[i](z) x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patch)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) x = self.linear(z) # x: [bs x nvars x forecast_len] @@ -1075,7 +1151,7 @@ def __init__(self, config: PatchTSTConfig): self.n_vars = config.input_size self.use_cls_token = config.use_cls_token self.pooling = config.pooling - head_dim = config.d_model if self.pooling else config.d_model * config.num_patch + head_dim = config.d_model if self.pooling else config.d_model * config.num_patches if self.individual: self.linears = nn.ModuleList() @@ -1093,8 +1169,8 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x: torch.Tensor): """ - x: [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x: [bs x nvars x num_patches x d_model] + or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: [bs x forecast_len x nvars] """ @@ -1106,18 +1182,18 @@ def forward(self, x: torch.Tensor): elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: - y = x # y: [bs x nvars x num_patch x d_model] + y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] for i in range(self.n_vars): - z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patch)] or [bs x d_model)] + z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.linears[i](z) # z: [bs x forecast_len] z = self.dropouts[i](z) x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patch)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) x = self.linear(z) # x: [bs x nvars x forecast_len] From bc22a87ccb6791b8fb1d5a9a9918b1415797ee64 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 27 Aug 2023 23:48:42 +0700 Subject: [PATCH 015/189] Update configuration_patchtst.py --- src/transformers/models/patchtst/configuration_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index c1547601335353..24d741867fd2d3 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -215,7 +215,7 @@ def __init__( # PatchTST self.patch_length = patch_length self.stride = stride - 
self.num_patch = self._num_patches() + self.num_patches = self._num_patches() self.attention_type = attention_type self.sampling_factor = sampling_factor self.distil = distil From 9799a5bf9c16857085871439c387d371cedbe97e Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 28 Aug 2023 20:12:56 +0700 Subject: [PATCH 016/189] edit loss which considers mask in the pretraining --- .../models/patchtst/modeling_patchtst.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index df7bc3281a37ab..6bddab979e4d28 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -164,11 +164,12 @@ def set_seed(x=42): if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + def random_masking( xb: torch.Tensor, mask_ratio: float, unmasked_channel_indices: list = None, - channel_consistent_masking: bool = True, + channel_consistent_masking: bool = False, mask_value=0, ): """random_masking: Mask the input considering the control variables. @@ -194,14 +195,14 @@ def random_masking( else: noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L - mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] - mask[:, :, :len_keep] = 0 + mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] + mask[:, :, :len_keep] = 0 # sort noise for each sample ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] - mask = torch.gather(mask, dim=-1, index=ids_restore) + mask = torch.gather(mask, dim=-1, index=ids_restore) mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -255,7 +256,7 @@ def forward(self, past_values: torch.Tensor): x: output tensor data [bs x n_vars x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model ({self.sequence_length})." + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." 
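# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the assert above
# guards the unfold-based patchification. A minimal, self-contained version of
# that slicing is sketched below; the helper name is hypothetical, and the
# patch-count formula is the one used elsewhere in this series,
# (max(context_length, patch_length) - patch_length) // stride + 1.
import torch

def sketch_patchify(past_values: torch.Tensor, patch_length: int, stride: int) -> torch.Tensor:
    # past_values: [bs x sequence_length x n_vars]
    sequence_length = past_values.shape[-2]
    num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1
    # drop the oldest time steps so an integer number of strides fits (mirrors s_begin above)
    s_begin = sequence_length - (patch_length + stride * (num_patches - 1))
    x = past_values[:, s_begin:, :]                               # [bs x tgt_len x n_vars]
    x = x.unfold(dimension=1, size=patch_length, step=stride)     # [bs x num_patches x n_vars x patch_length]
    return x.transpose(1, 2).contiguous()                         # [bs x n_vars x num_patches x patch_length]

# e.g. sketch_patchify(torch.randn(2, 32, 7), patch_length=8, stride=8).shape -> [2, 7, 4, 8]
# ---------------------------------------------------------------------------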
x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( @@ -803,10 +804,12 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): - def __init__(self, config: PatchTSTConfig): + def __init__(self, config: PatchTSTConfig, mask_input: bool = False): super().__init__(config) self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) - if config.mask_input: + self.mask_input = mask_input #config.mask_input + + if self.mask_input: self.masking = PatchMasking( mask_type=config.mask_type, mask_ratio=config.mask_ratio, @@ -829,7 +832,10 @@ def forward(self, output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain - masked_values, mask = self.masking(patched_values) + if self.mask_input: + masked_values, mask = self.masking(patched_values) + else: + masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, hidden_states=encoder_output.hidden_states, @@ -891,10 +897,10 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - config.mask_input = True - self.model = PatchTSTModel(config) + # config.mask_input = True + self.model = PatchTSTModel(config=config, mask_input=True) self.head = PretrainHead(config) - self.loss = torch.nn.MSELoss(reduction=None) + self.loss = torch.nn.MSELoss(reduction='none') def forward( self, past_values: torch.Tensor, @@ -910,7 +916,7 @@ def forward( # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) - masked_loss = (loss_val * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) + masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) return PatchTSTForPreTrainingOutput( loss=masked_loss, From 78f317377aa9cc94e5644a52f19580460cfb6eb4 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 28 Aug 2023 20:13:10 +0700 Subject: [PATCH 017/189] remove patch_last option --- src/transformers/models/patchtst/configuration_patchtst.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 24d741867fd2d3..a85c7035eccc36 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -148,7 +148,7 @@ def __init__( positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, - patch_last: bool = True, + individual: bool = False, seed_number= None, mask_input: Optional[bool] = None, @@ -156,7 +156,7 @@ def __init__( mask_ratio=0.5, mask_patches: list = [2, 3], mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = True, + channel_consistent_masking: bool = False, d_size: str = "4D", unmasked_channel_indices: list = None, mask_value=0, @@ -209,7 +209,7 @@ def __init__( 
self.positional_encoding = positional_encoding self.learn_pe = learn_pe self.use_cls_token = use_cls_token - self.patch_last = patch_last + # self.patch_last = patch_last self.individual = individual # PatchTST From 30819f69069b2dd536685ea43474c5fc90abf19f Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Mon, 28 Aug 2023 10:32:17 -0400 Subject: [PATCH 018/189] Add commits from internal repo --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6bddab979e4d28..862cf2253d0585 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -27,7 +27,7 @@ from transformers.utils import ModelOutput from torch.nn.modules.activation import MultiheadAttention # from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig -from patchtst.configuration_patchtst import PatchTSTConfig +from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) From 2060fb07c25f6597e9b29111c272f509f494f8c5 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Mon, 28 Aug 2023 11:30:16 -0400 Subject: [PATCH 019/189] Update ForecastHead --- .../models/patchtst/modeling_patchtst.py | 145 ++++++++++-------- 1 file changed, 78 insertions(+), 67 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 862cf2253d0585..f7d873cfdb4ea5 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -26,14 +26,12 @@ from transformers.modeling_outputs import BaseModelOutputWithNoAttention from transformers.utils import ModelOutput from torch.nn.modules.activation import MultiheadAttention -# from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "PatchTSTConfig" - PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = [ "ibm/patchtst-base", # See all PatchTST models at https://huggingface.co/models?filter=patchtst @@ -132,8 +130,9 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= i = 0 for i in range(100): cpe = ( - 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - - 1 + 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * ( + torch.linspace(0, 1, d_model).reshape(1, -1) ** x) + - 1 ) if abs(cpe.mean()) <= eps: @@ -164,13 +163,12 @@ def set_seed(x=42): if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) - def random_masking( - xb: torch.Tensor, - mask_ratio: float, - unmasked_channel_indices: list = None, - channel_consistent_masking: bool = False, - mask_value=0, + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = False, + mask_value=0, ): """random_masking: Mask the input considering the control variables. 
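# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the hunk above only
# re-indents random_masking, but together with PATCH 016 it defines the pretraining
# recipe: mask random patches, then average the MSE over the masked patches only.
# The two helpers below are hypothetical simplifications of that logic; the final
# masked_fill application step is an assumption (only the mask construction and the
# masked-loss expression appear verbatim in the diffs).
import torch

def sketch_random_patch_mask(xb: torch.Tensor, mask_ratio: float = 0.5, mask_value: float = 0.0):
    # xb: [bs x nvars x num_patches x patch_length]
    bs, nvars, num_patches, _ = xb.shape
    len_keep = int(num_patches * (1 - mask_ratio))
    noise = torch.rand(bs, nvars, num_patches, device=xb.device)   # one score per patch
    mask = torch.ones(bs, nvars, num_patches, device=xb.device)
    mask[:, :, :len_keep] = 0
    ids_shuffle = torch.argsort(noise, dim=-1)                     # small noise -> kept
    ids_restore = torch.argsort(ids_shuffle, dim=-1)
    mask = torch.gather(mask, dim=-1, index=ids_restore)           # 1 marks a masked patch
    xb_masked = xb.masked_fill(mask.unsqueeze(-1).bool(), mask_value)
    return xb_masked, mask                                         # mask: [bs x nvars x num_patches]

def sketch_masked_mse(x_hat: torch.Tensor, target: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # elementwise loss, then average only over masked patches (cf. PATCH 016)
    loss = torch.nn.functional.mse_loss(x_hat, target, reduction="none")
    return (loss.mean(dim=-1) * mask).sum() / (mask.sum() + 1e-10)
# ---------------------------------------------------------------------------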
@@ -227,16 +225,16 @@ class Patchify(nn.Module): """ def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + self, + sequence_length: int, + patch_length: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" self.sequence_length = sequence_length @@ -276,17 +274,18 @@ class PatchEmbeddings(nn.Module): Returns: embeddings: output tensor data [bs x n_vars x num_patches x embed_dim] """ + def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - embed_dim: int + self, + sequence_length: int, + patch_length: int, + stride: int, + embed_dim: int ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" @@ -320,16 +319,16 @@ def forward(self, past_values: torch.Tensor): x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] # convert past_values to shape [bs*n_vars x 1 x sequence_length ] - x = x.transpose(1, 2).reshape(bs*n_vars, 1, -1).contiguous() + x = x.transpose(1, 2).reshape(bs * n_vars, 1, -1).contiguous() # projection - embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] + embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] # reshape - embeddings = embeddings.transpose(1, 2).view(bs, n_vars, -1, self.embed_dim).contiguous() # embeddings: [bs x n_vars x num_patches x emb_dim] + embeddings = embeddings.transpose(1, 2).view(bs, n_vars, -1, + self.embed_dim).contiguous() # embeddings: [bs x n_vars x num_patches x emb_dim] # embeddings = embeddings.flatten(2).transpose(1, 2) return embeddings - class PatchMasking(nn.Module): """ PatchMasking: Class to random or forcast masking. @@ -344,16 +343,17 @@ class PatchMasking(nn.Module): channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. 
""" + def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None ): if seed_number: @@ -400,7 +400,6 @@ def forward(self, x: torch.Tensor): return x_mask, mask - class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -476,35 +475,43 @@ def forward(self, src: torch.Tensor): bs, n_vars, sequence_length, d_model = src.shape # First sublayer: attention across time - src = src.view(bs*n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view(bs * n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path1(self.self_attn(self.norm_sublayer1(src)) ) # Add: residual connection with residual dropout + src = src + self.dropout_path1( + self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src) ) ) # src: [(bs*nvars) x sequence_length x d_model] - src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = self.norm_sublayer1( + src + self.dropout_path1(self.self_attn(src))) # src: [(bs*nvars) x sequence_length x d_model] + src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] - src = src.transpose(2, 1).contiguous().view(bs*sequence_length, n_vars, d_model) # [(bs*sequence_length) x nvars x d_model] + src = src.transpose(2, 1).contiguous().view(bs * sequence_length, n_vars, + d_model) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path2(self.self_attn(self.norm_sublayer2(src)) ) # Add: residual connection with residual dropout + src = src + self.dropout_path2( + self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src) ) ) # src: [(bs*sequence_length) x nvars x d_model] - src = src.reshape(bs, sequence_length, n_vars, d_model).transpose(1,2).contiguous() # src: [bs x nvars x sequence_length x d_model] + src = self.norm_sublayer2( + src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] + src = src.reshape(bs, sequence_length, n_vars, d_model).transpose(1, + 2).contiguous() # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view(bs*n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view(bs * 
n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection - src = src + self.dropout_path3(self.ff( self.norm_sublayer3(src) )) # Add: residual connection with residual dropout + src = src + self.dropout_path3( + self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout - src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = self.norm_sublayer3( + src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout + src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -547,9 +554,11 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, + config.d_model) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, + config.d_model) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -560,7 +569,8 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> torch.Tensor: + def forward(self, past_values: torch.Tensor, + output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: """ x: tensor [bs x nvars x num_patches x patch_length] return: @@ -595,7 +605,7 @@ def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool # Encoder past_values, hidden_states = self.encoder( past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] - # or [bs x nvars x (num_patches+1) x d_model] if use cls_token + # or [bs x nvars x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states return BaseModelOutputWithNoAttention( @@ -779,8 +789,6 @@ def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", PATCHTST_START_DOCSTRING, ) - - class PatchTSTModelOutputWithNoAttention(ModelOutput): """ Base class for model's outputs, with potential hidden states. 
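# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the encoder-layer hunks
# above alternate attention over time and attention over channels purely by reshaping
# the [bs x nvars x sequence_length x d_model] tensor. The stand-in module below
# reproduces that reshaping pattern with stock nn.MultiheadAttention; it is a
# simplified assumption, not the actual ChannelAttentionTSTEncoderLayer (which also
# adds norms, dropout paths and a feed-forward sublayer).
import torch
import torch.nn as nn

class SketchTimeThenChannelAttention(nn.Module):
    def __init__(self, d_model: int = 16, num_heads: int = 2):
        super().__init__()
        self.time_attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        self.channel_attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)

    def forward(self, src: torch.Tensor) -> torch.Tensor:
        bs, n_vars, seq_len, d_model = src.shape                   # [bs x nvars x seq x d_model]
        # attention across time: fold the channel dimension into the batch
        x = src.view(bs * n_vars, seq_len, d_model)
        x = x + self.time_attn(x, x, x)[0]
        x = x.view(bs, n_vars, seq_len, d_model)
        # attention across channels: fold the time dimension into the batch
        x = x.transpose(2, 1).contiguous().view(bs * seq_len, n_vars, d_model)
        x = x + self.channel_attn(x, x, x)[0]
        return x.view(bs, seq_len, n_vars, d_model).transpose(1, 2).contiguous()

# e.g. SketchTimeThenChannelAttention()(torch.randn(2, 7, 12, 16)).shape -> [2, 7, 12, 16]
# ---------------------------------------------------------------------------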
@@ -807,7 +815,7 @@ class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig, mask_input: bool = False): super().__init__(config) self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) - self.mask_input = mask_input #config.mask_input + self.mask_input = mask_input # config.mask_input if self.mask_input: self.masking = PatchMasking( @@ -826,12 +834,13 @@ def __init__(self, config: PatchTSTConfig, mask_input: bool = False): def forward(self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor]=None, + future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain + patched_values = self.patching( + past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain if self.mask_input: masked_values, mask = self.masking(patched_values) else: @@ -903,7 +912,7 @@ def __init__(self, config: PatchTSTConfig): self.loss = torch.nn.MSELoss(reduction='none') def forward( - self, past_values: torch.Tensor, + self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None ) -> PatchTSTForPreTrainingOutput: @@ -911,8 +920,10 @@ def forward( past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels """ - model_output = self.model(past_values) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token - x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token + model_output = self.model( + past_values) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token + x_hat = self.head(model_output[ + 0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) @@ -1035,14 +1046,14 @@ def forward(self, x: torch.Tensor): output: [bs x forecast_len x nvars] """ if self.use_cls_token: - y = x[:, :, 0, :] # y: [bs x nvars x d_model] + y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: if self.pooling == 'mean': y = x.mean(dim=2) # y: [bs x nvars x d_model] elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: - y = x # y: [bs x nvars x num_patches x d_model] + y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] @@ -1053,7 +1064,7 @@ def forward(self, x: torch.Tensor): x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) x = self.linear(z) # x: [bs x nvars x forecast_len] @@ -1166,7 +1177,7 @@ def __init__(self, config: PatchTSTConfig): for i in range(self.n_vars): self.flattens.append(nn.Flatten(start_dim=2)) self.linears.append(nn.Linear(head_dim, config.prediction_length)) - self.dropouts.append(nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else 
nn.Identity() ) else: self.flatten = nn.Flatten(start_dim=2) @@ -1181,14 +1192,14 @@ def forward(self, x: torch.Tensor): """ if self.use_cls_token: - y = x[:, :, 0, :] # y: [bs x nvars x d_model] + y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: if self.pooling == 'mean': y = x.mean(dim=2) # y: [bs x nvars x d_model] elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: - y = x # y: [bs x nvars x num_patches x d_model] + y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] @@ -1199,7 +1210,7 @@ def forward(self, x: torch.Tensor): x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) x = self.linear(z) # x: [bs x nvars x forecast_len] From 271b19bd89fea5db7feb647c0e5a4bc39df8d4c4 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Mon, 28 Aug 2023 18:19:03 -0400 Subject: [PATCH 020/189] Add model weight initilization + unittest --- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/patchtst/__init__.py | 4 + .../models/patchtst/configuration_patchtst.py | 3 +- .../models/patchtst/modeling_patchtst.py | 39 +++- .../models/patchtst/test_modeling_patchtst.py | 188 ++---------------- 5 files changed, 63 insertions(+), 172 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b7cf99b0e0e4ae..d15f166fe6dc54 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -152,6 +152,7 @@ ("openai-gpt", "OpenAIGPTModel"), ("opt", "OPTModel"), ("owlvit", "OwlViTModel"), + ("patchtst", "PatchTSTModel"), ("pegasus", "PegasusModel"), ("pegasus_x", "PegasusXModel"), ("perceiver", "PerceiverModel"), diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 88ed72154b826c..265eef2483805d 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -33,6 +33,7 @@ _import_structure["modeling_patchtst"] = [ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTForPretraining", + "PatchTSTForPrediction" "PatchTSTModel", "PatchTSTPreTrainedModel", ] @@ -51,6 +52,9 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForPretraining, PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForClassification, PatchTSTPreTrainedModel, ) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index a85c7035eccc36..085d886d1bd86d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -148,7 +148,7 @@ def __init__( positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, - + init_std: float = 0.02, individual: bool = False, seed_number= None, mask_input: Optional[bool] = None, @@ -211,6 +211,7 @@ def __init__( self.use_cls_token = use_cls_token # self.patch_last = patch_last self.individual = individual + self.init_std = init_std # PatchTST self.patch_length = patch_length diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f7d873cfdb4ea5..4235d96c7422f2 100755 --- 
a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -169,10 +169,12 @@ def random_masking( unmasked_channel_indices: list = None, channel_consistent_masking: bool = False, mask_value=0, + seed_number: Optional[int] = None ): """random_masking: Mask the input considering the control variables. Args: + seed_number (int, optional): Value to set for the seed number xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. @@ -183,6 +185,9 @@ def random_masking( Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] """ + if seed_number: + set_seed(seed_number) + bs, nvars, L, D = xb.shape len_keep = int(L * (1 - mask_ratio)) @@ -356,8 +361,8 @@ def __init__( seed_number: Optional[int] = None ): - if seed_number: - set_seed(seed_number) + # if seed_number: + # set_seed(seed_number) self.mask_ratio = mask_ratio self.channel_consistent_masking = channel_consistent_masking self.mask_type = mask_type @@ -367,6 +372,7 @@ def __init__( self.mask_value = mask_value if self.unmasked_channel_indices is not None: self.unmasked_channel_indices.sort() + self.seed_number = seed_number super().__init__() @@ -390,6 +396,7 @@ def forward(self, x: torch.Tensor): unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, + seed_number=self.seed_number ) else: @@ -526,6 +533,18 @@ def _init_weights(self, module): """Initialize weights""" if self.config.use_cls_token: torch.nn.init.normal_(self.config.cls_token, std=0.02) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=self.config.init_std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, MultiheadAttention): + module.in_proj_weight.data.normal_(mean=0.0, std=self.config.init_std) + module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) + module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) + def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): @@ -550,7 +569,6 @@ def __init__(self, config: PatchTSTConfig): self.w_p.append(nn.Linear(config.patch_length, config.d_model)) else: self.w_p = nn.Linear(config.patch_length, config.d_model) - # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) @@ -832,6 +850,9 @@ def __init__(self, config: PatchTSTConfig, mask_input: bool = False): self.masking = nn.Identity() self.encoder = ChannelAttentionPatchTSTEncoder(config) + # Initialize weights and apply final processing + self.post_init() + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, @@ -911,6 +932,9 @@ def __init__(self, config: PatchTSTConfig): self.head = PretrainHead(config) self.loss = torch.nn.MSELoss(reduction='none') + # Initialize weights and apply final processing + self.post_init() + def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, @@ -945,6 +969,9 @@ def __init__(self, config: PatchTSTConfig): self.head = ClassificationHead(config) self.loss = nn.CrossEntropyLoss() + # Initialize weights and apply final 
processing + self.post_init() + def forward(self, past_values, future_values=None, output_hidden_states: Optional[bool] = None): model_output = self.model(past_values) y_hat = self.head(model_output[0]) @@ -1110,6 +1137,9 @@ def __init__(self, config: PatchTSTConfig): self.head = PredictionHead(config) self.loss = nn.MSELoss(reduction='mean') + # Initialize weights and apply final processing + self.post_init() + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor], @@ -1227,6 +1257,9 @@ def __init__(self, config: PatchTSTConfig): self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction='mean') + # Initialize weights and apply final processing + self.post_init() + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor], diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index efca51b1b4f4db..dd2ae7cf8ffb42 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -22,10 +22,10 @@ from huggingface_hub import hf_hub_download from transformers import is_torch_available -from transformers.testing_utils import is_flaky, require_torch, slow, torch_device +from transformers.testing_utils import is_flaky, require_torch, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, _config_zero_init +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -35,7 +35,8 @@ import torch from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig - from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTModel, ChannelAttentionPatchTSTEncoder + from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTModel + # from transformers import PatchTSTConfig, PatchTSTModel, PatchTSTForPrediction @require_torch @@ -112,14 +113,11 @@ def prepare_patchtst_inputs_dict(self, config): # [bs x seq_len x n_vars] past_values = floats_tensor([self.batch_size, _past_length, self.input_size]) - # past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5 future_values = floats_tensor([self.batch_size, config.prediction_length, self.input_size]) inputs_dict = { "past_values": past_values, - # "past_observed_mask": past_observed_mask, - # "future_time_features": future_time_features, "future_values": future_values, } return inputs_dict @@ -133,25 +131,6 @@ def prepare_config_and_inputs_for_common(self): config, inputs_dict = self.prepare_config_and_inputs() return config, inputs_dict - # def check_encoder_model_standalone(self, config, inputs_dict): - # model = PatchTSTModel(config=config).to(torch_device).eval() - # outputs = model(**inputs_dict) - # - # encoder_last_hidden_state = outputs.encoder_last_hidden_state - # - # with tempfile.TemporaryDirectory() as tmpdirname: - # encoder = model.get_encoder() - # encoder.save_pretrained(tmpdirname) - # encoder = ChannelAttentionPatchTSTEncoder.from_pretrained(tmpdirname).to(torch_device) - # - # transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) - # # [bs x seq_len x n_vars] => bs, num_patch, n_vars, patch_len = x.shape - # enc_input = transformer_inputs[:, : config.context_length, ...] 
- # - # encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] - # - # self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): @@ -195,9 +174,6 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - # def test_encoder_model_standalone(self): - # config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - # self.model_tester.check_encoder_model_standalone(*config_and_inputs) # def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): @@ -245,19 +221,19 @@ def test_model_outputs_equivalence(self): def test_determinism(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_model_main_input_name(self): + model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) + # The main input is the name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name) - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) + def test_save_load_fast_init_from_base(self): + # super().test_save_load_fast_init_from_base() + pass + + def test_save_load_fast_init_to_base(self): + # super().test_save_load_fast_init_to_base() + pass def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -270,144 +246,20 @@ def test_forward_signature(self): expected_arg_names = [ "past_values", - # "past_time_features", - # "past_observed_mask", - # "static_categorical_features", - # "static_real_features", "future_values", - # "future_time_features", ] expected_arg_names.extend( [ - # "future_observed_mask", - # "decoder_attention_mask", - # "head_mask", - # "decoder_head_mask", - # "cross_attn_head_mask", - # "encoder_outputs", - # "past_key_values", "output_hidden_states", - # "output_attentions", - # "use_cache", - # "return_dict", ] - # if "future_observed_mask" in arg_names - # else [ - # "decoder_attention_mask", - # "head_mask", - # "decoder_head_mask", - # "cross_attn_head_mask", - # "encoder_outputs", - # "past_key_values", - # "output_hidden_states", - # "output_attentions", - # "use_cache", - # "return_dict", - # ] ) self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) -# -# def test_attention_outputs(self): -# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() -# config.return_dict = True -# -# seq_len = getattr(self.model_tester, "seq_length", None) -# decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) -# encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) -# context_length = getattr(self.model_tester, "context_length", seq_len) -# prediction_length = getattr(self.model_tester, "prediction_length", seq_len) -# -# for model_class in self.all_model_classes: -# inputs_dict["output_attentions"] = True -# 
inputs_dict["output_hidden_states"] = False -# config.return_dict = True -# model = model_class(config) -# model.to(torch_device) -# model.eval() -# with torch.no_grad(): -# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) -# attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions -# self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) -# -# # check that output_attentions also work using config -# del inputs_dict["output_attentions"] -# config.output_attentions = True -# model = model_class(config) -# model.to(torch_device) -# model.eval() -# with torch.no_grad(): -# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) -# attentions = outputs.encoder_attentions -# self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) -# -# self.assertListEqual( -# list(attentions[0].shape[-3:]), -# [self.model_tester.num_attention_heads, encoder_seq_length, context_length], -# ) -# out_len = len(outputs) -# -# correct_outlen = 7 -# -# if "last_hidden_state" in outputs: -# correct_outlen += 1 -# -# if "past_key_values" in outputs: -# correct_outlen += 1 # past_key_values have been returned -# -# if "loss" in outputs: -# correct_outlen += 1 -# -# if "params" in outputs: -# correct_outlen += 1 -# -# self.assertEqual(out_len, correct_outlen) -# -# # decoder attentions -# decoder_attentions = outputs.decoder_attentions -# self.assertIsInstance(decoder_attentions, (list, tuple)) -# self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) -# self.assertListEqual( -# list(decoder_attentions[0].shape[-3:]), -# [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length], -# ) -# -# # cross attentions -# cross_attentions = outputs.cross_attentions -# self.assertIsInstance(cross_attentions, (list, tuple)) -# self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) -# self.assertListEqual( -# list(cross_attentions[0].shape[-3:]), -# [ -# self.model_tester.num_attention_heads, -# decoder_seq_length, -# encoder_seq_length, -# ], -# ) -# -# # Check attention is always last and order is fine -# inputs_dict["output_attentions"] = True -# inputs_dict["output_hidden_states"] = True -# model = model_class(config) -# model.to(torch_device) -# model.eval() -# with torch.no_grad(): -# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) -# -# self.assertEqual(out_len + 2, len(outputs)) -# -# self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions -# -# self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) -# self.assertListEqual( -# list(self_attentions[0].shape[-3:]), -# [self.model_tester.num_attention_heads, encoder_seq_length, context_length], -# ) -# -# @is_flaky() -# def test_retain_grad_hidden_states_attentions(self): -# super().test_retain_grad_hidden_states_attentions() + + @is_flaky() + def test_retain_grad_hidden_states_attentions(self): + super().test_retain_grad_hidden_states_attentions() # # # def prepare_batch(filename="train-batch.pt"): From 9325a6a64ecabf07670887c98e4bd91d877f5a8a Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 29 Aug 2023 01:43:11 -0400 Subject: [PATCH 021/189] Update PatchTST unittest to use local import --- src/transformers/__init__.py | 6 ++++++ src/transformers/models/patchtst/__init__.py | 7 ++++--- tests/models/patchtst/test_modeling_patchtst.py | 13 ++----------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git 
a/src/transformers/__init__.py b/src/transformers/__init__.py index 051d4ef647f59c..5b294f4218bb73 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1995,6 +1995,9 @@ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTModel", "PatchTSTPreTrainedModel", + "PatchTSTForPrediction", + "PatchTSTForForecasting", + "PatchTSTForPretraining" ] ) _import_structure["models.instructblip"].extend( @@ -5859,6 +5862,9 @@ from .models.patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForPretraining, PatchTSTPreTrainedModel, ) from .models.instructblip import ( diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 265eef2483805d..35e2a01f166a29 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { @@ -32,10 +32,11 @@ else: _import_structure["modeling_patchtst"] = [ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", - "PatchTSTForPretraining", - "PatchTSTForPrediction" "PatchTSTModel", "PatchTSTPreTrainedModel", + "PatchTSTForPrediction", + "PatchTSTForForecasting", + "PatchTSTForPretraining" ] diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index dd2ae7cf8ffb42..c99cbe0a74900f 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -33,10 +33,9 @@ if is_torch_available(): import torch + from transformers import PatchTSTConfig + from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining - from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig - from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTModel - # from transformers import PatchTSTConfig, PatchTSTModel, PatchTSTForPrediction @require_torch @@ -227,14 +226,6 @@ def test_model_main_input_name(self): observed_main_input_name = list(model_signature.parameters.keys())[1] self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name) - def test_save_load_fast_init_from_base(self): - # super().test_save_load_fast_init_from_base() - pass - - def test_save_load_fast_init_to_base(self): - # super().test_save_load_fast_init_to_base() - pass - def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() From 0c5deb4126e8688cdc1633351d4aa051c5ba1316 Mon Sep 17 00:00:00 2001 From: Ngoc Diep Do Date: Tue, 29 Aug 2023 13:43:06 +0200 Subject: [PATCH 022/189] PatchTST integration tests for pretraining and prediction --- .../models/patchtst/modeling_patchtst.py | 2 +- .../models/patchtst/test_modeling_patchtst.py | 143 ++++++++---------- 2 files changed, 67 insertions(+), 78 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4235d96c7422f2..87f36dd8eb5867 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1142,7 +1142,7 @@ def __init__(self, config: PatchTSTConfig): 
def forward(self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor], + future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index c99cbe0a74900f..c9a6afa7315757 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -22,7 +22,7 @@ from huggingface_hub import hf_hub_download from transformers import is_torch_available -from transformers.testing_utils import is_flaky, require_torch, torch_device +from transformers.testing_utils import is_flaky, require_torch, torch_device, slow from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor @@ -251,79 +251,68 @@ def test_forward_signature(self): @is_flaky() def test_retain_grad_hidden_states_attentions(self): super().test_retain_grad_hidden_states_attentions() -# -# -# def prepare_batch(filename="train-batch.pt"): -# file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") -# batch = torch.load(file, map_location=torch_device) -# return batch -# -# -# @require_torch -# @slow -# class PatchTSTModelIntegrationTests(unittest.TestCase): -# def test_inference_no_head(self): -# model = PatchTSTModel.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) -# batch = prepare_batch() -# -# torch.manual_seed(0) -# with torch.no_grad(): -# output = model( -# past_values=batch["past_values"], -# past_time_features=batch["past_time_features"], -# past_observed_mask=batch["past_observed_mask"], -# static_categorical_features=batch["static_categorical_features"], -# future_values=batch["future_values"], -# future_time_features=batch["future_time_features"], -# ).last_hidden_state -# expected_shape = torch.Size((64, model.config.context_length, model.config.d_model)) -# self.assertEqual(output.shape, expected_shape) -# -# expected_slice = torch.tensor( -# [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]], -# device=torch_device, -# ) -# self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) -# -# def test_inference_head(self): -# model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) -# batch = prepare_batch("val-batch.pt") -# -# torch.manual_seed(0) -# with torch.no_grad(): -# output = model( -# past_values=batch["past_values"], -# past_time_features=batch["past_time_features"], -# past_observed_mask=batch["past_observed_mask"], -# static_categorical_features=batch["static_categorical_features"], -# future_time_features=batch["future_time_features"], -# ).encoder_last_hidden_state -# -# # encoder distils the context length to 1/8th of the original length -# expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model)) -# self.assertEqual(output.shape, expected_shape) -# -# expected_slice = torch.tensor( -# [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], device=torch_device -# ) -# self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) -# -# def test_seq_to_seq_generation(self): -# model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) -# batch 
= prepare_batch("val-batch.pt") -# -# torch.manual_seed(0) -# with torch.no_grad(): -# outputs = model.generate( -# static_categorical_features=batch["static_categorical_features"], -# past_time_features=batch["past_time_features"], -# past_values=batch["past_values"], -# future_time_features=batch["future_time_features"], -# past_observed_mask=batch["past_observed_mask"], -# ) -# expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) -# self.assertEqual(outputs.sequences.shape, expected_shape) -# -# expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) -# mean_prediction = outputs.sequences.mean(dim=1) -# self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) + + +def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): + file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") + batch = torch.load(file, map_location=torch_device) + return batch + + +@require_torch +@slow +class PatchTSTModelIntegrationTests(unittest.TestCase): + def test_pretrain_head(self): + model = PatchTSTForPretraining.from_pretrained('diepi/test_patchtst_pretrained_etth1').to(torch_device) + batch = prepare_batch() + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"].to(torch_device) + ).prediction_outputs + num_patch = (max(model.config.context_length, + model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 + expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor([[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], + [[0.0246]], [[0.0090]]], + device=torch_device) + self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) + + # def test_classification_head(self): + # # mock data, test + # model = PatchTSTForClassification.from_pretrained('diepi/test_patchtst_classification_mock').to(torch_device) + # batch = prepare_batch(repo_id="diepi/mock-data", file="test-mock-patchtst.pt") + # + # torch.manual_seed(0) + # with torch.no_grad(): + # output = model( + # past_values=batch["past_values"].to(torch_device) + # ).prediction_logits + # expected_shape = torch.Size([1, model.config.num_classes]) + # self.assertEqual(output.shape, expected_shape) + # + # expected_slice = torch.tensor([[-0.2774, -0.1081, 0.6771]], + # device=torch_device, + # ) + # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) + + def test_prediction_head(self): + model = PatchTSTForPrediction.from_pretrained('diepi/test_patchtst_prediction_etth1').to(torch_device) + batch = prepare_batch(file="test-batch.pt") + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"].to(torch_device), + future_values=batch["future_values"].to(torch_device) + ).prediction_outputs + expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + device=torch_device, + ) + self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From 00c2af6cca1d30a0f180f40ebd2fa214ddb44879 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 29 Aug 2023 23:39:56 -0400 Subject: [PATCH 023/189] Added PatchTSTForRegression + update unittest to 
include label generation --- src/transformers/__init__.py | 10 ++- src/transformers/models/auto/__init__.py | 4 + src/transformers/models/auto/modeling_auto.py | 11 ++- src/transformers/models/patchtst/__init__.py | 5 +- .../models/patchtst/configuration_patchtst.py | 6 ++ .../models/patchtst/modeling_patchtst.py | 90 +++++++++++++++++-- tests/models/bert/test_modeling_bert.py | 3 + .../models/patchtst/test_modeling_patchtst.py | 85 +++++++++++++++--- 8 files changed, 193 insertions(+), 21 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5b294f4218bb73..d6a15e2aeebf93 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1142,6 +1142,8 @@ "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING", "MODEL_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", + "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING", + "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING", "AutoBackbone", "AutoModel", "AutoModelForAudioClassification", @@ -1997,7 +1999,9 @@ "PatchTSTPreTrainedModel", "PatchTSTForPrediction", "PatchTSTForForecasting", - "PatchTSTForPretraining" + "PatchTSTForPretraining", + "PatchTSTForClassification", + "PatchTSTForRegression", ] ) _import_structure["models.instructblip"].extend( @@ -5163,6 +5167,8 @@ MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, AutoBackbone, AutoModel, AutoModelForAudioClassification, @@ -5866,6 +5872,8 @@ PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTPreTrainedModel, + PatchTSTForClassification, + PatchTSTForRegression, ) from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 12d79822fd1d43..6b13313fd6be73 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -76,6 +76,8 @@ "MODEL_WITH_LM_HEAD_MAPPING", "MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING", + "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING", + "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING", "AutoModel", "AutoBackbone", "AutoModelForAudioClassification", @@ -254,6 +256,8 @@ MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, AutoBackbone, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index d15f166fe6dc54..af27c099bceb77 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1110,7 +1110,13 @@ MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ - ("PatchTST", "PatchTSTForClassification"), + ("patchtst", "PatchTSTForClassification"), + ] +) + +MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES = OrderedDict( + [ + ("patchtst", "PatchTSTForRegression"), ] ) @@ -1203,6 +1209,9 @@ CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES ) +MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES +) class AutoModelForMaskGeneration(_BaseAutoModelClass): _model_mapping = MODEL_FOR_MASK_GENERATION_MAPPING diff --git a/src/transformers/models/patchtst/__init__.py 
b/src/transformers/models/patchtst/__init__.py index 35e2a01f166a29..c5836322f55b99 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -36,7 +36,9 @@ "PatchTSTPreTrainedModel", "PatchTSTForPrediction", "PatchTSTForForecasting", - "PatchTSTForPretraining" + "PatchTSTForPretraining", + "PatchTSTForRegression", + "PatchTSTForClassification" ] @@ -56,6 +58,7 @@ PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForClassification, + PatchTSTForRegression, PatchTSTPreTrainedModel, ) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 085d886d1bd86d..4fc0946a51ec74 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -172,6 +172,8 @@ def __init__( is_encoder_decoder: bool = False, encoder_layerdrop: float = 0.1, prediction_length: int = 24, + prediction_range: List = [0, 1], + target_dimension: int = 1, # PatchTST arguments attention_type: str = "prob", @@ -243,6 +245,10 @@ def __init__( # Forcasting self.prediction_length = prediction_length + # Regression + self.target_dimension = target_dimension + self.prediction_range = prediction_range + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) def _num_patches(self): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 87f36dd8eb5867..6932b87ff2792b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -944,8 +944,11 @@ def forward( past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels """ + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) model_output = self.model( - past_values) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token + past_values, output_hidden_states=output_hidden_states) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token x_hat = self.head(model_output[ 0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token @@ -972,13 +975,17 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values, future_values=None, output_hidden_states: Optional[bool] = None): - model_output = self.model(past_values) + def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output[0]) loss_val = None - if future_values is not None: - loss_val = self.loss(y_hat, future_values) + if labels is not None: + loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( loss=loss_val, prediction_logits=y_hat, @@ -1129,7 +1136,7 @@ class PatchTSTForPredictionOutput(ModelOutput): class PatchTSTForPrediction(PatchTSTPreTrainedModel): - # PatchTST model + classification head + # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1250,7 +1257,7 @@ def forward(self, x: torch.Tensor): 
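A minimal usage sketch of the classification path wired up above, assuming the classes and config arguments as they stand at this point in the series; every concrete value below is an arbitrary example rather than something taken from the patch:

import torch
from transformers import PatchTSTConfig, PatchTSTForClassification

# Arbitrary example sizes (assumptions, not values from this patch).
config = PatchTSTConfig(
    input_size=7,
    context_length=32,
    patch_length=8,
    stride=8,
    num_classes=3,
)
model = PatchTSTForClassification(config)

past_values = torch.randn(4, config.context_length, config.input_size)  # [bs x seq_len x n_vars]
labels = torch.randint(0, config.num_classes, (4,))                     # one class id per series

outputs = model(past_values=past_values, labels=labels)
print(outputs.loss, outputs.prediction_logits.shape)                    # logits: [bs x num_classes]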
class PatchTSTForForecasting(PatchTSTPreTrainedModel): - # PatchTST model + classification head + # PatchTST model + Forecasting head def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) @@ -1279,3 +1286,72 @@ def forward(self, hidden_states=model_output.hidden_states ) + +class RegressionHead(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.y_range = config.prediction_range + self.use_cls_token = config.use_cls_token + self.pooling = config.pooling + # self.is_flatten = is_flatten + + self.flatten = nn.Flatten(start_dim=1) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + input_dim = config.input_size * config.d_model + # if is_flatten: input_dim *= num_patch + self.linear = nn.Linear(input_dim, config.target_dimension) + + def forward(self, past_values): + """ + x: [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x output_dim] + """ + + if self.use_cls_token: + past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + elif self.pooling == 'mean': + past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] + elif self.pooling == 'max': + past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] + else: + raise Exception(f'pooling operator {self.pooling} is not implemented yet') + # flatten the input + past_values = self.flatten(past_values) # x: bs x nvars * d_model + y = self.linear(self.dropout(past_values)) # y: bs x output_dim + + if self.y_range: + y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] + + return y + + +class PatchTSTForRegression(PatchTSTPreTrainedModel): + # PatchTST model + Regression head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + self.model = PatchTSTModel(config) + self.head = RegressionHead(config) + self.loss = nn.MSELoss(reduction='mean') + + # Initialize weights and apply final processing + self.post_init() + + def forward(self, + past_values: torch.Tensor, + labels: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + model_output = self.model(past_values, output_hidden_states=output_hidden_states) + y_hat = self.head(model_output[0]) + loss_val = None + if labels is not None: + loss_val = self.loss(y_hat, labels) + return PatchTSTForForecastingOutput( + loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states + ) + \ No newline at end of file diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 9aec91367d8dda..dd0afca3ec81a2 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -485,6 +485,9 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_save_load(self): + super().test_save_load() + def test_model_various_embeddings(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() for type in ["absolute", "relative_key", "relative_key_query"]: diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index c9a6afa7315757..39951b9ae2f373 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ 
b/tests/models/patchtst/test_modeling_patchtst.py @@ -23,9 +23,10 @@ from transformers import is_torch_available from transformers.testing_utils import is_flaky, require_torch, torch_device, slow - +from transformers.models.auto import get_values +import random from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -33,8 +34,9 @@ if is_torch_available(): import torch - from transformers import PatchTSTConfig - from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining + from transformers import PatchTSTConfig, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING + from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, \ + PatchTSTForClassification, PatchTSTForRegression @@ -61,7 +63,9 @@ def __init__( lags_sequence=[1, 2, 3, 4, 5], sampling_factor=10, distil=False, - seed_number=42 + seed_number=42, + num_classes=2, + target_dimension=2, ): self.parent = parent self.batch_size = batch_size @@ -85,9 +89,11 @@ def __init__( sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length ) self.seed_number = seed_number + self.num_classes = num_classes + self.target_dimension = target_dimension self.sampling_factor = sampling_factor self.distil = distil - self.num_patch = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 def get_config(self): return PatchTSTConfig( @@ -103,7 +109,9 @@ def get_config(self): attention_dropout=self.attention_probs_dropout_prob, context_length=self.context_length, activation_function=self.hidden_act, - seed_number=self.seed_number + seed_number=self.seed_number, + num_classes=self.num_classes, + target_dimension=self.target_dimension ) def prepare_patchtst_inputs_dict(self, config): @@ -133,7 +141,16 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () + all_model_classes = ( + (PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForPretraining, + PatchTSTForClassification, + PatchTSTForRegression) + if is_torch_available() + else () + ) all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} is_encoder_decoder = False @@ -163,6 +180,22 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + # if classification model: + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): + rng = random.Random(self.model_tester.seed_number) + labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_classes, rng=rng) + inputs_dict["labels"] = labels + inputs_dict.pop("future_values") + elif model_class in 
get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): + rng = random.Random(self.model_tester.seed_number) + labels = floats_tensor([self.model_tester.batch_size, self.model_tester.target_dimension], rng=rng) + inputs_dict["labels"] = labels + inputs_dict.pop("future_values") + return inputs_dict + def test_save_load_strict(self): config, _ = self.model_tester.prepare_config_and_inputs() for model_class in self.all_model_classes: @@ -190,7 +223,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): ) self.assertEqual(len(hidden_states), expected_num_layers) - num_patch = self.model_tester.num_patch + num_patch = self.model_tester.num_patches self.assertListEqual( list(hidden_states[0].shape[-2:]), [num_patch, self.model_tester.hidden_size], @@ -211,12 +244,13 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) # # # Ignore since we have no tokens embeddings + def test_resize_tokens_embeddings(self): pass def test_model_outputs_equivalence(self): pass -# + def test_determinism(self): pass @@ -239,7 +273,10 @@ def test_forward_signature(self): "past_values", "future_values", ] - + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or \ + model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): + expected_arg_names.remove("future_values") + expected_arg_names.append("labels") expected_arg_names.extend( [ "output_hidden_states", @@ -316,3 +353,29 @@ def test_prediction_head(self): device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) + + # def test_seq_to_seq_generation(self): + # model = PatchTSTForPrediction.from_pretrained("diepi/test_patchtst_prediction_etth1").to(torch_device) + # batch = prepare_batch("val-batch.pt") + # + # torch.manual_seed(0) + # with torch.no_grad(): + # outputs = model.generate( + # past_values=batch["past_values"].to(torch_device), + # future_values=batch["future_values"].to(torch_device) + # ).prediction_outputs + # expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) + # # self.assertEqual(outputs.sequences.shape, expected_shape) + # # + # # expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) + # # mean_prediction = outputs.sequences.mean(dim=1) + # # self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) + # + # # expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) + # self.assertEqual(outputs.shape, expected_shape) + # + # expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + # device=torch_device, + # ) + # self.assertTrue(torch.allclose(outputs[0, :1, :7], expected_slice, atol=TOLERANCE)) + From 5802f073917d479efe5de230a2e4e69ac759fc6a Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 29 Aug 2023 23:43:34 -0400 Subject: [PATCH 024/189] Revert unrelated model test file --- src/transformers/models/patchtst/modeling_patchtst.py | 1 - tests/models/bert/test_modeling_bert.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6932b87ff2792b..d48c21d5c9889d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1354,4 +1354,3 @@ def forward(self, forecast_outputs=y_hat, 
hidden_states=model_output.hidden_states ) - \ No newline at end of file diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index dd0afca3ec81a2..9aec91367d8dda 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -485,9 +485,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_save_load(self): - super().test_save_load() - def test_model_various_embeddings(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() for type in ["absolute", "relative_key", "relative_key_query"]: From 3a6643804057d3a45bb04956a3212c4067fc899c Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 30 Aug 2023 00:15:38 -0400 Subject: [PATCH 025/189] Combine similar output classes --- .../models/patchtst/modeling_patchtst.py | 55 +++++-------------- .../models/patchtst/test_modeling_patchtst.py | 6 +- 2 files changed, 16 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d48c21d5c9889d..fc2b0ea188dd70 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -893,16 +893,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class PatchTSTForPreTrainingOutput(ModelOutput): +class PatchTSTOutput(ModelOutput): """ - Output type of [`BertForPreTraining`]. + Output type of [`PatchTSTForPredictiontion`]. Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, nvars, num_patches, patch_length )`): - Prediction outputs of the modeling head. + MSE loss. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
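The regression head added earlier in this series is driven the same way, with float `labels` of shape `[batch_size, target_dimension]`; a hedged sketch, using the class and argument names as they appear in the patch and arbitrary sizes:

import torch
from transformers import PatchTSTConfig, PatchTSTForRegression

# Arbitrary example sizes (assumptions, not values from this patch).
config = PatchTSTConfig(input_size=7, context_length=32, patch_length=8, stride=8, target_dimension=2)
model = PatchTSTForRegression(config)

past_values = torch.randn(4, config.context_length, config.input_size)  # [bs x seq_len x n_vars]
labels = torch.randn(4, config.target_dimension)                        # float targets, one row per series

outputs = model(past_values=past_values, labels=labels)
print(outputs.loss)                                                     # mean-squared error against `labels`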
@@ -917,7 +916,7 @@ class PatchTSTForPreTrainingOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_outputs: torch.FloatTensor = None + prediction_output: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -939,7 +938,7 @@ def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None - ) -> PatchTSTForPreTrainingOutput: + ) -> PatchTSTOutput: """ past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels @@ -956,9 +955,9 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTForPreTrainingOutput( + return PatchTSTOutput( loss=masked_loss, - prediction_outputs=x_hat, + prediction_output=x_hat, hidden_states=model_output.hidden_states ) @@ -1107,34 +1106,6 @@ def forward(self, x: torch.Tensor): return x -class PatchTSTForPredictionOutput(ModelOutput): - """ - Output type of [`PatchTSTForPredictiontion`]. - - Args: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
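The masked reconstruction loss computed above only counts patches that were actually masked; a tiny self-contained numeric check of that reduction (toy tensors, not values from any test):

import torch

# Toy shapes: [bs x n_vars x num_patches x patch_length]
x_hat = torch.zeros(1, 1, 3, 4)
target = torch.ones(1, 1, 3, 4)
mask = torch.tensor([[[1.0, 0.0, 1.0]]])   # patches 0 and 2 were masked, patch 1 was not

loss = torch.nn.MSELoss(reduction="none")(x_hat, target)               # per-element squared error, all 1.0 here
masked_loss = (loss.mean(dim=-1) * mask).sum() / (mask.sum() + 1e-10)
print(masked_loss)                                                     # ~1.0: the unmasked patch does not contribute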
- """ - - loss: Optional[torch.FloatTensor] = None - prediction_outputs: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - class PatchTSTForPrediction(PatchTSTPreTrainedModel): # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): @@ -1160,9 +1131,9 @@ def forward(self, loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) - return PatchTSTForPredictionOutput( + return PatchTSTOutput( loss=loss_val, - prediction_outputs=y_hat, + prediction_output=y_hat, hidden_states=model_output.hidden_states ) @@ -1349,8 +1320,8 @@ def forward(self, loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTForForecastingOutput( + return PatchTSTOutput( loss=loss_val, - forecast_outputs=y_hat, + prediction_output=y_hat, hidden_states=model_output.hidden_states ) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 39951b9ae2f373..778bbda0e6bb5a 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -307,7 +307,7 @@ def test_pretrain_head(self): with torch.no_grad(): output = model( past_values=batch["past_values"].to(torch_device) - ).prediction_outputs + ).prediction_output num_patch = (max(model.config.context_length, model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) @@ -345,7 +345,7 @@ def test_prediction_head(self): output = model( past_values=batch["past_values"].to(torch_device), future_values=batch["future_values"].to(torch_device) - ).prediction_outputs + ).prediction_output expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) self.assertEqual(output.shape, expected_shape) @@ -363,7 +363,7 @@ def test_prediction_head(self): # outputs = model.generate( # past_values=batch["past_values"].to(torch_device), # future_values=batch["future_values"].to(torch_device) - # ).prediction_outputs + # ).prediction_output # expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) # # self.assertEqual(outputs.sequences.shape, expected_shape) # # From 00ddf8d81612e5dbd2b59e003b83b38844c8e816 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 30 Aug 2023 14:14:27 +0700 Subject: [PATCH 026/189] update PredictionHead --- .../models/patchtst/modeling_patchtst.py | 66 +++++++------------ 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index fc2b0ea188dd70..ece30f97c4b372 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1052,58 +1052,41 @@ class PatchTSTForClassificationOutput(ModelOutput): class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.individual = config.individual - self.n_vars = config.input_size + + self.target_dimension = config.target_dimension self.use_cls_token = config.use_cls_token self.pooling = config.pooling - head_dimension = config.d_model if config.pooling else config.d_model * config.num_patches - if self.individual: - self.linears = nn.ModuleList() - self.dropouts = nn.ModuleList() - self.flattens = nn.ModuleList() - for i in 
range(self.n_vars): - self.flattens.append(nn.Flatten(start_dim=2)) - self.linears.append(nn.Linear(head_dimension, config.prediction_length)) - self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - ) - else: - self.flatten = nn.Flatten(start_dim=2) - self.linear = nn.Linear(head_dimension, config.prediction_length) - self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + head_dim = config.input_size * config.d_model - def forward(self, x: torch.Tensor): + self.flatten = nn.Flatten(start_dim=1) + self.linear = nn.Linear(head_dim, config.prediction_length * config.target_dimension) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + + def forward(self, x): """ - x: [bs x nvars x num_patches x d_model] - or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: [bs x forecast_len x nvars] + x: [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x pred_len x target_dimension] """ + batch_size = x.shape[0] if self.use_cls_token: - y = x[:, :, 0, :] # y: [bs x nvars x d_model] + x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + elif self.pooling == 'mean': + x = x.mean(dim=2) # x: [bs x nvars x d_model] + elif self.pooling == 'max': + x = x.max(dim=2) # x: [bs x nvars x d_model] else: - if self.pooling == 'mean': - y = x.mean(dim=2) # y: [bs x nvars x d_model] - elif self.pooling == 'max': - y = x.max(dim=2) # y: [bs x nvars x d_model] - else: - y = x # y: [bs x nvars x num_patches x d_model] + raise Exception(f'pooling operator {self.pooling} is not implemented yet') - if self.individual: - x_out = [] - for i in range(self.n_vars): - z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] - z = self.linears[i](z) # z: [bs x forecast_len] - z = self.dropouts[i](z) - x_out.append(z) - x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] - else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] - z = self.dropout(z) - x = self.linear(z) # x: [bs x nvars x forecast_len] + # flatten the input + x = self.flatten(x) # x: bs x (nvars * d_model) + y = self.linear(self.dropout(x)) # y: bs x (pred_len * target_dimension) - x = x.transpose(2, 1) # [bs x forecast_len x nvars] + # reshape the data + y = y.reshape(batch_size, -1, self.target_dimension) # [bs x pred_len x target_dimension] + return y - return x class PatchTSTForPrediction(PatchTSTPreTrainedModel): @@ -1122,6 +1105,7 @@ def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None): + output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) From 78c26f2f7e36236cc2a6ad6699d9e2949d64257b Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 30 Aug 2023 14:14:39 +0700 Subject: [PATCH 027/189] Update configuration_patchtst.py --- .../models/patchtst/configuration_patchtst.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 4fc0946a51ec74..9a3bd99c2d3414 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -163,7 +163,7 @@ def __init__( pooling: str = 'mean', 
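To make the flatten/reshape in the rewritten PredictionHead above concrete, here is a hedged shape walkthrough with arbitrary toy dimensions; the patch-count arithmetic mirrors the formula used in the tests earlier in this series:

import torch
from torch import nn

# Arbitrary toy dimensions (assumptions for illustration only).
bs, n_vars, d_model = 2, 7, 16
context_length, patch_length, stride = 512, 12, 12
prediction_length, target_dimension = 96, 1

num_patches = (max(context_length, patch_length) - patch_length) // stride + 1  # (512 - 12) // 12 + 1 = 42

x = torch.randn(bs, n_vars, num_patches, d_model)       # encoder output
x = x.mean(dim=2)                                       # mean-pool over patches -> [bs, n_vars, d_model]
x = nn.Flatten(start_dim=1)(x)                          # -> [bs, n_vars * d_model]
y = nn.Linear(n_vars * d_model, prediction_length * target_dimension)(x)
y = y.reshape(bs, -1, target_dimension)                 # -> [bs, prediction_length, target_dimension]
print(y.shape)                                          # torch.Size([2, 96, 1])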
num_classes: int = 1, head_dropout: float = 0.0, - proj_dropout: float = 0.0, + # proj_dropout: float = 0.0, qkv_bias: bool = True, num_dynamic_real_features: int = 0, num_static_real_features: int = 0, @@ -211,9 +211,8 @@ def __init__( self.positional_encoding = positional_encoding self.learn_pe = learn_pe self.use_cls_token = use_cls_token - # self.patch_last = patch_last - self.individual = individual self.init_std = init_std + self.qkv_bias = qkv_bias # PatchTST self.patch_length = patch_length @@ -235,14 +234,16 @@ def __init__( self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value - # Classification + # general head params + self.individual = individual self.pooling = pooling - self.num_classes = num_classes self.head_dropout = head_dropout - self.proj_dropout = proj_dropout - self.qkv_bias = qkv_bias - # Forcasting + # Classification + self.num_classes = num_classes + # self.proj_dropout = proj_dropout + + # Forcasting and prediction self.prediction_length = prediction_length # Regression From 5f7c1a06e1d0fe8982d95f8409f0cffc6a9bef22 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 30 Aug 2023 11:20:00 -0400 Subject: [PATCH 028/189] Add Revin --- .../models/patchtst/configuration_patchtst.py | 4 +- .../models/patchtst/modeling_patchtst.py | 121 ++++++++++++++++-- 2 files changed, 113 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 9a3bd99c2d3414..feb6d324d52565 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -150,7 +150,8 @@ def __init__( use_cls_token: bool = False, init_std: float = 0.02, individual: bool = False, - seed_number= None, + seed_number: int = None, + revin: Optional[bool] = True, mask_input: Optional[bool] = None, mask_type: str = "random", mask_ratio=0.5, @@ -213,6 +214,7 @@ def __init__( self.use_cls_token = use_cls_token self.init_std = init_std self.qkv_bias = qkv_bias + self.revin = revin # PatchTST self.patch_length = patch_length diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ece30f97c4b372..a03ed0c17c1ef4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -826,14 +826,75 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): hidden_states: Optional[Tuple[torch.FloatTensor]] = None patched_input: torch.FloatTensor = None mask: torch.FloatTensor = None + revin_mean: torch.FloatTensor = None + revin_std: torch.FloatTensor = None + + +class RevIN(nn.Module): + def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): + """ + :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x n_vars] + :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm input here. 
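Conceptually, the RevIN block being added here standardizes each input series with its own mean and standard deviation before patching and restores them on the way out; a plain-torch sketch of the two modes (toy tensor, `eps` as in the default above):

import torch

x = torch.randn(4, 32, 7)                               # [bs x seq_len x n_vars], toy values
mean = x.mean(dim=1, keepdim=True).detach()
stdev = torch.sqrt(x.var(dim=1, keepdim=True, unbiased=False) + 1e-5).detach()

normed = (x - mean) / stdev                             # "norm" mode: fed to the patcher/encoder
restored = normed * stdev + mean                        # "denorm" mode: applied to the model's outputs
print(torch.allclose(restored, x, atol=1e-5))           # True up to floating-point error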
+ """ + super(RevIN, self).__init__() + self.stdev = None + self.mean = None + self.start_dim = start_dim + self.denorm_channels = denorm_channels + self.eps = eps + + def set_statistics(self, mean, stdev): + self.mean = mean + self.stdev = stdev + + def forward(self, x, mode: str): + if mode == 'norm': + self._get_statistics(x) + x = self._normalize(x) + elif mode == 'denorm': + x = self._denormalize(x) + elif mode == "transform": + x = self._normalize(x) + + else: + raise NotImplementedError + return x + + def _get_statistics(self, x): + dim2reduce = tuple(range(self.start_dim, x.ndim - 1)) + self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() + self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() + + def _normalize(self, x): + x = x - self.mean + x = x / self.stdev + return x + + def _denormalize(self, x): + + if self.denorm_channels is None: + x = x * self.stdev + x = x + self.mean + else: + x = x * self.stdev[..., self.denorm_channels] + x = x + self.mean[..., self.denorm_channels] + + return x # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): - def __init__(self, config: PatchTSTConfig, mask_input: bool = False): + def __init__(self, config: PatchTSTConfig): super().__init__(config) + self.use_revin = config.revin + + if self.use_revin: + self.revin = RevIN() + else: + self.revin = nn.Identity() + self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) - self.mask_input = mask_input # config.mask_input + self.mask_input = config.mask_input if self.mask_input: self.masking = PatchMasking( @@ -860,6 +921,9 @@ def forward(self, output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + + past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] + patched_values = self.patching( past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain if self.mask_input: @@ -870,7 +934,9 @@ def forward(self, return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, hidden_states=encoder_output.hidden_states, patched_input=patched_values, - mask=mask + mask=mask, + revin_mean=self.revin.mean, + revin_stdev=self.revin.stdev ) @@ -926,8 +992,8 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - # config.mask_input = True - self.model = PatchTSTModel(config=config, mask_input=True) + config.mask_input = True + self.model = PatchTSTModel(config=config) self.head = PretrainHead(config) self.loss = torch.nn.MSELoss(reduction='none') @@ -946,10 +1012,14 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model( - past_values, output_hidden_states=output_hidden_states) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token - x_hat = self.head(model_output[ - 0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token + + # past_values: [bs x nvars x num_patches x d_model] or + # [bs x nvars x (num_patches+1) x 
d_model] if use cls_token + model_output = self.model(past_values, output_hidden_states=output_hidden_states) + + # model_output[0]: [bs x nvars x num_patches x patch_length] or + # [bs x nvars x (num_patches+1) x patch_length] if use cls_token + x_hat = self.head(model_output[0]) # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) @@ -1097,6 +1167,11 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) self.loss = nn.MSELoss(reduction='mean') + self.use_revin = config.revin + if self.use_revin: + self.revin = RevIN() + else: + self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1112,6 +1187,10 @@ def forward(self, model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) + if self.use_revin: + self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) + y_hat = self.revin(y_hat, mode="denorm") + loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) @@ -1218,6 +1297,11 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction='mean') + self.use_revin = config.revin + if self.use_revin: + self.revin = RevIN() + else: + self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1230,7 +1314,12 @@ def forward(self, output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output[0]) + + y_hat = self.head(model_output.last_hidden_state) + + if self.use_revin: + self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) + y_hat = self.revin(y_hat, mode="denorm") loss_val = None if future_values is not None: @@ -1288,6 +1377,11 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = RegressionHead(config) self.loss = nn.MSELoss(reduction='mean') + self.use_revin = config.revin + if self.use_revin: + self.revin = RevIN() + else: + self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1300,7 +1394,12 @@ def forward(self, output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output[0]) + y_hat = self.head(model_output.last_hidden_state) + + if self.use_revin: + self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) + y_hat = self.revin(y_hat, mode="denorm") + loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) From ac8882e7caefc781b3cb784e483b6e4a76044425 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 30 Aug 2023 23:31:32 +0700 Subject: [PATCH 029/189] small edit to PatchTSTModelOutputWithNoAttention --- .../models/patchtst/modeling_patchtst.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a03ed0c17c1ef4..330c79b71c1f0c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -812,14 +812,20 @@ class 
PatchTSTModelOutputWithNoAttention(ModelOutput): Base class for model's outputs, with potential hidden states. Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - patched_input + patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): + patched input to the Transformer + mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) + Bool masked tensor indicating which patches are masked + revin_mean: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length + revin_std: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + std of the input data (batch_size, sequence_length, num_channels) over the sequence_length """ last_hidden_state: torch.FloatTensor = None @@ -935,11 +941,10 @@ def forward(self, hidden_states=encoder_output.hidden_states, patched_input=patched_values, mask=mask, - revin_mean=self.revin.mean, - revin_stdev=self.revin.stdev + revin_mean=self.revin.mean if self.use_revin else None, + revin_stdev=self.revin.stdev if self.use_revin else None ) - class PretrainHead(nn.Module): def __init__(self, config): super().__init__() From 2457e587e109d47ff6cf99d0e0112a02c0cf5461 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 1 Sep 2023 00:30:42 +0700 Subject: [PATCH 030/189] Update modeling_patchtst.py --- .../models/patchtst/modeling_patchtst.py | 30 ++++--------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 330c79b71c1f0c..891bd1cce54788 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -38,6 +38,7 @@ ] + class PatchTSTAttention(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -850,6 +851,7 @@ def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): self.eps = eps def set_statistics(self, mean, stdev): + # get statistics self.mean = mean self.stdev = stdev @@ -861,7 +863,6 @@ def forward(self, x, mode: str): x = self._denormalize(x) elif mode == "transform": x = self._normalize(x) - else: raise NotImplementedError return x @@ -877,7 +878,7 @@ def _normalize(self, x): return x def _denormalize(self, x): - + # denormalize the data if self.denorm_channels is None: x = x * self.stdev x = x + self.mean @@ -945,7 +946,7 @@ def forward(self, revin_stdev=self.revin.stdev if self.use_revin else None ) -class PretrainHead(nn.Module): +class MaskPretrainHead(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(config.dropout) @@ -992,14 +993,14 @@ class PatchTSTOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = 
None -class PatchTSTForPretraining(PatchTSTPreTrainedModel): +class PatchTSTForMaskPretraining(PatchTSTPreTrainedModel): # PatchTSTModel + Pretraining Head def __init__(self, config: PatchTSTConfig): super().__init__(config) config.mask_input = True self.model = PatchTSTModel(config=config) - self.head = PretrainHead(config) + self.head = MaskPretrainHead(config) self.loss = torch.nn.MSELoss(reduction='none') # Initialize weights and apply final processing @@ -1172,11 +1173,6 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) self.loss = nn.MSELoss(reduction='mean') - self.use_revin = config.revin - if self.use_revin: - self.revin = RevIN() - else: - self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1192,10 +1188,6 @@ def forward(self, model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) - if self.use_revin: - self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) - y_hat = self.revin(y_hat, mode="denorm") - loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) @@ -1356,7 +1348,6 @@ def forward(self, past_values): or [bs x nvars x (num_patch+1) x d_model] if use cls_token output: [bs x output_dim] """ - if self.use_cls_token: past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] elif self.pooling == 'mean': @@ -1382,11 +1373,6 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = RegressionHead(config) self.loss = nn.MSELoss(reduction='mean') - self.use_revin = config.revin - if self.use_revin: - self.revin = RevIN() - else: - self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1401,10 +1387,6 @@ def forward(self, model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) - if self.use_revin: - self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) - y_hat = self.revin(y_hat, mode="denorm") - loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) From f1658b231c29f2e6f13ef0a6a614e6e365062ed6 Mon Sep 17 00:00:00 2001 From: Ngoc Diep Do Date: Thu, 31 Aug 2023 17:17:53 +0200 Subject: [PATCH 031/189] Updating integration test for forecasting --- tests/models/patchtst/test_modeling_patchtst.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 778bbda0e6bb5a..364306fc8d0256 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -313,8 +313,8 @@ def test_pretrain_head(self): expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], - [[0.0246]], [[0.0090]]], + expected_slice = torch.tensor([[[0.0160]], [[0.0148]], [[0.0090]], [[0.0166]], [[0.0099]], + [[0.0053]], [[0.0090]]], device=torch_device) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) @@ -336,8 +336,8 @@ def test_pretrain_head(self): # ) # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) - def 
test_prediction_head(self): - model = PatchTSTForPrediction.from_pretrained('diepi/test_patchtst_prediction_etth1').to(torch_device) + def test_forecasting_head(self): + model = PatchTSTForForecasting.from_pretrained('diepi/test_patchtst_forecasting_etth1').to(torch_device) batch = prepare_batch(file="test-batch.pt") torch.manual_seed(0) @@ -345,11 +345,11 @@ def test_prediction_head(self): output = model( past_values=batch["past_values"].to(torch_device), future_values=batch["future_values"].to(torch_device) - ).prediction_output + ).forecast_outputs expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + expected_slice = torch.tensor([[-0.9027, 0.3814, -0.8322, 0.4250, -0.7183, -0.0635, -0.8747]], device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From 43707d7b964a68da93e398f93d4a45b8fd49b35c Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Fri, 1 Sep 2023 13:43:34 -0400 Subject: [PATCH 032/189] Fix unittest after class structure changed --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.md | 2 + docs/source/en/model_doc/patchtst.md | 24 ++ src/transformers/__init__.py | 56 +-- src/transformers/models/__init__.py | 2 +- src/transformers/models/auto/__init__.py | 4 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/patchtst/__init__.py | 12 +- .../models/patchtst/configuration_patchtst.py | 9 +- .../models/patchtst/modeling_patchtst.py | 353 +++++++++--------- src/transformers/utils/dummy_pt_objects.py | 58 +++ .../models/patchtst/test_modeling_patchtst.py | 85 +++-- utils/check_repo.py | 3 + 19 files changed, 356 insertions(+), 260 deletions(-) diff --git a/README.md b/README.md index 5253b491bae5b2..a7246572381451 100644 --- a/README.md +++ b/README.md @@ -428,6 +428,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. 
**[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_es.md b/README_es.md index bcd84333ef99f8..62085093026a87 100644 --- a/README_es.md +++ b/README_es.md @@ -405,6 +405,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_hd.md b/README_hd.md index d87ef37e8b23bb..5e93de459461a7 100644 --- a/README_hd.md +++ b/README_hd.md @@ -377,6 +377,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. 
**[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया। +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा। 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index c6b9fb0d790e4e..1067b2e57a25ee 100644 --- a/README_ja.md +++ b/README_ja.md @@ -439,6 +439,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. 
Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) diff --git a/README_ko.md b/README_ko.md index 5d2056e4f7207b..202d3d4893561a 100644 --- a/README_ko.md +++ b/README_ko.md @@ -354,6 +354,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 23ecd11c23218d..8fe1633f181115 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -378,6 +378,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。 +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 3c05c1962f5114..9c615363e61a81 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -390,6 +390,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ee2d984c981341..b9f65477b5fe1c 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -194,6 +194,7 @@ The documentation is organized into five sections: 1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[PatchTST](model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. @@ -414,6 +415,7 @@ Flax), PyTorch, and/or TensorFlow. | OpenLlama | ✅ | ❌ | ❌ | | OPT | ✅ | ✅ | ✅ | | OWL-ViT | ✅ | ❌ | ❌ | +| PatchTST | ✅ | ❌ | ❌ | | Pegasus | ✅ | ✅ | ✅ | | PEGASUS-X | ✅ | ❌ | ❌ | | Perceiver | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 14523d65c70f3d..9a30b8294571b0 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -47,4 +47,28 @@ The original code can be found [here](). 
## PatchTSTForPrediction [[autodoc]] PatchTSTForPrediction + - forward + + +## PatchTSTForForecasting + +[[autodoc]] PatchTSTForForecasting + - forward + + +## PatchTSTForClassification + +[[autodoc]] PatchTSTForClassification + - forward + + +## PatchTSTForMaskPretraining + +[[autodoc]] PatchTSTForMaskPretraining + - forward + + +## PatchTSTForRegression + +[[autodoc]] PatchTSTForRegression - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d6a15e2aeebf93..9e4fab76a4eba4 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -372,7 +372,6 @@ ], "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"], "models.informer": ["INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "InformerConfig"], - "models.patchtst": ["PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP", "PatchTSTConfig"], "models.instructblip": [ "INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "InstructBlipConfig", @@ -467,6 +466,7 @@ "OwlViTTextConfig", "OwlViTVisionConfig", ], + "models.patchtst": ["PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP", "PatchTSTConfig"], "models.pegasus": ["PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusConfig", "PegasusTokenizer"], "models.pegasus_x": ["PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusXConfig"], "models.perceiver": ["PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PerceiverConfig", "PerceiverTokenizer"], @@ -1133,6 +1133,8 @@ "MODEL_FOR_TEXT_ENCODING_MAPPING", "MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING", "MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING", + "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING", + "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", @@ -1142,8 +1144,6 @@ "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING", "MODEL_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", - "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING", - "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING", "AutoBackbone", "AutoModel", "AutoModelForAudioClassification", @@ -1992,18 +1992,6 @@ "InformerPreTrainedModel", ] ) - _import_structure["models.patchtst"].extend( - [ - "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", - "PatchTSTModel", - "PatchTSTPreTrainedModel", - "PatchTSTForPrediction", - "PatchTSTForForecasting", - "PatchTSTForPretraining", - "PatchTSTForClassification", - "PatchTSTForRegression", - ] - ) _import_structure["models.instructblip"].extend( [ "INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2420,6 +2408,18 @@ "OwlViTVisionModel", ] ) + _import_structure["models.patchtst"].extend( + [ + "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", + "PatchTSTForClassification", + "PatchTSTForForecasting", + "PatchTSTForMaskPretraining", + "PatchTSTForPrediction", + "PatchTSTForRegression", + "PatchTSTModel", + "PatchTSTPreTrainedModel", + ] + ) _import_structure["models.pegasus"].extend( ["PegasusForCausalLM", "PegasusForConditionalGeneration", "PegasusModel", "PegasusPreTrainedModel"] ) @@ -4477,7 +4477,6 @@ ) from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig from .models.informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig - from .models.patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, InstructBlipConfig, @@ -4562,6 +4561,7 @@ OwlViTTextConfig, OwlViTVisionConfig, ) + from .models.patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig from .models.pegasus import 
PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig, PegasusTokenizer from .models.pegasus_x import PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusXConfig from .models.perceiver import PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP, PerceiverConfig, PerceiverTokenizer @@ -5158,6 +5158,8 @@ MODEL_FOR_TEXT_ENCODING_MAPPING, MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING, MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING, + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, @@ -5167,8 +5169,6 @@ MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, - MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, AutoBackbone, AutoModel, AutoModelForAudioClassification, @@ -5865,16 +5865,6 @@ InformerModel, InformerPreTrainedModel, ) - from .models.patchtst import ( - PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, - PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, - PatchTSTForPretraining, - PatchTSTPreTrainedModel, - PatchTSTForClassification, - PatchTSTForRegression, - ) from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST, InstructBlipForConditionalGeneration, @@ -6211,6 +6201,16 @@ OwlViTTextModel, OwlViTVisionModel, ) + from .models.patchtst import ( + PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, + PatchTSTForClassification, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForPrediction, + PatchTSTForRegression, + PatchTSTModel, + PatchTSTPreTrainedModel, + ) from .models.pegasus import ( PegasusForCausalLM, PegasusForConditionalGeneration, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 3b958ac5c1df40..cf2a6ce94d37a3 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -105,7 +105,6 @@ idefics, imagegpt, informer, - patchtst, instructblip, jukebox, layoutlm, @@ -152,6 +151,7 @@ openai, opt, owlvit, + patchtst, pegasus, pegasus_x, perceiver, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 6b13313fd6be73..c606cb6c0f967b 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -249,6 +249,8 @@ MODEL_FOR_TEXT_ENCODING_MAPPING, MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING, MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING, + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, @@ -256,8 +258,6 @@ MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, - MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, AutoBackbone, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index af27c099bceb77..4b1f7b43685da1 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1213,6 +1213,7 @@ CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES ) + class AutoModelForMaskGeneration(_BaseAutoModelClass): _model_mapping = MODEL_FOR_MASK_GENERATION_MAPPING diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 
c5836322f55b99..8979bed2341ab2 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -36,9 +36,9 @@ "PatchTSTPreTrainedModel", "PatchTSTForPrediction", "PatchTSTForForecasting", - "PatchTSTForPretraining", + "PatchTSTForMaskPretraining", "PatchTSTForRegression", - "PatchTSTForClassification" + "PatchTSTForClassification", ] @@ -53,12 +53,12 @@ else: from .modeling_patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, - PatchTSTForPretraining, - PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, PatchTSTForClassification, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForPrediction, PatchTSTForRegression, + PatchTSTModel, PatchTSTPreTrainedModel, ) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index feb6d324d52565..fbdca6db377155 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -14,7 +14,7 @@ # limitations under the License. """PatchTST model configuration""" -from typing import List, Optional, Union +from typing import List, Optional from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -161,7 +161,7 @@ def __init__( d_size: str = "4D", unmasked_channel_indices: list = None, mask_value=0, - pooling: str = 'mean', + pooling: str = "mean", num_classes: int = 1, head_dropout: float = 0.0, # proj_dropout: float = 0.0, @@ -175,17 +175,15 @@ def __init__( prediction_length: int = 24, prediction_range: List = [0, 1], target_dimension: int = 1, - # PatchTST arguments attention_type: str = "prob", sampling_factor: int = 5, distil: bool = True, **kwargs, ): - # time series specific configuration self.context_length = context_length - self.input_size = input_size # n_vars + self.input_size = input_size # n_vars self.num_time_features = num_time_features self.num_dynamic_real_features = num_dynamic_real_features self.num_static_real_features = num_static_real_features @@ -256,4 +254,3 @@ def __init__( def _num_patches(self): return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 - diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 891bd1cce54788..7b485769d7f0c4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 TSFM team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,19 +14,20 @@ # limitations under the License. 
""" PyTorch PatchTST model.""" -from typing import Optional, Tuple -import torch -from torch import nn import math import random +from typing import Optional, Tuple + import numpy as np +import torch +from torch import nn +from torch.nn.modules.activation import MultiheadAttention -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, logging from transformers.modeling_outputs import BaseModelOutputWithNoAttention -from transformers.utils import ModelOutput -from torch.nn.modules.activation import MultiheadAttention +from transformers.modeling_utils import PreTrainedModel from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +from transformers.utils import ModelOutput, add_start_docstrings, logging + logger = logging.get_logger(__name__) @@ -38,7 +39,6 @@ ] - class PatchTSTAttention(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -85,7 +85,7 @@ def forward(self, x): def positional_encoding(pe, learn_pe, q_len, d_model): # Positional encoding - if pe == None: + if pe is None: w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False @@ -131,9 +131,8 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= i = 0 for i in range(100): cpe = ( - 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * ( - torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - - 1 + 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) + - 1 ) if abs(cpe.mean()) <= eps: @@ -161,16 +160,17 @@ def set_seed(x=42): random.seed(x) np.random.seed(x) torch.manual_seed(x) - if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(x) def random_masking( - xb: torch.Tensor, - mask_ratio: float, - unmasked_channel_indices: list = None, - channel_consistent_masking: bool = False, - mask_value=0, - seed_number: Optional[int] = None + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = False, + mask_value=0, + seed_number: Optional[int] = None, ): """random_masking: Mask the input considering the control variables. @@ -178,13 +178,15 @@ def random_masking( seed_number (int, optional): Value to set for the seed number xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. - unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + unmasked_channel_indices (list, optional): + indices of unmasked channels. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): + When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary + across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. 
Returns: - Tensor: xb_mask, masked input, same shape as input - Tensor: Mask tensor of shape [bs x c x n] + Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] """ if seed_number: set_seed(seed_number) @@ -221,26 +223,25 @@ def compute_num_patches(sequence_length, patch_length, stride): class Patchify(nn.Module): """ - A class to patchify the time series sequence into different patches Args: - sequence_length (int, required): input sequence length - patch_length (int, required): patch length - stride (int, required): stride between patches + A class to patchify the time series sequence into different patches + sequence_length (int, required): input sequence length patch_length (int, required): patch length stride (int, + required): stride between patches Returns: z: output tensor data [bs x n_vars x num_patches x patch_length] """ def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + self, + sequence_length: int, + patch_length: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" self.sequence_length = sequence_length @@ -260,9 +261,11 @@ def forward(self, past_values: torch.Tensor): x: output tensor data [bs x n_vars x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + assert ( + sequence_length == self.sequence_length + ), f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." 
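# A quick standalone check of the patching logic above: `unfold` over the time dimension yields
# exactly `compute_num_patches(...)` patches of length `patch_length`. Values are illustrative only.
import torch

batch_size, sequence_length, n_vars = 2, 32, 3
patch_length, stride = 8, 8
past_values = torch.randn(batch_size, sequence_length, n_vars)

# same formula as compute_num_patches / PatchTSTConfig._num_patches
num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1

patches = past_values.unfold(dimension=-2, size=patch_length, step=stride)
# -> [bs x num_patches x n_vars x patch_length]
patches = patches.transpose(1, 2).contiguous()
# -> [bs x n_vars x num_patches x patch_length], the layout consumed by the encoder
assert patches.shape == (batch_size, n_vars, num_patches, patch_length)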
- x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # x: [bs x num_patches x n_vars x patch_length] @@ -272,26 +275,19 @@ def forward(self, past_values: torch.Tensor): class PatchEmbeddings(nn.Module): """ - A class to patchify the time series sequence into different patches Args: - sequence_length (int, required): input sequence length - patch_length (int, required): patch length - stride (int, required): stride between patches + A class to patchify the time series sequence into different patches + sequence_length (int, required): input sequence length patch_length (int, required): patch length stride (int, + required): stride between patches Returns: embeddings: output tensor data [bs x n_vars x num_patches x embed_dim] """ - def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - embed_dim: int - ): + def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_dim: int): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" @@ -307,11 +303,12 @@ def __init__( self.s_begin = sequence_length - new_sequence_length # Embedding - self.projection = nn.Conv1d(in_channels=1, - out_channels=embed_dim, - kernel_size=patch_length, - stride=stride, - ) + self.projection = nn.Conv1d( + in_channels=1, + out_channels=embed_dim, + kernel_size=patch_length, + stride=stride, + ) def forward(self, past_values: torch.Tensor): """ @@ -321,16 +318,19 @@ def forward(self, past_values: torch.Tensor): embeddings: output tensor data [bs x n_vars x num_patches x emb_dim] """ bs, sequence_length, n_vars = past_values.shape - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." + assert ( + sequence_length == self.sequence_length + ), f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." - x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] # convert past_values to shape [bs*n_vars x 1 x sequence_length ] x = x.transpose(1, 2).reshape(bs * n_vars, 1, -1).contiguous() # projection embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] # reshape - embeddings = embeddings.transpose(1, 2).view(bs, n_vars, -1, - self.embed_dim).contiguous() # embeddings: [bs x n_vars x num_patches x emb_dim] + embeddings = ( + embeddings.transpose(1, 2).view(bs, n_vars, -1, self.embed_dim).contiguous() + ) # embeddings: [bs x n_vars x num_patches x emb_dim] # embeddings = embeddings.flatten(2).transpose(1, 2) return embeddings @@ -345,23 +345,25 @@ class PatchMasking(nn.Module): mask_patches (list, optional): List of patch lengths to mask in the end of the data. mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. 
- unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + unmasked_channel_indices (list, optional): + Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): + When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary + across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. """ def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None, ): - # if seed_number: # set_seed(seed_number) self.mask_ratio = mask_ratio @@ -381,11 +383,11 @@ def forward(self, x: torch.Tensor): """ Input: x: patched input - 4D: [bs x n_vars x num_patches x patch_length] + 4D: [bs x n_vars x num_patches x patch_length] Output: x_mask: Masked patched input - 4D: [bs x n_vars x num_patches x patch_length] + 4D: [bs x n_vars x num_patches x patch_length] mask: bool tensor indicating True on masked points 4D: [bs x n_vars x num_patch] """ @@ -397,7 +399,7 @@ def forward(self, x: torch.Tensor): unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, - seed_number=self.seed_number + seed_number=self.seed_number, ) else: @@ -412,17 +414,11 @@ class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList( - [ - ChannelAttentionTSTEncoderLayer(config) - for i in range(config.encoder_layers) - ] - ) + self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x sequence_length x d_model] - Return: + src: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] @@ -476,8 +472,7 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x sequence_length x d_model] - Return: + src: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ bs, n_vars, sequence_length, d_model = src.shape @@ -487,38 +482,46 @@ def forward(self, src: torch.Tensor): if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer1(src)) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1( - src + self.dropout_path1(self.self_attn(src))) # src: [(bs*nvars) x 
sequence_length x d_model] + src + self.dropout_path1(self.self_attn(src)) + ) # src: [(bs*nvars) x sequence_length x d_model] src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] - src = src.transpose(2, 1).contiguous().view(bs * sequence_length, n_vars, - d_model) # [(bs*sequence_length) x nvars x d_model] + src = ( + src.transpose(2, 1).contiguous().view(bs * sequence_length, n_vars, d_model) + ) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer2(src)) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] - src = src.reshape(bs, sequence_length, n_vars, d_model).transpose(1, - 2).contiguous() # src: [bs x nvars x sequence_length x d_model] + src + self.dropout_path2(self.self_attn(src)) + ) # src: [(bs*sequence_length) x nvars x d_model] + src = ( + src.reshape(bs, sequence_length, n_vars, d_model).transpose(1, 2).contiguous() + ) # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden src = src.view(bs * n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( - self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer3(src)) + ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer3( - src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout + src + self.dropout_path3(self.ff(src)) + ) # Add: residual connection with residual dropout src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -546,7 +549,6 @@ def _init_weights(self, module): module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) - def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): module.gradient_checkpointing = value @@ -573,11 +575,13 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, - config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model + ) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, - config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, config.num_patches, config.d_model + ) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if 
config.positional_dropout > 0 else nn.Identity() @@ -588,11 +592,11 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values: torch.Tensor, - output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: + def forward( + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + ) -> BaseModelOutputWithNoAttention: """ - x: tensor [bs x nvars x num_patches x patch_length] - return: + x: tensor [bs x nvars x num_patches x patch_length] return: tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ @@ -623,14 +627,12 @@ def forward(self, past_values: torch.Tensor, # Encoder past_values, hidden_states = self.encoder( - past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] + past_values, output_hidden_states + ) # x: [bs x nvars x num_patches x d_model] # or [bs x nvars x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states - return BaseModelOutputWithNoAttention( - last_hidden_state=past_values, - hidden_states=hidden_states - ) + return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) PATCHTST_START_DOCSTRING = r""" @@ -817,8 +819,8 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of + the model at the output of each layer plus the optional initial embedding outputs. patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) @@ -841,7 +843,8 @@ class RevIN(nn.Module): def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): """ :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x n_vars] - :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm input here. + :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm + input here. 
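# The reversible instance normalization ("RevIN") in this hunk normalizes each series and channel
# with its own mean/stdev before the encoder and maps predictions back afterwards. A minimal sketch
# of the two modes; the eps value and the reduction over the time dimension are assumptions that
# follow the class shown here, not a drop-in replacement for it.
import torch

x = torch.randn(4, 32, 3)                    # [bs x seq_len x n_vars]
mean = x.mean(dim=1, keepdim=True)
stdev = torch.sqrt(x.var(dim=1, keepdim=True, unbiased=False) + 1e-5)

x_norm = (x - mean) / stdev                  # mode="norm": what the encoder sees
y_hat_norm = x_norm[:, -8:, :]               # stand-in for a model output in normalized space
y_hat = y_hat_norm * stdev + mean            # mode="denorm": predictions back on the original scale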
""" super(RevIN, self).__init__() self.stdev = None @@ -856,10 +859,10 @@ def set_statistics(self, mean, stdev): self.stdev = stdev def forward(self, x, mode: str): - if mode == 'norm': + if mode == "norm": self._get_statistics(x) x = self._normalize(x) - elif mode == 'denorm': + elif mode == "denorm": x = self._denormalize(x) elif mode == "transform": x = self._normalize(x) @@ -889,7 +892,6 @@ def _denormalize(self, x): return x -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -912,7 +914,7 @@ def __init__(self, config: PatchTSTConfig): channel_consistent_masking=config.channel_consistent_masking, unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, - seed_number=config.seed_number + seed_number=config.seed_number, ) else: self.masking = nn.Identity() @@ -921,10 +923,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None): + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -932,19 +936,22 @@ def forward(self, past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] patched_values = self.patching( - past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain + past_values + ) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain if self.mask_input: masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) - return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, - hidden_states=encoder_output.hidden_states, - patched_input=patched_values, - mask=mask, - revin_mean=self.revin.mean if self.use_revin else None, - revin_stdev=self.revin.stdev if self.use_revin else None - ) + return PatchTSTModelOutputWithNoAttention( + last_hidden_state=encoder_output.last_hidden_state, + hidden_states=encoder_output.hidden_states, + patched_input=patched_values, + mask=mask, + revin_mean=self.revin.mean if self.use_revin else None, + revin_stdev=self.revin.stdev if self.use_revin else None, + ) + class MaskPretrainHead(nn.Module): def __init__(self, config): @@ -1001,19 +1008,19 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) self.head = MaskPretrainHead(config) - self.loss = torch.nn.MSELoss(reduction='none') + self.loss = torch.nn.MSELoss(reduction="none") # Initialize weights and apply final processing self.post_init() def forward( - self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, ) -> PatchTSTOutput: """ - 
past_values (x): tensor [bs x sequence_length x n_vars ] - future_values (y): labels + past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1031,11 +1038,7 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput( - loss=masked_loss, - prediction_output=x_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1062,9 +1065,7 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, - prediction_logits=y_hat, - hidden_states=model_output.hidden_states + loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states ) @@ -1079,8 +1080,8 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x): """ - x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: [bs x n_classes] + x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: + [bs x n_classes] """ if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model @@ -1148,12 +1149,12 @@ def forward(self, x): batch_size = x.shape[0] if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == 'mean': + elif self.pooling == "mean": x = x.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": x = x.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f'pooling operator {self.pooling} is not implemented yet') + raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input x = self.flatten(x) # x: bs x (nvars * d_model) @@ -1164,7 +1165,6 @@ def forward(self, x): return y - class PatchTSTForPrediction(PatchTSTPreTrainedModel): # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): @@ -1172,16 +1172,17 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None): - + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1191,11 +1192,7 @@ def forward(self, loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput( - loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) class PatchTSTForForecastingOutput(ModelOutput): @@ -1245,8 +1242,7 @@ def __init__(self, config: 
PatchTSTConfig): for i in range(self.n_vars): self.flattens.append(nn.Flatten(start_dim=2)) self.linears.append(nn.Linear(head_dim, config.prediction_length)) - self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - ) + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: self.flatten = nn.Flatten(start_dim=2) self.linear = nn.Linear(head_dim, config.prediction_length) @@ -1262,9 +1258,9 @@ def forward(self, x: torch.Tensor): if self.use_cls_token: y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: - if self.pooling == 'mean': + if self.pooling == "mean": y = x.mean(dim=2) # y: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": y = x.max(dim=2) # y: [bs x nvars x d_model] else: y = x # y: [bs x nvars x num_patches x d_model] @@ -1293,7 +1289,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = ForecastHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") self.use_revin = config.revin if self.use_revin: self.revin = RevIN() @@ -1303,10 +1299,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None): + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1322,9 +1320,7 @@ def forward(self, if future_values is not None: loss_val = self.loss(y_hat, future_values) return PatchTSTForForecastingOutput( - loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states + loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states ) @@ -1350,12 +1346,12 @@ def forward(self, past_values): """ if self.use_cls_token: past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == 'mean': + elif self.pooling == "mean": past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f'pooling operator {self.pooling} is not implemented yet') + raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input past_values = self.flatten(past_values) # x: bs x nvars * d_model y = self.linear(self.dropout(past_values)) # y: bs x output_dim @@ -1372,15 +1368,14 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = RegressionHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - labels: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None): + def forward( + self, past_values: torch.Tensor, labels: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1390,8 +1385,4 @@ def forward(self, loss_val = None if labels is not 
None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput( - loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index ab7b7c18d62f5a..95d56d3caf25c6 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -624,6 +624,12 @@ def __init__(self, *args, **kwargs): MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING = None +MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING = None + + +MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING = None + + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None @@ -5815,6 +5821,58 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class PatchTSTForClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTForForecasting(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTForMaskPretraining(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTForPrediction(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTForRegression(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class PegasusForCausalLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 364306fc8d0256..65bbb309c815a0 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -15,6 +15,7 @@ """ Testing suite for the PyTorch PatchTST model. 
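# The tests in this file call the model classes roughly as in the following standalone sketch.
# It is illustrative only: the argument names follow the configuration and forward signatures in
# this patch, and the chosen values are placeholders rather than recommended settings.
import torch
from transformers import PatchTSTConfig, PatchTSTForPrediction

config = PatchTSTConfig(
    input_size=7,            # number of input channels
    context_length=32,
    patch_length=8,
    stride=8,
    prediction_length=24,
)
model = PatchTSTForPrediction(config)

past_values = torch.randn(4, config.context_length, config.input_size)
future_values = torch.randn(4, config.prediction_length, config.input_size)

outputs = model(past_values=past_values, future_values=future_values)
print(outputs.loss, outputs.prediction_output.shape)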
""" import inspect +import random import tempfile import unittest @@ -22,9 +23,9 @@ from huggingface_hub import hf_hub_download from transformers import is_torch_available -from transformers.testing_utils import is_flaky, require_torch, torch_device, slow from transformers.models.auto import get_values -import random +from transformers.testing_utils import is_flaky, require_torch, slow, torch_device + from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -34,10 +35,18 @@ if is_torch_available(): import torch - from transformers import PatchTSTConfig, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING - from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, \ - PatchTSTForClassification, PatchTSTForRegression + from transformers import ( + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, + PatchTSTConfig, + PatchTSTForClassification, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForPrediction, + PatchTSTForRegression, + PatchTSTModel, + ) @require_torch @@ -111,7 +120,7 @@ def get_config(self): activation_function=self.hidden_act, seed_number=self.seed_number, num_classes=self.num_classes, - target_dimension=self.target_dimension + target_dimension=self.target_dimension, ) def prepare_patchtst_inputs_dict(self, config): @@ -142,16 +151,20 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, - PatchTSTForPretraining, - PatchTSTForClassification, - PatchTSTForRegression) + ( + PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForClassification, + PatchTSTForRegression, + ) if is_torch_available() else () ) - all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () + all_generative_model_classes = ( + (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () + ) pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} is_encoder_decoder = False test_pruning = False @@ -161,7 +174,6 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_inputs_embeds = False test_model_common_attributes = False - test_resize_embeddings = True test_resize_position_embeddings = False test_mismatched_shapes = True @@ -206,7 +218,7 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) -# + # def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -233,7 +245,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True - print('model_class: ', model_class) + print("model_class: ", model_class) check_hidden_states_output(inputs_dict, config, model_class) @@ -242,8 +254,9 @@ def check_hidden_states_output(inputs_dict, config, model_class): config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, 
model_class) -# -# # Ignore since we have no tokens embeddings + + # + # # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass @@ -273,8 +286,9 @@ def test_forward_signature(self): "past_values", "future_values", ] - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or \ - model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING + ): expected_arg_names.remove("future_values") expected_arg_names.append("labels") expected_arg_names.extend( @@ -290,7 +304,7 @@ def test_retain_grad_hidden_states_attentions(self): super().test_retain_grad_hidden_states_attentions() -def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): +def prepare_batch(repo_id="diepi/test-etth1", file="train-batch.pt"): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch @@ -300,22 +314,21 @@ def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): @slow class PatchTSTModelIntegrationTests(unittest.TestCase): def test_pretrain_head(self): - model = PatchTSTForPretraining.from_pretrained('diepi/test_patchtst_pretrained_etth1').to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained("diepi/test_patchtst_pretrained_etth1").to(torch_device) batch = prepare_batch() torch.manual_seed(0) with torch.no_grad(): - output = model( - past_values=batch["past_values"].to(torch_device) - ).prediction_output - num_patch = (max(model.config.context_length, - model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 + output = model(past_values=batch["past_values"].to(torch_device)).prediction_output + num_patch = ( + max(model.config.context_length, model.config.patch_length) - model.config.patch_length + ) // model.config.stride + 1 expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[[0.0160]], [[0.0148]], [[0.0090]], [[0.0166]], [[0.0099]], - [[0.0053]], [[0.0090]]], - device=torch_device) + expected_slice = torch.tensor( + [[[0.0160]], [[0.0148]], [[0.0090]], [[0.0166]], [[0.0099]], [[0.0053]], [[0.0090]]], device=torch_device + ) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # def test_classification_head(self): @@ -337,21 +350,22 @@ def test_pretrain_head(self): # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) def test_forecasting_head(self): - model = PatchTSTForForecasting.from_pretrained('diepi/test_patchtst_forecasting_etth1').to(torch_device) + model = PatchTSTForForecasting.from_pretrained("./hf_etth_forecasting").to(torch_device) batch = prepare_batch(file="test-batch.pt") torch.manual_seed(0) with torch.no_grad(): output = model( past_values=batch["past_values"].to(torch_device), - future_values=batch["future_values"].to(torch_device) + future_values=batch["future_values"].to(torch_device), ).forecast_outputs expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[-0.9027, 0.3814, -0.8322, 0.4250, -0.7183, -0.0635, -0.8747]], - device=torch_device, - ) + expected_slice = torch.tensor( + [[-0.9027, 0.3814, -0.8322, 0.4250, -0.7183, -0.0635, -0.8747]], + 
device=torch_device, + ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) # def test_seq_to_seq_generation(self): @@ -378,4 +392,3 @@ def test_forecasting_head(self): # device=torch_device, # ) # self.assertTrue(torch.allclose(outputs[0, :1, :7], expected_slice, atol=TOLERANCE)) - diff --git a/utils/check_repo.py b/utils/check_repo.py index c46b82b7c67ecb..d28679c78ef58a 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -176,6 +176,9 @@ "TimeSeriesTransformerForPrediction", "InformerForPrediction", "AutoformerForPrediction", + "PatchTSTForForecasting", + "PatchTSTForMaskPretraining", + "PatchTSTForPrediction", "JukeboxVQVAE", "JukeboxPrior", "SamModel", From a69cb59be055650a513d84b72858e078bf545d7a Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Fri, 1 Sep 2023 14:32:21 -0400 Subject: [PATCH 033/189] docstring updates --- .../models/patchtst/configuration_patchtst.py | 11 ++-- .../models/patchtst/modeling_patchtst.py | 55 +++++++++++++++---- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index fbdca6db377155..e18a98d51d1fe5 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -45,9 +45,6 @@ class PatchTSTConfig(PretrainedConfig): input_size (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. - scaling (`string` or `bool`, *optional* defaults to `"mean"`): - Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the - scaler is set to "mean". num_time_features (`int`, *optional*, defaults to 0): The number of time features in the input time series. num_dynamic_real_features (`int`, *optional*, defaults to 0): @@ -155,11 +152,11 @@ def __init__( mask_input: Optional[bool] = None, mask_type: str = "random", mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], + mask_patches: List[int] = [2, 3], + mask_patch_ratios: List[int] = [1, 1], channel_consistent_masking: bool = False, d_size: str = "4D", - unmasked_channel_indices: list = None, + unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, pooling: str = "mean", num_classes: int = 1, @@ -173,7 +170,7 @@ def __init__( is_encoder_decoder: bool = False, encoder_layerdrop: float = 0.1, prediction_length: int = 24, - prediction_range: List = [0, 1], + prediction_range: List[int] = [0, 1], target_dimension: int = 1, # PatchTST arguments attention_type: str = "prob", diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7b485769d7f0c4..285fa961b532c9 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -175,7 +175,6 @@ def random_masking( """random_masking: Mask the input considering the control variables. Args: - seed_number (int, optional): Value to set for the seed number xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. unmasked_channel_indices (list, optional): @@ -184,6 +183,7 @@ def random_masking( When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. 
mask_value (int, optional): Value to use for masking. Defaults to 0. + seed_number (int, optional): Value to set for the random seed. Returns: Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] @@ -351,6 +351,7 @@ class PatchMasking(nn.Module): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. + seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. """ def __init__( @@ -576,11 +577,17 @@ def __init__(self, config: PatchTSTConfig): if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) self.w_pos = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model + config.positional_encoding, + config.learn_pe, + config.num_patches + 1, + config.d_model, ) else: self.w_pos = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches, config.d_model + config.positional_encoding, + config.learn_pe, + config.num_patches, + config.d_model, ) # Positional dropout @@ -596,7 +603,8 @@ def forward( self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ - x: tensor [bs x nvars x num_patches x patch_length] return: + past_values: tensor [bs x nvars x num_patches x patch_length] output_hidden_states (bool, optional): Boolean + indicating if hidden states should be outtput return: tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ @@ -807,7 +815,7 @@ def forward( @add_start_docstrings( - "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", + "The bare PatchTST Model outputting raw hidden-states without any specific head.", PATCHTST_START_DOCSTRING, ) class PatchTSTModelOutputWithNoAttention(ModelOutput): @@ -902,7 +910,11 @@ def __init__(self, config: PatchTSTConfig): else: self.revin = nn.Identity() - self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) + self.patching = Patchify( + config.context_length, + patch_length=config.patch_length, + stride=config.stride, + ) self.mask_input = config.mask_input if self.mask_input: @@ -1038,7 +1050,11 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput( + loss=masked_loss, + prediction_output=x_hat, + hidden_states=model_output.hidden_states, + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1065,7 +1081,9 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states + loss=loss_val, + prediction_logits=y_hat, + hidden_states=model_output.hidden_states, ) @@ -1192,7 +1210,11 @@ def forward( loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput( + loss=loss_val, + prediction_output=y_hat, + 
hidden_states=model_output.hidden_states, + ) class PatchTSTForForecastingOutput(ModelOutput): @@ -1320,7 +1342,9 @@ def forward( if future_values is not None: loss_val = self.loss(y_hat, future_values) return PatchTSTForForecastingOutput( - loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states + loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states, ) @@ -1374,7 +1398,10 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, labels: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + labels: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None, ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1385,4 +1412,8 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput( + loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + ) From 2be37c5823df50424ea65cfcba79e168f511b382 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 2 Sep 2023 01:17:20 +0700 Subject: [PATCH 034/189] change input_size to num_input_channels --- .../models/patchtst/configuration_patchtst.py | 18 +- .../models/patchtst/modeling_patchtst.py | 435 +++++++++--------- .../models/patchtst/test_modeling_patchtst.py | 115 ++--- 3 files changed, 270 insertions(+), 298 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e18a98d51d1fe5..df870b35f2c05d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -14,7 +14,7 @@ # limitations under the License. """PatchTST model configuration""" -from typing import List, Optional +from typing import List, Optional, Union from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -42,7 +42,7 @@ class PatchTSTConfig(PretrainedConfig): context_length (`int`, *optional*, defaults to `prediction_length`): The context length for the encoder. If `None`, the context length will be the same as the `prediction_length`. - input_size (`int`, *optional*, defaults to 1): + num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. 
num_time_features (`int`, *optional*, defaults to 0): @@ -122,7 +122,7 @@ class PatchTSTConfig(PretrainedConfig): def __init__( self, - input_size: int = 1, + num_input_channels: int = 1, context_length: int = 32, patch_length: int = 8, stride: int = 8, @@ -158,7 +158,7 @@ def __init__( d_size: str = "4D", unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, - pooling: str = "mean", + pooling: str = 'mean', num_classes: int = 1, head_dropout: float = 0.0, # proj_dropout: float = 0.0, @@ -170,17 +170,18 @@ def __init__( is_encoder_decoder: bool = False, encoder_layerdrop: float = 0.1, prediction_length: int = 24, - prediction_range: List[int] = [0, 1], - target_dimension: int = 1, + prediction_range: List = [0, 1], + num_output_channels: int = 1, # PatchTST arguments attention_type: str = "prob", sampling_factor: int = 5, distil: bool = True, **kwargs, ): + # time series specific configuration self.context_length = context_length - self.input_size = input_size # n_vars + self.num_input_channels = num_input_channels # n_vars self.num_time_features = num_time_features self.num_dynamic_real_features = num_dynamic_real_features self.num_static_real_features = num_static_real_features @@ -244,10 +245,11 @@ def __init__( self.prediction_length = prediction_length # Regression - self.target_dimension = target_dimension + self.num_output_channels = num_output_channels self.prediction_range = prediction_range super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) def _num_patches(self): return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 285fa961b532c9..6abbfd08839bb2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 TSFM team. All rights reserved. +# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,20 +14,19 @@ # limitations under the License. 
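# Illustrative note (added for clarity; not part of the original patch): how the renamed
# configuration arguments interact with the `_num_patches` helper defined in the
# configuration file above. The concrete values are assumptions for the example, and the
# snippet presumes the PatchTSTConfig class introduced in this series is importable.
from transformers import PatchTSTConfig

config = PatchTSTConfig(num_input_channels=7, context_length=32, patch_length=8, stride=8)
# (max(context_length, patch_length) - patch_length) // stride + 1  ->  (32 - 8) // 8 + 1 == 4
assert config._num_patches() == 4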
""" PyTorch PatchTST model.""" -import math -import random from typing import Optional, Tuple - -import numpy as np import torch from torch import nn -from torch.nn.modules.activation import MultiheadAttention +import math +import random +import numpy as np -from transformers.modeling_outputs import BaseModelOutputWithNoAttention from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, logging +from transformers.modeling_outputs import BaseModelOutputWithNoAttention +from transformers.utils import ModelOutput +from torch.nn.modules.activation import MultiheadAttention from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig -from transformers.utils import ModelOutput, add_start_docstrings, logging - logger = logging.get_logger(__name__) @@ -39,6 +38,7 @@ ] + class PatchTSTAttention(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -85,7 +85,7 @@ def forward(self, x): def positional_encoding(pe, learn_pe, q_len, d_model): # Positional encoding - if pe is None: + if pe == None: w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False @@ -131,8 +131,9 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= i = 0 for i in range(100): cpe = ( - 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - - 1 + 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * ( + torch.linspace(0, 1, d_model).reshape(1, -1) ** x) + - 1 ) if abs(cpe.mean()) <= eps: @@ -160,33 +161,30 @@ def set_seed(x=42): random.seed(x) np.random.seed(x) torch.manual_seed(x) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(x) + if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) def random_masking( - xb: torch.Tensor, - mask_ratio: float, - unmasked_channel_indices: list = None, - channel_consistent_masking: bool = False, - mask_value=0, - seed_number: Optional[int] = None, + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = False, + mask_value=0, + seed_number: Optional[int] = None ): """random_masking: Mask the input considering the control variables. Args: xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. - unmasked_channel_indices (list, optional): - indices of unmasked channels. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): - When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to True. + unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. seed_number (int, optional): Value to set for the random seed. 
Returns: - Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] + Tensor: xb_mask, masked input, same shape as input + Tensor: Mask tensor of shape [bs x c x n] """ if seed_number: set_seed(seed_number) @@ -223,25 +221,26 @@ def compute_num_patches(sequence_length, patch_length, stride): class Patchify(nn.Module): """ - Args: A class to patchify the time series sequence into different patches - sequence_length (int, required): input sequence length patch_length (int, required): patch length stride (int, - required): stride between patches + Args: + sequence_length (int, required): input sequence length + patch_length (int, required): patch length + stride (int, required): stride between patches Returns: - z: output tensor data [bs x n_vars x num_patches x patch_length] + z: output tensor data [bs x num_input_channels x num_patches x patch_length] """ def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + self, + sequence_length: int, + patch_length: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" self.sequence_length = sequence_length @@ -256,38 +255,43 @@ def __init__( def forward(self, past_values: torch.Tensor): """ Args: - past_values (torch.Tensor, required): Input of shape [bs x sequence_length x n_vars] + past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] Returns: - x: output tensor data [bs x n_vars x num_patches x patch_length] + x: output tensor data [bs x num_input_channels x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert ( - sequence_length == self.sequence_length - ), f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." 
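# Illustrative sketch (added for clarity; not part of the original patch): the unfold call
# below is what turns a [bs x sequence_length x num_input_channels] series into patches of
# shape [bs x num_input_channels x num_patches x patch_length]. The toy sizes are assumptions
# chosen only to make the shapes concrete.
import torch

bs, sequence_length, num_input_channels = 2, 32, 7
patch_length, stride = 8, 8
# same formula as compute_num_patches: (max(seq_len, patch_len) - patch_len) // stride + 1
num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1  # -> 4

series = torch.randn(bs, sequence_length, num_input_channels)
patches = series.unfold(dimension=-2, size=patch_length, step=stride)  # [bs x num_patches x num_input_channels x patch_length]
patches = patches.transpose(-2, -3).contiguous()                       # [bs x num_input_channels x num_patches x patch_length]
assert patches.shape == (bs, num_input_channels, num_patches, patch_length)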
- x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( dimension=-2, size=self.patch_length, step=self.stride - ) # x: [bs x num_patches x n_vars x patch_length] - x = x.transpose(-2, -3).contiguous() # xb: [bs x n_vars x num_patches x patch_length] + ) # x: [bs x num_patches x num_input_channels x patch_length] + x = x.transpose(-2, -3).contiguous() # xb: [bs x num_input_channels x num_patches x patch_length] return x class PatchEmbeddings(nn.Module): """ - Args: A class to patchify the time series sequence into different patches - sequence_length (int, required): input sequence length patch_length (int, required): patch length stride (int, - required): stride between patches + Args: + sequence_length (int, required): input sequence length + patch_length (int, required): patch length + stride (int, required): stride between patches Returns: - embeddings: output tensor data [bs x n_vars x num_patches x embed_dim] + embeddings: output tensor data [bs x num_input_channels x num_patches x embed_dim] """ - def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_dim: int): + def __init__( + self, + sequence_length: int, + patch_length: int, + stride: int, + embed_dim: int + ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" @@ -303,34 +307,30 @@ def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_d self.s_begin = sequence_length - new_sequence_length # Embedding - self.projection = nn.Conv1d( - in_channels=1, - out_channels=embed_dim, - kernel_size=patch_length, - stride=stride, - ) + self.projection = nn.Conv1d(in_channels=1, + out_channels=embed_dim, + kernel_size=patch_length, + stride=stride, + ) def forward(self, past_values: torch.Tensor): """ Args: - past_values (torch.Tensor, required): Input of shape [bs x sequence_length x n_vars] + past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] Returns: - embeddings: output tensor data [bs x n_vars x num_patches x emb_dim] + embeddings: output tensor data [bs x num_input_channels x num_patches x emb_dim] """ - bs, sequence_length, n_vars = past_values.shape - assert ( - sequence_length == self.sequence_length - ), f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." + bs, sequence_length, num_input_channels = past_values.shape + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." 
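# Illustrative sketch (added for clarity; not part of the original patch): the Conv1d
# projection used by PatchEmbeddings embeds each channel independently and yields
# [bs x num_input_channels x num_patches x embed_dim]. The toy sizes below are assumptions.
import torch
from torch import nn

bs, sequence_length, num_input_channels = 2, 32, 7
patch_length, stride, embed_dim = 8, 8, 16
projection = nn.Conv1d(in_channels=1, out_channels=embed_dim, kernel_size=patch_length, stride=stride)

series = torch.randn(bs, sequence_length, num_input_channels)
x = series.transpose(1, 2).reshape(bs * num_input_channels, 1, sequence_length)  # one channel per row
emb = projection(x)                                                  # [bs*num_input_channels x embed_dim x num_patches]
emb = emb.transpose(1, 2).reshape(bs, num_input_channels, -1, embed_dim)
assert emb.shape == (bs, num_input_channels, 4, embed_dim)           # num_patches = (32 - 8) // 8 + 1 = 4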
- x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] - # convert past_values to shape [bs*n_vars x 1 x sequence_length ] - x = x.transpose(1, 2).reshape(bs * n_vars, 1, -1).contiguous() + x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + # convert past_values to shape [bs*num_input_channels x 1 x sequence_length ] + x = x.transpose(1, 2).reshape(bs * num_input_channels, 1, -1).contiguous() # projection - embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] + embeddings = self.projection(x) # embeddings: [bs*num_input_channels x emb_dim x num_patches] # reshape - embeddings = ( - embeddings.transpose(1, 2).view(bs, n_vars, -1, self.embed_dim).contiguous() - ) # embeddings: [bs x n_vars x num_patches x emb_dim] + embeddings = embeddings.transpose(1, 2).view(bs, num_input_channels, -1, + self.embed_dim).contiguous() # embeddings: [bs x num_input_channels x num_patches x emb_dim] # embeddings = embeddings.flatten(2).transpose(1, 2) return embeddings @@ -345,26 +345,24 @@ class PatchMasking(nn.Module): mask_patches (list, optional): List of patch lengths to mask in the end of the data. mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, optional): - Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): - When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to True. + unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. 
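    Illustrative example (added for clarity; not part of the original patch). It only shows the
    effect of a patch-level boolean mask with the default mask_value of 0; it does not reproduce
    the exact sampling performed by random_masking, and the toy shapes and the 0.4 ratio are
    assumptions:

        import torch

        patches = torch.randn(2, 7, 4, 8)                    # [bs x num_input_channels x num_patches x patch_length]
        score = torch.rand(2, 7, 4)
        mask = score < 0.4                                    # bool [bs x num_input_channels x num_patches]
        masked = patches.masked_fill(mask.unsqueeze(-1), 0)   # masked patches are filled with mask_value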
""" def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None, + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None ): + # if seed_number: # set_seed(seed_number) self.mask_ratio = mask_ratio @@ -384,13 +382,13 @@ def forward(self, x: torch.Tensor): """ Input: x: patched input - 4D: [bs x n_vars x num_patches x patch_length] + 4D: [bs x num_input_channels x num_patches x patch_length] Output: x_mask: Masked patched input - 4D: [bs x n_vars x num_patches x patch_length] + 4D: [bs x num_input_channels x num_patches x patch_length] mask: bool tensor indicating True on masked points - 4D: [bs x n_vars x num_patch] + 4D: [bs x num_input_channels x num_patch] """ if self.mask_type == "random": @@ -400,13 +398,13 @@ def forward(self, x: torch.Tensor): unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, - seed_number=self.seed_number, + seed_number=self.seed_number ) else: raise Exception("Invalid mask type") - mask = mask.bool() # mask: [bs x n_vars x num_patch] + mask = mask.bool() # mask: [bs x num_input_channels x num_patch] return x_mask, mask @@ -415,11 +413,17 @@ class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) + self.layers = nn.ModuleList( + [ + ChannelAttentionTSTEncoderLayer(config) + for i in range(config.encoder_layers) + ] + ) def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x sequence_length x d_model] Return: + src: tensor [bs x nvars x sequence_length x d_model] + Return: Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] @@ -473,57 +477,50 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x sequence_length x d_model] Return: + src: tensor [bs x nvars x sequence_length x d_model] + Return: Tensor [bs x nvars x sequence_length x d_model] """ - bs, n_vars, sequence_length, d_model = src.shape + bs, num_input_channels, sequence_length, d_model = src.shape # First sublayer: attention across time - src = src.view(bs * n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src)) - ) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1( - src + self.dropout_path1(self.self_attn(src)) - ) # src: [(bs*nvars) x sequence_length x d_model] - src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src + 
self.dropout_path1(self.self_attn(src))) # src: [(bs*nvars) x sequence_length x d_model] + src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] - src = ( - src.transpose(2, 1).contiguous().view(bs * sequence_length, n_vars, d_model) - ) # [(bs*sequence_length) x nvars x d_model] + src = src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, + d_model) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src)) - ) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src)) - ) # src: [(bs*sequence_length) x nvars x d_model] - src = ( - src.reshape(bs, sequence_length, n_vars, d_model).transpose(1, 2).contiguous() - ) # src: [bs x nvars x sequence_length x d_model] + src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] + src = src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, + 2).contiguous() # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view(bs * n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( - self.ff(self.norm_sublayer3(src)) - ) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer3( - src + self.dropout_path3(self.ff(src)) - ) # Add: residual connection with residual dropout - src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout + src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -550,6 +547,7 @@ def _init_weights(self, module): module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) + def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): module.gradient_checkpointing = value @@ -558,7 +556,7 @@ def _set_gradient_checkpointing(self, module, value=False): class ChannelAttentionPatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.n_vars = config.input_size + self.num_input_channels = config.num_input_channels self.num_patches = config.num_patches self.patch_length = config.patch_length self.d_model = config.d_model @@ -569,26 +567,18 @@ def __init__(self, config: PatchTSTConfig): # Input encoding: projection of feature vectors onto a 
d-dim vector space if not config.shared_embedding: self.w_p = nn.ModuleList() - for _ in range(self.n_vars): + for _ in range(self.num_input_channels): self.w_p.append(nn.Linear(config.patch_length, config.d_model)) else: self.w_p = nn.Linear(config.patch_length, config.d_model) # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding( - config.positional_encoding, - config.learn_pe, - config.num_patches + 1, - config.d_model, - ) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, + config.d_model) else: - self.w_pos = positional_encoding( - config.positional_encoding, - config.learn_pe, - config.num_patches, - config.d_model, - ) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, + config.d_model) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -599,17 +589,16 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None - ) -> BaseModelOutputWithNoAttention: + def forward(self, past_values: torch.Tensor, + output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: """ - past_values: tensor [bs x nvars x num_patches x patch_length] output_hidden_states (bool, optional): Boolean - indicating if hidden states should be outtput return: + x: tensor [bs x nvars x num_patches x patch_length] + return: tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ - # bs, num_patches, n_vars, patch_length = x.shape - bs, n_vars, num_patches, patch_length = past_values.shape + # bs, num_patches, num_input_channels, patch_length = x.shape + bs, num_input_channels, num_patches, patch_length = past_values.shape output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -617,7 +606,7 @@ def forward( # Input encoding if not self.shared_embedding: x_out = [] - for i in range(n_vars): + for i in range(num_input_channels): z = self.w_p[i](past_values[:, i, :, :]) x_out.append(z) past_values = torch.stack(x_out, dim=1) @@ -635,12 +624,14 @@ def forward( # Encoder past_values, hidden_states = self.encoder( - past_values, output_hidden_states - ) # x: [bs x nvars x num_patches x d_model] + past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] # or [bs x nvars x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states - return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) + return BaseModelOutputWithNoAttention( + last_hidden_state=past_values, + hidden_states=hidden_states + ) PATCHTST_START_DOCSTRING = r""" @@ -661,7 +652,7 @@ def forward( PATCHTST_INPUTS_DOCSTRING = r""" Args: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`): + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`): Past values of the time series, that serve as context in order to predict the future. The sequence size of this tensor must be larger than the `context_length` of the model, since the model will use the larger size to construct lag features, i.e. 
additional values from the past which are added in order to serve as "extra @@ -677,7 +668,7 @@ def forward( Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`. - For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of + For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of variates in the time series per time step. past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`): Required time features, which the model internally will add to `past_values`. These could be things like @@ -695,7 +686,7 @@ def forward( must but known at prediction time. The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. - past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*): + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`, *optional*): Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in `[0, 1]`: @@ -715,7 +706,7 @@ def forward( Static real features are features which have the same value for all time steps (static over time). A typical example of a static real feature is promotion information. - future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*): + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, num_input_channels)`, *optional*): Future values of the time series, that serve as labels for the model. The `future_values` is what the Transformer needs during training to learn to output, given the `past_values`. @@ -726,7 +717,7 @@ def forward( Optionally, during training any missing values need to be replaced with zeros and indicated via the `future_observed_mask`. - For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of + For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of variates in the time series per time step. future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`): Required time features for the prediction window, which the model internally will add to `future_values`. @@ -745,7 +736,7 @@ def forward( must but known at prediction time. The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. - future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*): + future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`, *optional*): Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected in `[0, 1]`: @@ -827,8 +818,8 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): Sequence of hidden-states at the output of the last layer of the model. 
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of - the model at the output of each layer plus the optional initial embedding outputs. + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) @@ -850,9 +841,8 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): class RevIN(nn.Module): def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): """ - :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x n_vars] - :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm - input here. + :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x num_input_channels] + :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm input here. """ super(RevIN, self).__init__() self.stdev = None @@ -867,10 +857,10 @@ def set_statistics(self, mean, stdev): self.stdev = stdev def forward(self, x, mode: str): - if mode == "norm": + if mode == 'norm': self._get_statistics(x) x = self._normalize(x) - elif mode == "denorm": + elif mode == 'denorm': x = self._denormalize(x) elif mode == "transform": x = self._normalize(x) @@ -900,6 +890,7 @@ def _denormalize(self, x): return x +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -926,7 +917,7 @@ def __init__(self, config: PatchTSTConfig): channel_consistent_masking=config.channel_consistent_masking, unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, - seed_number=config.seed_number, + seed_number=config.seed_number ) else: self.masking = nn.Identity() @@ -935,12 +926,10 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward( - self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - ): + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -948,22 +937,19 @@ def forward( past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] patched_values = self.patching( - past_values - ) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain + past_values) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain if self.mask_input: 
masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) - return PatchTSTModelOutputWithNoAttention( - last_hidden_state=encoder_output.last_hidden_state, - hidden_states=encoder_output.hidden_states, - patched_input=patched_values, - mask=mask, - revin_mean=self.revin.mean if self.use_revin else None, - revin_stdev=self.revin.stdev if self.use_revin else None, - ) - + return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, + hidden_states=encoder_output.hidden_states, + patched_input=patched_values, + mask=mask, + revin_mean=self.revin.mean if self.use_revin else None, + revin_stdev=self.revin.stdev if self.use_revin else None + ) class MaskPretrainHead(nn.Module): def __init__(self, config): @@ -1020,19 +1006,19 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) self.head = MaskPretrainHead(config) - self.loss = torch.nn.MSELoss(reduction="none") + self.loss = torch.nn.MSELoss(reduction='none') # Initialize weights and apply final processing self.post_init() def forward( - self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + self, past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None ) -> PatchTSTOutput: """ - past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels + past_values (x): tensor [bs x sequence_length x num_input_channels ] + future_values (y): labels """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1053,7 +1039,7 @@ def forward( return PatchTSTOutput( loss=masked_loss, prediction_output=x_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) @@ -1083,7 +1069,7 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] return PatchTSTForClassificationOutput( loss=loss_val, prediction_logits=y_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) @@ -1094,12 +1080,12 @@ def __init__(self, config: PatchTSTConfig): self.pooling = config.pooling self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - self.linear = nn.Linear(config.input_size * config.d_model, config.num_classes) + self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_classes) def forward(self, x): """ - x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: - [bs x n_classes] + x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token + output: [bs x n_classes] """ if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model @@ -1148,41 +1134,42 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.target_dimension = config.target_dimension + self.num_output_channels = config.num_output_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling - head_dim = config.input_size * config.d_model + head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) - self.linear = nn.Linear(head_dim, 
config.prediction_length * config.target_dimension) + self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - output: [bs x pred_len x target_dimension] + output: [bs x pred_len x num_output_channels] """ batch_size = x.shape[0] if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == "mean": + elif self.pooling == 'mean': x = x.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == "max": + elif self.pooling == 'max': x = x.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") + raise Exception(f'pooling operator {self.pooling} is not implemented yet') # flatten the input x = self.flatten(x) # x: bs x (nvars * d_model) - y = self.linear(self.dropout(x)) # y: bs x (pred_len * target_dimension) + y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) # reshape the data - y = y.reshape(batch_size, -1, self.target_dimension) # [bs x pred_len x target_dimension] + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y + class PatchTSTForPrediction(PatchTSTPreTrainedModel): # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): @@ -1190,17 +1177,16 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) - self.loss = nn.MSELoss(reduction="mean") + self.loss = nn.MSELoss(reduction='mean') # Initialize weights and apply final processing self.post_init() - def forward( - self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - ): + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1213,7 +1199,7 @@ def forward( return PatchTSTOutput( loss=loss_val, prediction_output=y_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) @@ -1252,7 +1238,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__() self.individual = config.individual - self.n_vars = config.input_size + self.num_input_channels = config.num_input_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling head_dim = config.d_model if self.pooling else config.d_model * config.num_patches @@ -1261,10 +1247,11 @@ def __init__(self, config: PatchTSTConfig): self.linears = nn.ModuleList() self.dropouts = nn.ModuleList() self.flattens = nn.ModuleList() - for i in range(self.n_vars): + for i in range(self.num_input_channels): self.flattens.append(nn.Flatten(start_dim=2)) self.linears.append(nn.Linear(head_dim, config.prediction_length)) - self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + ) else: self.flatten = nn.Flatten(start_dim=2) self.linear = nn.Linear(head_dim, config.prediction_length) @@ -1280,16 +1267,16 @@ def forward(self, x: torch.Tensor): if self.use_cls_token: y = x[:, :, 0, :] # y: 
[bs x nvars x d_model] else: - if self.pooling == "mean": + if self.pooling == 'mean': y = x.mean(dim=2) # y: [bs x nvars x d_model] - elif self.pooling == "max": + elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] - for i in range(self.n_vars): + for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.linears[i](z) # z: [bs x forecast_len] z = self.dropouts[i](z) @@ -1311,7 +1298,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = ForecastHead(config) - self.loss = nn.MSELoss(reduction="mean") + self.loss = nn.MSELoss(reduction='mean') self.use_revin = config.revin if self.use_revin: self.revin = RevIN() @@ -1321,12 +1308,10 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward( - self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None, - ): + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1344,7 +1329,7 @@ def forward( return PatchTSTForForecastingOutput( loss=loss_val, forecast_outputs=y_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) @@ -1358,9 +1343,9 @@ def __init__(self, config: PatchTSTConfig): self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - input_dim = config.input_size * config.d_model - # if is_flatten: input_dim *= num_patch - self.linear = nn.Linear(input_dim, config.target_dimension) + head_dim = config.num_input_channels * config.d_model + # if is_flatten: head_dim *= num_patch + self.linear = nn.Linear(head_dim, config.num_output_channels) def forward(self, past_values): """ @@ -1370,12 +1355,12 @@ def forward(self, past_values): """ if self.use_cls_token: past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == "mean": + elif self.pooling == 'mean': past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == "max": + elif self.pooling == 'max': past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") + raise Exception(f'pooling operator {self.pooling} is not implemented yet') # flatten the input past_values = self.flatten(past_values) # x: bs x nvars * d_model y = self.linear(self.dropout(past_values)) # y: bs x output_dim @@ -1392,17 +1377,15 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = RegressionHead(config) - self.loss = nn.MSELoss(reduction="mean") + self.loss = nn.MSELoss(reduction='mean') # Initialize weights and apply final processing self.post_init() - def forward( - self, - past_values: torch.Tensor, - labels: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None, - ): + def forward(self, + past_values: torch.Tensor, + labels: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is 
not None else self.config.output_hidden_states ) @@ -1415,5 +1398,5 @@ def forward( return PatchTSTOutput( loss=loss_val, prediction_output=y_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 65bbb309c815a0..911d5160db46c7 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -15,7 +15,6 @@ """ Testing suite for the PyTorch PatchTST model. """ import inspect -import random import tempfile import unittest @@ -23,9 +22,9 @@ from huggingface_hub import hf_hub_download from transformers import is_torch_available +from transformers.testing_utils import is_flaky, require_torch, torch_device, slow from transformers.models.auto import get_values -from transformers.testing_utils import is_flaky, require_torch, slow, torch_device - +import random from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -35,18 +34,10 @@ if is_torch_available(): import torch + from transformers import PatchTSTConfig, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING + from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining, \ + PatchTSTForClassification, PatchTSTForRegression - from transformers import ( - MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, - PatchTSTConfig, - PatchTSTForClassification, - PatchTSTForForecasting, - PatchTSTForMaskPretraining, - PatchTSTForPrediction, - PatchTSTForRegression, - PatchTSTModel, - ) @require_torch @@ -59,7 +50,7 @@ def __init__( context_length=14, patch_length=5, stride=5, - input_size=1, + num_input_channels=1, num_time_features=1, is_training=True, hidden_size=16, @@ -74,7 +65,7 @@ def __init__( distil=False, seed_number=42, num_classes=2, - target_dimension=2, + num_output_channels=2, ): self.parent = parent self.batch_size = batch_size @@ -82,7 +73,7 @@ def __init__( self.context_length = context_length self.patch_length = patch_length self.stride = stride - self.input_size = input_size + self.num_input_channels = num_input_channels self.num_time_features = num_time_features self.lags_sequence = lags_sequence self.is_training = is_training @@ -99,7 +90,7 @@ def __init__( ) self.seed_number = seed_number self.num_classes = num_classes - self.target_dimension = target_dimension + self.num_output_channels = num_output_channels self.sampling_factor = sampling_factor self.distil = distil self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 @@ -109,7 +100,7 @@ def get_config(self): prediction_length=self.prediction_length, patch_length=self.patch_length, stride=self.stride, - input_size=self.input_size, + num_input_channels=self.num_input_channels, d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, encoder_attention_heads=self.num_attention_heads, @@ -120,17 +111,17 @@ def get_config(self): activation_function=self.hidden_act, seed_number=self.seed_number, num_classes=self.num_classes, - target_dimension=self.target_dimension, + num_output_channels=self.num_output_channels ) def prepare_patchtst_inputs_dict(self, config): _past_length = config.context_length - # bs, n_vars, num_patch, patch_len + # bs, num_input_channels, 
num_patch, patch_len - # [bs x seq_len x n_vars] - past_values = floats_tensor([self.batch_size, _past_length, self.input_size]) + # [bs x seq_len x num_input_channels] + past_values = floats_tensor([self.batch_size, _past_length, self.num_input_channels]) - future_values = floats_tensor([self.batch_size, config.prediction_length, self.input_size]) + future_values = floats_tensor([self.batch_size, config.prediction_length, self.num_input_channels]) inputs_dict = { "past_values": past_values, @@ -151,20 +142,16 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - ( - PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, - PatchTSTForMaskPretraining, - PatchTSTForClassification, - PatchTSTForRegression, - ) + (PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForClassification, + PatchTSTForRegression) if is_torch_available() else () ) - all_generative_model_classes = ( - (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () - ) + all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} is_encoder_decoder = False test_pruning = False @@ -174,6 +161,7 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_inputs_embeds = False test_model_common_attributes = False + test_resize_embeddings = True test_resize_position_embeddings = False test_mismatched_shapes = True @@ -203,7 +191,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict.pop("future_values") elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): rng = random.Random(self.model_tester.seed_number) - labels = floats_tensor([self.model_tester.batch_size, self.model_tester.target_dimension], rng=rng) + labels = floats_tensor([self.model_tester.batch_size, self.model_tester.num_output_channels], rng=rng) inputs_dict["labels"] = labels inputs_dict.pop("future_values") return inputs_dict @@ -218,7 +206,7 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - # +# def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -245,7 +233,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True - print("model_class: ", model_class) + print('model_class: ', model_class) check_hidden_states_output(inputs_dict, config, model_class) @@ -254,9 +242,8 @@ def check_hidden_states_output(inputs_dict, config, model_class): config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) - - # - # # Ignore since we have no tokens embeddings +# +# # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass @@ -286,9 +273,8 @@ def test_forward_signature(self): "past_values", "future_values", ] - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING - ): + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) 
or \ + model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): expected_arg_names.remove("future_values") expected_arg_names.append("labels") expected_arg_names.extend( @@ -304,7 +290,7 @@ def test_retain_grad_hidden_states_attentions(self): super().test_retain_grad_hidden_states_attentions() -def prepare_batch(repo_id="diepi/test-etth1", file="train-batch.pt"): +def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch @@ -314,21 +300,22 @@ def prepare_batch(repo_id="diepi/test-etth1", file="train-batch.pt"): @slow class PatchTSTModelIntegrationTests(unittest.TestCase): def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained("diepi/test_patchtst_pretrained_etth1").to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained('diepi/test_patchtst_pretrained_etth1').to(torch_device) batch = prepare_batch() torch.manual_seed(0) with torch.no_grad(): - output = model(past_values=batch["past_values"].to(torch_device)).prediction_output - num_patch = ( - max(model.config.context_length, model.config.patch_length) - model.config.patch_length - ) // model.config.stride + 1 - expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) + output = model( + past_values=batch["past_values"].to(torch_device) + ).prediction_output + num_patch = (max(model.config.context_length, + model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 + expected_shape = torch.Size([64, model.config.num_input_channels, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[[0.0160]], [[0.0148]], [[0.0090]], [[0.0166]], [[0.0099]], [[0.0053]], [[0.0090]]], device=torch_device - ) + expected_slice = torch.tensor([[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], + [[0.0246]], [[0.0090]]], + device=torch_device) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # def test_classification_head(self): @@ -349,23 +336,22 @@ def test_pretrain_head(self): # ) # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) - def test_forecasting_head(self): - model = PatchTSTForForecasting.from_pretrained("./hf_etth_forecasting").to(torch_device) + def test_prediction_head(self): + model = PatchTSTForPrediction.from_pretrained('diepi/test_patchtst_prediction_etth1').to(torch_device) batch = prepare_batch(file="test-batch.pt") torch.manual_seed(0) with torch.no_grad(): output = model( past_values=batch["past_values"].to(torch_device), - future_values=batch["future_values"].to(torch_device), - ).forecast_outputs - expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) + future_values=batch["future_values"].to(torch_device) + ).prediction_output + expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[-0.9027, 0.3814, -0.8322, 0.4250, -0.7183, -0.0635, -0.8747]], - device=torch_device, - ) + expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + device=torch_device, + ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) # def test_seq_to_seq_generation(self): @@ -385,10 +371,11 @@ def test_forecasting_head(self): # # 
mean_prediction = outputs.sequences.mean(dim=1) # # self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) # - # # expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) + # # expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) # self.assertEqual(outputs.shape, expected_shape) # # expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], # device=torch_device, # ) # self.assertTrue(torch.allclose(outputs[0, :1, :7], expected_slice, atol=TOLERANCE)) + From 22adead4c72d8b13e2f03b445d0b88baaae8853c Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Fri, 1 Sep 2023 14:59:11 -0400 Subject: [PATCH 035/189] more formatting --- .../models/patchtst/modeling_patchtst.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6abbfd08839bb2..7b796500b755de 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -222,10 +222,12 @@ def compute_num_patches(sequence_length, patch_length, stride): class Patchify(nn.Module): """ A class to patchify the time series sequence into different patches + Args: - sequence_length (int, required): input sequence length - patch_length (int, required): patch length - stride (int, required): stride between patches + sequence_length (int, required): input sequence length. + patch_length (int, required): patch length. + stride (int, required): stride between patches. + Returns: z: output tensor data [bs x num_input_channels x num_patches x patch_length] """ @@ -256,6 +258,7 @@ def forward(self, past_values: torch.Tensor): """ Args: past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] + Returns: x: output tensor data [bs x num_input_channels x num_patches x patch_length] """ @@ -274,9 +277,10 @@ class PatchEmbeddings(nn.Module): """ A class to patchify the time series sequence into different patches Args: - sequence_length (int, required): input sequence length - patch_length (int, required): patch length - stride (int, required): stride between patches + sequence_length (int, required): input sequence length. + patch_length (int, required): patch length. + stride (int, required): stride between patches. + Returns: embeddings: output tensor data [bs x num_input_channels x num_patches x embed_dim] """ @@ -592,7 +596,10 @@ def __init__(self, config: PatchTSTConfig): def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: """ - x: tensor [bs x nvars x num_patches x patch_length] + Args: + past_values: tensor [bs x nvars x num_patches x patch_length]. + output_hidden_states (bool, optional): Indicates if hidden states should be output. 
+ return: tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token @@ -890,7 +897,6 @@ def _denormalize(self, x): return x -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) From 76adaaefa4dff5d597f12e28cee5d9b2af7633f0 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 2 Sep 2023 23:10:52 +0700 Subject: [PATCH 036/189] Remove some unused params --- .../models/patchtst/configuration_patchtst.py | 44 +++++++------------ .../models/patchtst/test_modeling_patchtst.py | 5 --- 2 files changed, 15 insertions(+), 34 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index df870b35f2c05d..66220c802fc8c6 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -28,6 +28,7 @@ class PatchTSTConfig(PretrainedConfig): + model_type = "patchtst" r""" This is the configuration class to store the configuration of an [`PatchTSTModel`]. It is used to instantiate an PatchTST model according to the specified arguments, defining the model architecture. @@ -89,13 +90,7 @@ class PatchTSTConfig(PretrainedConfig): init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. use_cache (`bool`, *optional*, defaults to `True`): - Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. - attention_type (`str`, *optional*, defaults to "prob"): - Attention used in encoder. This can be set to "prob" (PatchTST's ProbAttention) or "full" (vanilla - transformer's canonical self-attention). - sampling_factor (`int`, *optional*, defaults to 5): - ProbSparse sampling factor (only makes affect when `attention_type`="prob"). It is used to control the - reduced query matrix (Q_reduce) input length. + Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. distil (`bool`, *optional*, defaults to `True`): Whether to use distilling in encoder. 
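The patch count used throughout these diffs follows one formula (see `compute_num_patches` and `PatchTSTConfig._num_patches` elsewhere in this series). A minimal sketch, assuming the documented defaults `context_length=32`, `patch_length=8`, `stride=8`; the helper name here is illustrative and not part of the committed API:

```python
def num_patches(context_length: int, patch_length: int, stride: int) -> int:
    # Same formula as compute_num_patches / PatchTSTConfig._num_patches in this series.
    return (max(context_length, patch_length) - patch_length) // stride + 1

# With the defaults above: (32 - 8) // 8 + 1 = 4 patches per input channel.
print(num_patches(32, 8, 8))  # 4
```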
@@ -113,7 +108,6 @@ class PatchTSTConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "patchtst" attribute_map = { "hidden_size": "d_model", "num_attention_heads": "encoder_attention_heads", @@ -122,10 +116,18 @@ class PatchTSTConfig(PretrainedConfig): def __init__( self, + # time series specific configuration num_input_channels: int = 1, context_length: int = 32, + num_dynamic_real_features: int = 0, + num_static_real_features: int = 0, + num_static_categorical_features: int = 0, + num_time_features: int = 0, + is_encoder_decoder: bool = False, + # PatchTST arguments patch_length: int = 8, stride: int = 8, + # Transformer architecture configuration encoder_layers: int = 3, d_model: int = 128, encoder_attention_heads: int = 16, @@ -149,33 +151,23 @@ def __init__( individual: bool = False, seed_number: int = None, revin: Optional[bool] = True, + qkv_bias: bool = True, + # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", mask_ratio=0.5, mask_patches: List[int] = [2, 3], mask_patch_ratios: List[int] = [1, 1], channel_consistent_masking: bool = False, - d_size: str = "4D", unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, - pooling: str = 'mean', + # head + pooling: str = "mean", num_classes: int = 1, head_dropout: float = 0.0, - # proj_dropout: float = 0.0, - qkv_bias: bool = True, - num_dynamic_real_features: int = 0, - num_static_real_features: int = 0, - num_static_categorical_features: int = 0, - num_time_features: int = 0, - is_encoder_decoder: bool = False, - encoder_layerdrop: float = 0.1, prediction_length: int = 24, prediction_range: List = [0, 1], num_output_channels: int = 1, - # PatchTST arguments - attention_type: str = "prob", - sampling_factor: int = 5, - distil: bool = True, **kwargs, ): @@ -194,7 +186,6 @@ def __init__( self.encoder_layers = encoder_layers self.dropout = dropout self.attention_dropout = attention_dropout - self.encoder_layerdrop = encoder_layerdrop self.shared_embedding = shared_embedding self.channel_attention = channel_attention self.norm = norm @@ -216,11 +207,8 @@ def __init__( self.patch_length = patch_length self.stride = stride self.num_patches = self._num_patches() - self.attention_type = attention_type - self.sampling_factor = sampling_factor - self.distil = distil - # Masking + # Mask pretraining self.seed_number = seed_number self.mask_input = mask_input self.mask_type = mask_type @@ -228,7 +216,6 @@ def __init__( self.mask_patches = mask_patches self.mask_patch_ratios = mask_patch_ratios self.channel_consistent_masking = channel_consistent_masking - self.d_size = d_size self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value @@ -239,7 +226,6 @@ def __init__( # Classification self.num_classes = num_classes - # self.proj_dropout = proj_dropout # Forcasting and prediction self.prediction_length = prediction_length diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 911d5160db46c7..8208070116999a 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -61,7 +61,6 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, lags_sequence=[1, 2, 3, 4, 5], - sampling_factor=10, distil=False, seed_number=42, num_classes=2, @@ -85,13 +84,9 @@ def __init__( self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob - 
self.encoder_seq_length = min( - sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length - ) self.seed_number = seed_number self.num_classes = num_classes self.num_output_channels = num_output_channels - self.sampling_factor = sampling_factor self.distil = distil self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 From 1b078c7d442667215f0b2d1156fc7ea3676d3bf9 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Sat, 2 Sep 2023 21:28:23 -0400 Subject: [PATCH 037/189] Add a comment for pretrained models --- .../models/patchtst/modeling_patchtst.py | 143 ++---------------- .../models/patchtst/test_modeling_patchtst.py | 55 +------ 2 files changed, 18 insertions(+), 180 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7b796500b755de..03e028b9c8e99e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 TSFM team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -665,54 +665,14 @@ def forward(self, past_values: torch.Tensor, to construct lag features, i.e. additional values from the past which are added in order to serve as "extra context". - The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no - `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest - look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of - the past. + The `sequence_length` here is equal to `config.context_length` The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as - `static_categorical_features`, `static_real_features`, `past_time_features` and lags). + `static_categorical_features`, `static_real_features`). - Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`. - - For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of - variates in the time series per time step. - past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`): - Required time features, which the model internally will add to `past_values`. These could be things like - "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These - could also be so-called "age" features, which basically help the model know "at which point in life" a - time-series is. Age features have small values for distant past time steps and increase monotonically the - more we approach the current time step. Holiday features are also a good example of time features. - - These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where - the position encodings are learned from scratch internally as parameters of the model, the Time Series - Transformer requires to provide additional time features. The Time Series Transformer only learns - additional embeddings for `static_categorical_features`. 
- - Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features - must but known at prediction time. - - The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. - past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`, *optional*): - Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in - `[0, 1]`: - - - 1 for values that are **observed**, - - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - - static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): - Optional static categorical features for which the model will learn an embedding, which it will add to the - values of the time series. - - Static categorical features are features which have the same value for all time steps (static over time). - - A typical example of a static categorical feature is a time series ID. - static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*): - Optional static real features which the model will add to the values of the time series. - - Static real features are features which have the same value for all time steps (static over time). - - A typical example of a static real feature is promotion information. + For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the + number of variates in the time series per time step. + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, num_input_channels)`, *optional*): Future values of the time series, that serve as labels for the model. The `future_values` is what the Transformer needs during training to learn to output, given the `past_values`. @@ -721,94 +681,11 @@ def forward(self, past_values: torch.Tensor, See the demo notebook and code snippets for details. - Optionally, during training any missing values need to be replaced with zeros and indicated via the - `future_observed_mask`. - - For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of - variates in the time series per time step. - future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`): - Required time features for the prediction window, which the model internally will add to `future_values`. - These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as - Fourier features). These could also be so-called "age" features, which basically help the model know "at - which point in life" a time-series is. Age features have small values for distant past time steps and - increase monotonically the more we approach the current time step. Holiday features are also a good example - of time features. - - These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where - the position encodings are learned from scratch internally as parameters of the model, the Time Series - Transformer requires to provide additional time features. The Time Series Transformer only learns - additional embeddings for `static_categorical_features`. 
- - Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features - must but known at prediction time. - - The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. - future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`, *optional*): - Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected - in `[0, 1]`: - - - 1 for values that are **observed**, - - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - - This mask is used to filter out missing values for the final loss calculation. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to - make sure the model can only look at previous inputs in order to predict the future. - head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*) - `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of - hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. + For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the + number of variates in the time series per time step. + output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + Whether or not to return the hidden states of all layers. """ diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 8208070116999a..8a444f1eecd5d3 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -285,17 +285,20 @@ def test_retain_grad_hidden_states_attentions(self): super().test_retain_grad_hidden_states_attentions() -def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): +# Note: Publishing of this dataset is under internal review. The dataset is not yet downloadable. +def prepare_batch(repo_id="ibm/etth1", file="train-batch.pt"): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch +# Note: Publishing of pretrained weights is under internal review. Pretrained model is not yet downloadable. @require_torch @slow class PatchTSTModelIntegrationTests(unittest.TestCase): + # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained('diepi/test_patchtst_pretrained_etth1').to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst_pretrained_etth1").to(torch_device) batch = prepare_batch() torch.manual_seed(0) @@ -313,26 +316,10 @@ def test_pretrain_head(self): device=torch_device) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) - # def test_classification_head(self): - # # mock data, test - # model = PatchTSTForClassification.from_pretrained('diepi/test_patchtst_classification_mock').to(torch_device) - # batch = prepare_batch(repo_id="diepi/mock-data", file="test-mock-patchtst.pt") - # - # torch.manual_seed(0) - # with torch.no_grad(): - # output = model( - # past_values=batch["past_values"].to(torch_device) - # ).prediction_logits - # expected_shape = torch.Size([1, model.config.num_classes]) - # self.assertEqual(output.shape, expected_shape) - # - # expected_slice = torch.tensor([[-0.2774, -0.1081, 0.6771]], - # device=torch_device, - # ) - # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) - + # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
def test_prediction_head(self): - model = PatchTSTForPrediction.from_pretrained('diepi/test_patchtst_prediction_etth1').to(torch_device) + model = PatchTSTForPrediction.from_pretrained("ibm/patchtst_prediction_etth1").to(torch_device) + batch = prepare_batch(file="test-batch.pt") torch.manual_seed(0) @@ -348,29 +335,3 @@ def test_prediction_head(self): device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) - - # def test_seq_to_seq_generation(self): - # model = PatchTSTForPrediction.from_pretrained("diepi/test_patchtst_prediction_etth1").to(torch_device) - # batch = prepare_batch("val-batch.pt") - # - # torch.manual_seed(0) - # with torch.no_grad(): - # outputs = model.generate( - # past_values=batch["past_values"].to(torch_device), - # future_values=batch["future_values"].to(torch_device) - # ).prediction_output - # expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) - # # self.assertEqual(outputs.sequences.shape, expected_shape) - # # - # # expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) - # # mean_prediction = outputs.sequences.mean(dim=1) - # # self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) - # - # # expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) - # self.assertEqual(outputs.shape, expected_shape) - # - # expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], - # device=torch_device, - # ) - # self.assertTrue(torch.allclose(outputs[0, :1, :7], expected_slice, atol=TOLERANCE)) - From f718c042d6a08323d270dd521271464c6fde5202 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 3 Sep 2023 23:50:34 +0700 Subject: [PATCH 038/189] add channel_attention option add channel_attention option and remove unused positional encoders. 
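A minimal sketch of what the new `channel_attention` option does to the tensor layout, with illustrative sizes; it mirrors the reshapes in `ChannelAttentionTSTEncoderLayer` below but is not the committed code:

```python
import torch

bs, nvars, seq_len, d_model = 2, 7, 12, 16
src = torch.randn(bs, nvars, seq_len, d_model)

# Sublayer 1: attention across time, one sequence per (sample, channel) pair.
across_time = src.view(bs * nvars, seq_len, d_model)  # [(bs*nvars) x seq_len x d_model]

# Optional sublayer 2 (enabled by config.channel_attention): attention across
# channels at each time step.
across_channels = (
    src.transpose(2, 1).contiguous().view(bs * seq_len, nvars, d_model)
)  # [(bs*seq_len) x nvars x d_model]

print(across_time.shape, across_channels.shape)
```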
--- .../models/patchtst/modeling_patchtst.py | 86 +++++-------------- 1 file changed, 23 insertions(+), 63 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 03e028b9c8e99e..c77a94a4e1c0fb 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -89,26 +89,15 @@ def positional_encoding(pe, learn_pe, q_len, d_model): w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False - elif pe == "zero": - w_pos = torch.empty((q_len, 1)) - nn.init.uniform_(w_pos, -0.02, 0.02) elif pe == "zeros": w_pos = torch.empty((q_len, d_model)) nn.init.uniform_(w_pos, -0.02, 0.02) - elif pe == "normal" or pe == "gauss": + elif pe == "normal": w_pos = torch.zeros((q_len, 1)) torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) elif pe == "uniform": w_pos = torch.zeros((q_len, 1)) nn.init.uniform_(w_pos, a=0.0, b=0.1) - elif pe == "lin1d": - w_pos = coord1d_pos_encoding(q_len, exponential=False, normalize=True) - elif pe == "exp1d": - w_pos = coord1d_pos_encoding(q_len, exponential=True, normalize=True) - elif pe == "lin2d": - w_pos = coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True) - elif pe == "exp2d": - w_pos = coord2d_pos_encoding(q_len, d_model, exponential=True, normalize=True) elif pe == "sincos": pos_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) @@ -120,43 +109,11 @@ def positional_encoding(pe, learn_pe, q_len, d_model): w_pos = pos_enc else: raise ValueError( - f"{pe} is not a valid pe (positional encoder. Available types: 'gauss'=='normal', \ - 'zeros', 'zero', uniform', 'lin1d', 'exp1d', 'lin2d', 'exp2d', 'sincos', None.)" + f"{pe} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." 
) return nn.Parameter(w_pos, requires_grad=learn_pe) -def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps=1e-3, verbose=False): - x = 0.5 if exponential else 1 - i = 0 - for i in range(100): - cpe = ( - 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * ( - torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - - 1 - ) - - if abs(cpe.mean()) <= eps: - break - elif cpe.mean() > eps: - x += 0.001 - else: - x -= 0.001 - i += 1 - if normalize: - cpe = cpe - cpe.mean() - cpe = cpe / (cpe.std() * 10) - return cpe - - -def coord1d_pos_encoding(q_len, exponential=False, normalize=True): - cpe = 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** (0.5 if exponential else 1)) - 1 - if normalize: - cpe = cpe - cpe.mean() - cpe = cpe / (cpe.std() * 10) - return cpe - - def set_seed(x=42): random.seed(x) np.random.seed(x) @@ -444,6 +401,7 @@ class ChannelAttentionTSTEncoderLayer(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() + self.channel_attention = config.channel_attention # Multi-Head attention self.self_attn = PatchTSTAttention(config) @@ -455,11 +413,12 @@ def __init__(self, config: PatchTSTConfig): self.norm_sublayer1 = nn.LayerNorm(config.d_model) # Add & Norm of the sublayer 2 - self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() - if "batch" in config.norm.lower(): - self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) - else: - self.norm_sublayer2 = nn.LayerNorm(config.d_model) + if self.channel_attention: + self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + if "batch" in config.norm.lower(): + self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + else: + self.norm_sublayer2 = nn.LayerNorm(config.d_model) # Position-wise Feed-Forward self.ff = nn.Sequential( @@ -501,18 +460,19 @@ def forward(self, src: torch.Tensor): # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] - src = src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, - d_model) # [(bs*sequence_length) x nvars x d_model] - if self.pre_norm: - ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout - else: - ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] - src = src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, - 2).contiguous() # src: [bs x nvars x sequence_length x d_model] + if self.channel_attention: + src = src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, + d_model) # [(bs*sequence_length) x nvars x d_model] + if self.pre_norm: + ## Norm and Multi-Head attention and Add residual connection + src = src + self.dropout_path2( + self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + else: + ## Multi-Head attention and Add residual connection and Norm + src = self.norm_sublayer2( + src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] + src = src.reshape(bs, sequence_length, 
num_input_channels, d_model).transpose(1, + 2).contiguous() # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] @@ -521,7 +481,7 @@ def forward(self, src: torch.Tensor): src = src + self.dropout_path3( self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout else: - ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT + ## Position-wise Feed-Forward and Add residual connection and Norm src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] From 3c09a3393c0fd5a75fb2c47a1a12497b3ead2f60 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 5 Sep 2023 19:14:13 -0400 Subject: [PATCH 039/189] Update PatchTST models to use HF's MultiHeadAttention module --- .../models/patchtst/configuration_patchtst.py | 25 +- .../models/patchtst/modeling_patchtst.py | 523 +++++++++++------- .../models/patchtst/test_modeling_patchtst.py | 82 +-- 3 files changed, 383 insertions(+), 247 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 66220c802fc8c6..71efa3e480f6b6 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -14,7 +14,7 @@ # limitations under the License. """PatchTST model configuration""" -from typing import List, Optional, Union +from typing import List, Optional from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -24,6 +24,7 @@ PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP = { "ibm/patchtst-base": "https://huggingface.co/ibm/patchtst-base/resolve/main/config.json", + # See all PatchTST models at https://huggingface.co/ibm/models?filter=patchtst } @@ -32,6 +33,7 @@ class PatchTSTConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of an [`PatchTSTModel`]. It is used to instantiate an PatchTST model according to the specified arguments, defining the model architecture. + [ibm/patchtst](https://huggingface.co/ibm/patchtst) architecture. Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -90,7 +92,7 @@ class PatchTSTConfig(PretrainedConfig): init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. use_cache (`bool`, *optional*, defaults to `True`): - Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. + Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. distil (`bool`, *optional*, defaults to `True`): Whether to use distilling in encoder. 
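For orientation, a sketch of the per-head shape bookkeeping used by the BartAttention-style module this commit introduces in `modeling_patchtst.py`, assuming the defaults `d_model=128` and `encoder_attention_heads=16`; the concrete numbers and tensor names are illustrative only:

```python
import torch

embed_dim, num_heads, bsz, tgt_len = 128, 16, 2, 10
head_dim = embed_dim // num_heads  # 8
scaling = head_dim ** -0.5

# Queries are pre-scaled, then attention is computed per head with batched matmuls.
q = torch.randn(bsz * num_heads, tgt_len, head_dim) * scaling
k = torch.randn(bsz * num_heads, tgt_len, head_dim)
v = torch.randn(bsz * num_heads, tgt_len, head_dim)

attn_weights = torch.softmax(torch.bmm(q, k.transpose(1, 2)), dim=-1)
attn_output = torch.bmm(attn_weights, v)  # [bsz*num_heads, tgt_len, head_dim]
print(attn_output.shape)
```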
@@ -119,11 +121,6 @@ def __init__( # time series specific configuration num_input_channels: int = 1, context_length: int = 32, - num_dynamic_real_features: int = 0, - num_static_real_features: int = 0, - num_static_categorical_features: int = 0, - num_time_features: int = 0, - is_encoder_decoder: bool = False, # PatchTST arguments patch_length: int = 8, stride: int = 8, @@ -143,7 +140,6 @@ def __init__( bias: bool = True, activation_function: str = "gelu", pre_norm: bool = False, - store_attn: bool = False, positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, @@ -151,7 +147,6 @@ def __init__( individual: bool = False, seed_number: int = None, revin: Optional[bool] = True, - qkv_bias: bool = True, # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", @@ -170,14 +165,9 @@ def __init__( num_output_channels: int = 1, **kwargs, ): - # time series specific configuration self.context_length = context_length - self.num_input_channels = num_input_channels # n_vars - self.num_time_features = num_time_features - self.num_dynamic_real_features = num_dynamic_real_features - self.num_static_real_features = num_static_real_features - self.num_static_categorical_features = num_static_categorical_features + self.num_input_channels = num_input_channels # n_vars # Transformer architecture configuration self.d_model = d_model @@ -195,12 +185,10 @@ def __init__( self.bias = bias self.activation_function = activation_function self.pre_norm = pre_norm - self.store_attention = store_attn self.positional_encoding = positional_encoding self.learn_pe = learn_pe self.use_cls_token = use_cls_token self.init_std = init_std - self.qkv_bias = qkv_bias self.revin = revin # PatchTST @@ -234,8 +222,7 @@ def __init__( self.num_output_channels = num_output_channels self.prediction_range = prediction_range - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__init__(**kwargs) def _num_patches(self): return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 - diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c77a94a4e1c0fb..4289fdb2d41fbd 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -14,19 +14,20 @@ # limitations under the License. 
""" PyTorch PatchTST model.""" -from typing import Optional, Tuple -import torch -from torch import nn import math import random +from typing import Optional, Tuple + import numpy as np +import torch +from torch import nn +from torch.nn.modules.activation import MultiheadAttention -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, logging from transformers.modeling_outputs import BaseModelOutputWithNoAttention -from transformers.utils import ModelOutput -from torch.nn.modules.activation import MultiheadAttention +from transformers.modeling_utils import PreTrainedModel from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +from transformers.utils import ModelOutput, add_start_docstrings, logging + logger = logging.get_logger(__name__) @@ -38,27 +39,159 @@ ] - +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->PatchTST class PatchTSTAttention(nn.Module): - def __init__(self, config: PatchTSTConfig): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder - self.self_attn = MultiheadAttention( - embed_dim=config.d_model, - num_heads=config.encoder_attention_heads, - dropout=config.attention_dropout, - bias=config.bias, - add_bias_kv=True, - add_zero_attn=False, - batch_first=True, - ) + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def forward(self, src: torch.Tensor) -> torch.Tensor: - """ - src: Tensor [bs x q_len x d_model] - """ - src, _ = self.self_attn(src, src, src, need_weights=False) - return src + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + 
key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value def get_activation_fn(activation): @@ -85,7 +218,7 @@ def forward(self, x): def positional_encoding(pe, learn_pe, q_len, d_model): # Positional encoding - if pe == None: + if pe is None: w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False @@ -118,30 +251,33 @@ def set_seed(x=42): random.seed(x) np.random.seed(x) torch.manual_seed(x) - if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(x) def random_masking( - xb: torch.Tensor, - mask_ratio: float, - unmasked_channel_indices: list = None, - channel_consistent_masking: bool = False, - mask_value=0, - seed_number: Optional[int] = None + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = False, + mask_value=0, + seed_number: Optional[int] = None, ): """random_masking: Mask the input considering the control variables. Args: xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. - unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + unmasked_channel_indices (list, optional): + indices of unmasked channels. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): + When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary + across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. seed_number (int, optional): Value to set for the random seed. 
Returns: - Tensor: xb_mask, masked input, same shape as input - Tensor: Mask tensor of shape [bs x c x n] + Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] """ if seed_number: set_seed(seed_number) @@ -190,16 +326,16 @@ class Patchify(nn.Module): """ def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + self, + sequence_length: int, + patch_length: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" self.sequence_length = sequence_length @@ -220,9 +356,11 @@ def forward(self, past_values: torch.Tensor): x: output tensor data [bs x num_input_channels x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + assert ( + sequence_length == self.sequence_length + ), f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." - x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # x: [bs x num_patches x num_input_channels x patch_length] @@ -232,27 +370,20 @@ def forward(self, past_values: torch.Tensor): class PatchEmbeddings(nn.Module): """ - A class to patchify the time series sequence into different patches Args: - sequence_length (int, required): input sequence length. - patch_length (int, required): patch length. - stride (int, required): stride between patches. + A class to patchify the time series sequence into different patches + sequence_length (int, required): input sequence length. patch_length (int, required): patch length. stride + (int, required): stride between patches. 
Returns: embeddings: output tensor data [bs x num_input_channels x num_patches x embed_dim] """ - def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - embed_dim: int - ): + def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_dim: int): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" @@ -268,11 +399,12 @@ def __init__( self.s_begin = sequence_length - new_sequence_length # Embedding - self.projection = nn.Conv1d(in_channels=1, - out_channels=embed_dim, - kernel_size=patch_length, - stride=stride, - ) + self.projection = nn.Conv1d( + in_channels=1, + out_channels=embed_dim, + kernel_size=patch_length, + stride=stride, + ) def forward(self, past_values: torch.Tensor): """ @@ -282,16 +414,19 @@ def forward(self, past_values: torch.Tensor): embeddings: output tensor data [bs x num_input_channels x num_patches x emb_dim] """ bs, sequence_length, num_input_channels = past_values.shape - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." + assert ( + sequence_length == self.sequence_length + ), f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." - x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] # convert past_values to shape [bs*num_input_channels x 1 x sequence_length ] x = x.transpose(1, 2).reshape(bs * num_input_channels, 1, -1).contiguous() # projection embeddings = self.projection(x) # embeddings: [bs*num_input_channels x emb_dim x num_patches] # reshape - embeddings = embeddings.transpose(1, 2).view(bs, num_input_channels, -1, - self.embed_dim).contiguous() # embeddings: [bs x num_input_channels x num_patches x emb_dim] + embeddings = ( + embeddings.transpose(1, 2).view(bs, num_input_channels, -1, self.embed_dim).contiguous() + ) # embeddings: [bs x num_input_channels x num_patches x emb_dim] # embeddings = embeddings.flatten(2).transpose(1, 2) return embeddings @@ -306,24 +441,26 @@ class PatchMasking(nn.Module): mask_patches (list, optional): List of patch lengths to mask in the end of the data. mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + unmasked_channel_indices (list, optional): + Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): + When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary + across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. 
seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. """ def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None, ): - # if seed_number: # set_seed(seed_number) self.mask_ratio = mask_ratio @@ -343,11 +480,11 @@ def forward(self, x: torch.Tensor): """ Input: x: patched input - 4D: [bs x num_input_channels x num_patches x patch_length] + 4D: [bs x num_input_channels x num_patches x patch_length] Output: x_mask: Masked patched input - 4D: [bs x num_input_channels x num_patches x patch_length] + 4D: [bs x num_input_channels x num_patches x patch_length] mask: bool tensor indicating True on masked points 4D: [bs x num_input_channels x num_patch] """ @@ -359,7 +496,7 @@ def forward(self, x: torch.Tensor): unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, - seed_number=self.seed_number + seed_number=self.seed_number, ) else: @@ -374,17 +511,11 @@ class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList( - [ - ChannelAttentionTSTEncoderLayer(config) - for i in range(config.encoder_layers) - ] - ) + self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x sequence_length x d_model] - Return: + src: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] @@ -403,7 +534,13 @@ def __init__(self, config: PatchTSTConfig): self.channel_attention = config.channel_attention # Multi-Head attention - self.self_attn = PatchTSTAttention(config) + # self.self_attn = PatchTSTAttention(config) + + self.self_attn = PatchTSTAttention( + embed_dim=config.d_model, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) # Add & Norm of the sublayer 1 self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() @@ -436,54 +573,64 @@ def __init__(self, config: PatchTSTConfig): self.norm_sublayer3 = nn.LayerNorm(config.d_model) self.pre_norm = config.pre_norm - self.store_attn = config.store_attention def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x sequence_length x d_model] - Return: + src: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ bs, num_input_channels, sequence_length, d_model = src.shape # First sublayer: attention across time - src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view( + bs * num_input_channels, sequence_length, d_model + ) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual 
dropout + self.self_attn(self.norm_sublayer1(src)[0]) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1( - src + self.dropout_path1(self.self_attn(src))) # src: [(bs*nvars) x sequence_length x d_model] + src + self.dropout_path1(self.self_attn(src)[0]) + ) # src: [(bs*nvars) x sequence_length x d_model] src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] if self.channel_attention: - src = src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, - d_model) # [(bs*sequence_length) x nvars x d_model] + src = ( + src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, d_model) + ) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer2(src)[0]) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] - src = src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, - 2).contiguous() # src: [bs x nvars x sequence_length x d_model] + src + self.dropout_path2(self.self_attn(src)[0]) + ) # src: [(bs*sequence_length) x nvars x d_model] + src = ( + src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() + ) # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view( + bs * num_input_channels, sequence_length, d_model + ) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( - self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer3(src)) + ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm src = self.norm_sublayer3( - src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout + src + self.dropout_path3(self.ff(src)) + ) # Add: residual connection with residual dropout src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -511,7 +658,6 @@ def _init_weights(self, module): module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) - def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): module.gradient_checkpointing = value @@ -538,11 +684,13 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, 
- config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model + ) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, - config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, config.num_patches, config.d_model + ) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -553,8 +701,9 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values: torch.Tensor, - output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: + def forward( + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + ) -> BaseModelOutputWithNoAttention: """ Args: past_values: tensor [bs x nvars x num_patches x patch_length]. @@ -591,14 +740,12 @@ def forward(self, past_values: torch.Tensor, # Encoder past_values, hidden_states = self.encoder( - past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] + past_values, output_hidden_states + ) # x: [bs x nvars x num_patches x d_model] # or [bs x nvars x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states - return BaseModelOutputWithNoAttention( - last_hidden_state=past_values, - hidden_states=hidden_states - ) + return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) PATCHTST_START_DOCSTRING = r""" @@ -632,7 +779,7 @@ def forward(self, past_values: torch.Tensor, For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of variates in the time series per time step. - + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, num_input_channels)`, *optional*): Future values of the time series, that serve as labels for the model. The `future_values` is what the Transformer needs during training to learn to output, given the `past_values`. @@ -645,7 +792,7 @@ def forward(self, past_values: torch.Tensor, number of variates in the time series per time step. output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. + Whether or not to return the hidden states of all layers. """ @@ -662,8 +809,8 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of + the model at the output of each layer plus the optional initial embedding outputs. 
patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) @@ -685,8 +832,9 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): class RevIN(nn.Module): def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): """ - :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x num_input_channels] - :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm input here. + :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x + num_input_channels] :denorm_channels if the denorm input shape has less number of channels, mention the + channels in the denorm input here. """ super(RevIN, self).__init__() self.stdev = None @@ -701,10 +849,10 @@ def set_statistics(self, mean, stdev): self.stdev = stdev def forward(self, x, mode: str): - if mode == 'norm': + if mode == "norm": self._get_statistics(x) x = self._normalize(x) - elif mode == 'denorm': + elif mode == "denorm": x = self._denormalize(x) elif mode == "transform": x = self._normalize(x) @@ -760,7 +908,7 @@ def __init__(self, config: PatchTSTConfig): channel_consistent_masking=config.channel_consistent_masking, unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, - seed_number=config.seed_number + seed_number=config.seed_number, ) else: self.masking = nn.Identity() @@ -769,10 +917,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None): + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -780,19 +930,22 @@ def forward(self, past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] patched_values = self.patching( - past_values) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain + past_values + ) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain if self.mask_input: masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) - return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, - hidden_states=encoder_output.hidden_states, - patched_input=patched_values, - mask=mask, - revin_mean=self.revin.mean if self.use_revin else None, - revin_stdev=self.revin.stdev if self.use_revin else None - ) + return PatchTSTModelOutputWithNoAttention( + last_hidden_state=encoder_output.last_hidden_state, + hidden_states=encoder_output.hidden_states, + patched_input=patched_values, + mask=mask, + revin_mean=self.revin.mean if self.use_revin else None, + revin_stdev=self.revin.stdev if self.use_revin else None, + ) + class MaskPretrainHead(nn.Module): def __init__(self, config): @@ -849,19 +1002,19 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) self.head = 
MaskPretrainHead(config) - self.loss = torch.nn.MSELoss(reduction='none') + self.loss = torch.nn.MSELoss(reduction="none") # Initialize weights and apply final processing self.post_init() def forward( - self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, ) -> PatchTSTOutput: """ - past_values (x): tensor [bs x sequence_length x num_input_channels ] - future_values (y): labels + past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -879,11 +1032,7 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput( - loss=masked_loss, - prediction_output=x_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -910,9 +1059,7 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, - prediction_logits=y_hat, - hidden_states=model_output.hidden_states + loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states ) @@ -927,8 +1074,8 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x): """ - x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: [bs x n_classes] + x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: + [bs x n_classes] """ if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model @@ -996,12 +1143,12 @@ def forward(self, x): batch_size = x.shape[0] if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == 'mean': + elif self.pooling == "mean": x = x.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": x = x.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f'pooling operator {self.pooling} is not implemented yet') + raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input x = self.flatten(x) # x: bs x (nvars * d_model) @@ -1012,7 +1159,6 @@ def forward(self, x): return y - class PatchTSTForPrediction(PatchTSTPreTrainedModel): # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): @@ -1020,16 +1166,17 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None): - + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if 
output_hidden_states is not None else self.config.output_hidden_states ) @@ -1039,11 +1186,7 @@ def forward(self, loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput( - loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) class PatchTSTForForecastingOutput(ModelOutput): @@ -1093,8 +1236,7 @@ def __init__(self, config: PatchTSTConfig): for i in range(self.num_input_channels): self.flattens.append(nn.Flatten(start_dim=2)) self.linears.append(nn.Linear(head_dim, config.prediction_length)) - self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - ) + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: self.flatten = nn.Flatten(start_dim=2) self.linear = nn.Linear(head_dim, config.prediction_length) @@ -1110,9 +1252,9 @@ def forward(self, x: torch.Tensor): if self.use_cls_token: y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: - if self.pooling == 'mean': + if self.pooling == "mean": y = x.mean(dim=2) # y: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": y = x.max(dim=2) # y: [bs x nvars x d_model] else: y = x # y: [bs x nvars x num_patches x d_model] @@ -1141,7 +1283,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = ForecastHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") self.use_revin = config.revin if self.use_revin: self.revin = RevIN() @@ -1151,10 +1293,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None): + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1170,9 +1314,7 @@ def forward(self, if future_values is not None: loss_val = self.loss(y_hat, future_values) return PatchTSTForForecastingOutput( - loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states + loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states ) @@ -1198,12 +1340,12 @@ def forward(self, past_values): """ if self.use_cls_token: past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == 'mean': + elif self.pooling == "mean": past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f'pooling operator {self.pooling} is not implemented yet') + raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input past_values = self.flatten(past_values) # x: bs x nvars * d_model y = self.linear(self.dropout(past_values)) # y: bs x output_dim @@ -1220,15 +1362,14 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = RegressionHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = 
nn.MSELoss(reduction="mean") # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - labels: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None): + def forward( + self, past_values: torch.Tensor, labels: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1238,8 +1379,4 @@ def forward(self, loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput( - loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 8a444f1eecd5d3..f3a045ef756de4 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -15,16 +15,16 @@ """ Testing suite for the PyTorch PatchTST model. """ import inspect +import random import tempfile import unittest -import numpy as np from huggingface_hub import hf_hub_download from transformers import is_torch_available -from transformers.testing_utils import is_flaky, require_torch, torch_device, slow from transformers.models.auto import get_values -import random +from transformers.testing_utils import is_flaky, require_torch, slow, torch_device + from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -34,10 +34,18 @@ if is_torch_available(): import torch - from transformers import PatchTSTConfig, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING - from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining, \ - PatchTSTForClassification, PatchTSTForRegression + from transformers import ( + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, + PatchTSTConfig, + PatchTSTForClassification, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForPrediction, + PatchTSTForRegression, + PatchTSTModel, + ) @require_torch @@ -106,7 +114,7 @@ def get_config(self): activation_function=self.hidden_act, seed_number=self.seed_number, num_classes=self.num_classes, - num_output_channels=self.num_output_channels + num_output_channels=self.num_output_channels, ) def prepare_patchtst_inputs_dict(self, config): @@ -137,18 +145,21 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, - PatchTSTForMaskPretraining, - PatchTSTForClassification, - PatchTSTForRegression) + ( + PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForClassification, + PatchTSTForRegression, + ) if is_torch_available() else () ) - all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () + all_generative_model_classes = ( + (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () + ) pipeline_model_mapping = 
{"feature-extraction": PatchTSTModel} if is_torch_available() else {} - is_encoder_decoder = False test_pruning = False test_head_masking = False test_missing_keys = False @@ -156,7 +167,6 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_inputs_embeds = False test_model_common_attributes = False - test_resize_embeddings = True test_resize_position_embeddings = False test_mismatched_shapes = True @@ -201,7 +211,7 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) -# + # def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -211,7 +221,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + hidden_states = outputs.hidden_states expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers @@ -228,7 +238,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True - print('model_class: ', model_class) + print("model_class: ", model_class) check_hidden_states_output(inputs_dict, config, model_class) @@ -237,8 +247,9 @@ def check_hidden_states_output(inputs_dict, config, model_class): config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) -# -# # Ignore since we have no tokens embeddings + + # + # # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass @@ -268,8 +279,9 @@ def test_forward_signature(self): "past_values", "future_values", ] - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or \ - model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING + ): expected_arg_names.remove("future_values") expected_arg_names.append("labels") expected_arg_names.extend( @@ -303,17 +315,16 @@ def test_pretrain_head(self): torch.manual_seed(0) with torch.no_grad(): - output = model( - past_values=batch["past_values"].to(torch_device) - ).prediction_output - num_patch = (max(model.config.context_length, - model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 + output = model(past_values=batch["past_values"].to(torch_device)).prediction_output + num_patch = ( + max(model.config.context_length, model.config.patch_length) - model.config.patch_length + ) // model.config.stride + 1 expected_shape = torch.Size([64, model.config.num_input_channels, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], - [[0.0246]], [[0.0090]]], - device=torch_device) + expected_slice = torch.tensor( + [[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], [[0.0246]], [[0.0090]]], device=torch_device + ) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
@@ -326,12 +337,13 @@ def test_prediction_head(self): with torch.no_grad(): output = model( past_values=batch["past_values"].to(torch_device), - future_values=batch["future_values"].to(torch_device) + future_values=batch["future_values"].to(torch_device), ).prediction_output expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], - device=torch_device, - ) + expected_slice = torch.tensor( + [[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + device=torch_device, + ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From 3bada036d8dd0349ef7f2a75c59a17d37c80bc5b Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Thu, 7 Sep 2023 14:21:23 -0400 Subject: [PATCH 040/189] Update paper + github urls --- README.md | 2 +- README_es.md | 2 +- README_hd.md | 2 +- README_ja.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.md | 2 +- docs/source/en/model_doc/patchtst.md | 6 +++--- src/transformers/models/patchtst/modeling_patchtst.py | 1 + 10 files changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index a7246572381451..471a64174c3296 100644 --- a/README.md +++ b/README.md @@ -428,7 +428,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_es.md b/README_es.md index 62085093026a87..67e6559faee2cf 100644 --- a/README_es.md +++ b/README_es.md @@ -405,7 +405,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_hd.md b/README_hd.md index 5e93de459461a7..914064bb151e9a 100644 --- a/README_hd.md +++ b/README_hd.md @@ -377,7 +377,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. 
**[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया। -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM से) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. द्वाराअनुसंधान पत्र [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) के साथ जारी किया गया 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा। 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index 1067b2e57a25ee..b0026ec09b3326 100644 --- a/README_ja.md +++ b/README_ja.md @@ -439,7 +439,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM から) Yuqi Nie, Nam H. 
Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. から公開された研究論文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) diff --git a/README_ko.md b/README_ko.md index 202d3d4893561a..722199cb2950a3 100644 --- a/README_ko.md +++ b/README_ko.md @@ -354,7 +354,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM 에서 제공)은 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.의 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf)논문과 함께 발표했습니다. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. 
Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 8fe1633f181115..59e0c58dcf69cd 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -378,7 +378,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。 -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (来自 IBM) 伴随论文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 由 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 9c615363e61a81..34b2eee82cb8f4 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -390,7 +390,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index b9f65477b5fe1c..d059a059bcac27 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -194,7 +194,7 @@ The documentation is organized into five sections: 1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. 
**[PEGASUS-X](model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 9a30b8294571b0..9d08bdd628f0ba 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -18,19 +18,19 @@ rendered properly in your Markdown viewer. ## Overview -The PatchTST model was proposed in []() by . +The PatchTST model was proposed in [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. The abstract from the paper is the following: -** +*We propose an efficient design of Transformer-based models for multivariate time series forecasting and self-supervised representation learning. It is based on two key components: (i) segmentation of time series into subseries-level patches which are served as input tokens to Transformer; (ii) channel-independence where each channel contains a single univariate time series that shares the same embedding and Transformer weights across all the series. Patching design naturally has three-fold benefit: local semantic information is retained in the embedding; computation and memory usage of the attention maps are quadratically reduced given the same look-back window; and the model can attend longer history. Our channel-independent patch time series Transformer (PatchTST) can improve the long-term forecasting accuracy significantly when compared with that of SOTA Transformer-based models. We also apply our model to self-supervised pre-training tasks and attain excellent fine-tuning performance, which outperforms supervised training on large datasets. Transferring of masked pre-trained representation on one dataset to others also produces SOTA forecasting accuracy.* Tips: This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +The original code can be found [here](https://github.com/yuqinie98/PatchTST). 
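As a rough illustration of the patching step described in the abstract, a minimal sketch is shown below. The lengths 512/16/16 are assumed values, and `unfold` is only one way to materialize the patches, not necessarily the exact implementation in this repository; the number of patches follows the same formula used in the PatchTST tests, and the resulting tensor layout matches the `[batch x channels x num_patches x patch_length]` convention used throughout `modeling_patchtst.py`.

```python
import torch

context_length, patch_length, stride = 512, 16, 16  # assumed, illustrative values
num_patches = (max(context_length, patch_length) - patch_length) // stride + 1  # -> 32

past_values = torch.randn(2, context_length, 7)      # [batch x time x channels]
patches = past_values.transpose(1, 2).unfold(-1, patch_length, stride)
# patches: [batch x channels x num_patches x patch_length], i.e. 32 "words" of length 16 per channel
```
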
## PatchTSTConfig diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4289fdb2d41fbd..c230fc10622095 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1289,6 +1289,7 @@ def __init__(self, config: PatchTSTConfig): self.revin = RevIN() else: self.revin = nn.Identity() + config.pooling = None # Initialize weights and apply final processing self.post_init() From 55065926064955ff3de149f23d831ff421423fcd Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Thu, 7 Sep 2023 14:39:59 -0400 Subject: [PATCH 041/189] Fix hidden_state return value --- README.md | 2 +- docs/source/en/index.md | 2 +- src/transformers/models/patchtst/modeling_patchtst.py | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 471a64174c3296..37400c7d14d93b 100644 --- a/README.md +++ b/README.md @@ -428,7 +428,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index d059a059bcac27..f56263447b10d8 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -194,7 +194,7 @@ The documentation is organized into five sections: 1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. +1. **[PatchTST](model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c230fc10622095..14ff3b2aadeb36 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -520,9 +520,8 @@ def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None """ all_hidden_states = [] for mod in self.layers: - if output_hidden_states: - src = mod(src) - all_hidden_states.append(src) + src = mod(src) + all_hidden_states.append(src) if output_hidden_states: return src, all_hidden_states return src, None @@ -1289,7 +1288,6 @@ def __init__(self, config: PatchTSTConfig): self.revin = RevIN() else: self.revin = nn.Identity() - config.pooling = None # Initialize weights and apply final processing self.post_init() From bd2a1c542e8aefff85370c07da4e37ab096133de Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Sun, 10 Sep 2023 21:49:58 -0400 Subject: [PATCH 042/189] Update integration test to use PatchTSTForForecasting --- docs/source/en/model_doc/patchtst.md | 2 +- .../models/patchtst/modeling_patchtst.py | 15 ++++++++------- tests/models/patchtst/test_modeling_patchtst.py | 12 ++++++------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 9d08bdd628f0ba..209e50a6b12480 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -29,7 +29,7 @@ Tips: -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +This model was contributed by [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), and [kashif](https://huggingface.co/kashif). The original code can be found [here](https://github.com/yuqinie98/PatchTST). 
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 14ff3b2aadeb36..15a588c158fecf 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -23,10 +23,10 @@ from torch import nn from torch.nn.modules.activation import MultiheadAttention -from transformers.modeling_outputs import BaseModelOutputWithNoAttention -from transformers.modeling_utils import PreTrainedModel -from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig -from transformers.utils import ModelOutput, add_start_docstrings, logging +from ...modeling_outputs import BaseModelOutputWithNoAttention +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, add_start_docstrings, logging +from .configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) @@ -519,10 +519,11 @@ def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] - for mod in self.layers: - src = mod(src) - all_hidden_states.append(src) + if output_hidden_states: + for mod in self.layers: + src = mod(src) + all_hidden_states.append(src) return src, all_hidden_states return src, None diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index f3a045ef756de4..32d9e5fdcedde8 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -298,7 +298,7 @@ def test_retain_grad_hidden_states_attentions(self): # Note: Publishing of this dataset is under internal review. The dataset is not yet downloadable. -def prepare_batch(repo_id="ibm/etth1", file="train-batch.pt"): +def prepare_batch(repo_id="ibm/etth-forecast-dev", file="train-batch.pt"): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch @@ -310,7 +310,7 @@ def prepare_batch(repo_id="ibm/etth1", file="train-batch.pt"): class PatchTSTModelIntegrationTests(unittest.TestCase): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst_pretrained_etth1").to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst-etth-pretrain-dev").to(torch_device) batch = prepare_batch() torch.manual_seed(0) @@ -323,13 +323,13 @@ def test_pretrain_head(self): self.assertEqual(output.shape, expected_shape) expected_slice = torch.tensor( - [[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], [[0.0246]], [[0.0090]]], device=torch_device + [[[0.0100]], [[0.0242]], [[0.0128]], [[0.0125]], [[-0.0160]], [[0.0395]], [[0.0135]]], device=torch_device ) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
def test_prediction_head(self): - model = PatchTSTForPrediction.from_pretrained("ibm/patchtst_prediction_etth1").to(torch_device) + model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth-forecasting-dev").to(torch_device) batch = prepare_batch(file="test-batch.pt") @@ -338,12 +338,12 @@ def test_prediction_head(self): output = model( past_values=batch["past_values"].to(torch_device), future_values=batch["future_values"].to(torch_device), - ).prediction_output + ).forecast_outputs expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) self.assertEqual(output.shape, expected_shape) expected_slice = torch.tensor( - [[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + [[0.2781, 0.4699, 0.4292, 0.4278, -0.2669, 0.4660, -0.8898]], device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From 76416156b5478e6cd397b686e19319ae5a344eb2 Mon Sep 17 00:00:00 2001 From: diepi Date: Mon, 11 Sep 2023 11:22:07 +0200 Subject: [PATCH 043/189] Adding dataclass decorator for model output classes --- src/transformers/models/patchtst/modeling_patchtst.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 15a588c158fecf..98fa57a103f896 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -17,6 +17,7 @@ import math import random from typing import Optional, Tuple +from dataclasses import dataclass import numpy as np import torch @@ -796,6 +797,7 @@ def forward( """ +@dataclass @add_start_docstrings( "The bare PatchTST Model outputting raw hidden-states without any specific head.", PATCHTST_START_DOCSTRING, @@ -817,7 +819,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): Bool masked tensor indicating which patches are masked revin_mean: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length - revin_std: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + revin_stdev: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) std of the input data (batch_size, sequence_length, num_channels) over the sequence_length """ @@ -826,7 +828,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): patched_input: torch.FloatTensor = None mask: torch.FloatTensor = None revin_mean: torch.FloatTensor = None - revin_std: torch.FloatTensor = None + revin_stdev: torch.FloatTensor = None class RevIN(nn.Module): @@ -966,6 +968,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x +@dataclass class PatchTSTOutput(ModelOutput): """ Output type of [`PatchTSTForPredictiontion`]. @@ -1091,6 +1094,7 @@ def forward(self, x): return y +@dataclass class PatchTSTForClassificationOutput(ModelOutput): """ Output type of [`PatchTSTForClassification`]. @@ -1189,6 +1193,7 @@ def forward( return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) +@dataclass class PatchTSTForForecastingOutput(ModelOutput): """ Output type of [`PatchTSTForPredictiontion`]. 
From a14053f4c25c6e9b3d265cf6f406c21fd46121e0 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 12 Sep 2023 11:30:29 -0400 Subject: [PATCH 044/189] Run fixup script --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 98fa57a103f896..87582e874dc563 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -16,8 +16,8 @@ import math import random -from typing import Optional, Tuple from dataclasses import dataclass +from typing import Optional, Tuple import numpy as np import torch From 2b704b450a7628e1abab15e8774aedfe9932858c Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 13 Sep 2023 13:59:04 -0400 Subject: [PATCH 045/189] Rename model repos for integration test --- tests/models/patchtst/test_modeling_patchtst.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 32d9e5fdcedde8..83c457d9c43fd1 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -298,7 +298,7 @@ def test_retain_grad_hidden_states_attentions(self): # Note: Publishing of this dataset is under internal review. The dataset is not yet downloadable. -def prepare_batch(repo_id="ibm/etth-forecast-dev", file="train-batch.pt"): +def prepare_batch(repo_id="ibm/etth1-forecast-test", file="train-batch.pt"): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch @@ -310,7 +310,7 @@ def prepare_batch(repo_id="ibm/etth-forecast-dev", file="train-batch.pt"): class PatchTSTModelIntegrationTests(unittest.TestCase): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst-etth-pretrain-dev").to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst-etth1-pretrain").to(torch_device) batch = prepare_batch() torch.manual_seed(0) @@ -329,7 +329,7 @@ def test_pretrain_head(self): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_prediction_head(self): - model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth-forecasting-dev").to(torch_device) + model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) batch = prepare_batch(file="test-batch.pt") From d46e0c8bb1d4cea7ebf0ce32a2567cdd1ed8cf5b Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 13 Sep 2023 16:19:23 -0400 Subject: [PATCH 046/189] edit argument explanation --- .../models/patchtst/configuration_patchtst.py | 136 +++++++++++------- 1 file changed, 82 insertions(+), 54 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 71efa3e480f6b6..11b320ef85e866 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -39,63 +39,89 @@ class PatchTSTConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - prediction_length (`int`): - The prediction length for the decoder. 
In other words, the prediction horizon of the model. This value is - typically dictated by the dataset and we recommend to set it appropriately. - context_length (`int`, *optional*, defaults to `prediction_length`): - The context length for the encoder. If `None`, the context length will be the same as the - `prediction_length`. num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. - num_time_features (`int`, *optional*, defaults to 0): - The number of time features in the input time series. - num_dynamic_real_features (`int`, *optional*, defaults to 0): - The number of dynamic real valued features. - num_static_categorical_features (`int`, *optional*, defaults to 0): - The number of static categorical features. - num_static_real_features (`int`, *optional*, defaults to 0): - The number of static real valued features. - embedding_dimension (`list[int]`, *optional*): - The dimension of the embedding for each of the static categorical features. Should be a list of integers, - having the same length as `num_static_categorical_features`. Cannot be `None` if - `num_static_categorical_features` is > 0. - d_model (`int`, *optional*, defaults to 64): - Dimensionality of the transformer layers. + context_length (`int`, defaults to 32): + The context length for the encoder. + + patch_length (`int`, *optional*, defaults to 1): + Define the patch length of the patchification process. Default to 1 + stride (`int`, *optional*, defaults to 1): + define the stride of the patchification process. Default to 1 + encoder_layers (`int`, *optional*, defaults to 2): Number of encoder layers. - decoder_layers (`int`, *optional*, defaults to 2): - Number of decoder layers. - encoder_attention_heads (`int`, *optional*, defaults to 2): + d_model (`int`, *optional*, defaults to 64): + Dimensionality of the transformer layers. + encoder_attention_heads (`int`, *optional*, defaults to 4): Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (`int`, *optional*, defaults to 2): - Number of attention heads for each attention layer in the Transformer decoder. - encoder_ffn_dim (`int`, *optional*, defaults to 32): + shared_embedding (`bool`, *optional*, defaults to True): + Sharing the input embedding across all channels. + channel_attention (`bool`, *optional*, defaults to False): + Activate channel attention block in the Transformer to allow channels to attend each other. + encoder_ffn_dim (`int`, *optional*, defaults to 256): Dimension of the "intermediate" (often named feed-forward) layer in encoder. - decoder_ffn_dim (`int`, *optional*, defaults to 32): - Dimension of the "intermediate" (often named feed-forward) layer in decoder. - activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and - `"relu"` are supported. - dropout (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the encoder, and decoder. - encoder_layerdrop (`float`, *optional*, defaults to 0.1): - The dropout probability for the attention and fully connected layers for each encoder layer. - decoder_layerdrop (`float`, *optional*, defaults to 0.1): - The dropout probability for the attention and fully connected layers for each decoder layer. 
- attention_dropout (`float`, *optional*, defaults to 0.1): + norm (`str` , *optional*, defaults to `"BatchNorm"`): + Normalization at each Transformer layer. Can be `"BatchNorm"` or `"LayerNorm"`. + attention_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for the attention probabilities. - activation_dropout (`float`, *optional*, defaults to 0.1): + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the encoder, and decoder. + positional_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability in the positional embedding layer. + dropout_path (`float`, *optional*, defaults to 0.0): + The dropout path in the residual block. + ff_dropout (`float`, *optional*, defaults to 0.0): The dropout probability used between the two layers of the feed-forward networks. - num_parallel_samples (`int`, *optional*, defaults to 100): - The number of samples to generate in parallel for each time step of inference. + bias (`bool`, *optional*, defaults to True): + Consider bias in the feed-forward networks. + activation_function (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. + positional_encoding (`str`, *optional*, defaults to `"sincos"`): + Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. + learn_pe (`bool`, *optional*, defaults to False): + Whether the positional encoding is updated during training. + use_cls_token (`bool`, *optional*, defaults to False): + Whether cls token is used. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. - use_cache (`bool`, *optional*, defaults to `True`): - Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. - distil (`bool`, *optional*, defaults to `True`): - Whether to use distilling in encoder. - + shared_projection (`bool`, *optional*, defaults to True): + Sharing the projection layer across different channels in the forecast head. + seed_number (`int`, *optional*, defaults to None): + Use seed number for random masking. + revin (`bool`, *optional*, defaults to True): + Apply reverse instance normalization on each input batch. + + mask_input (`bool`, *optional*, defaults to False): + Apply masking during the pretraining. + mask_type (`str`, *optional*, defaults to `"random"`): + Masking type. Only `"random"` is currently supported. + mask_ratio (`float`, *optional*, defaults to 0.5): + Masking ratio is applied to mask the input data during pretraining. + channel_consistent_masking (`bool`, *optional*, defaults to False): + If channel consistent masking is True, all the channels will have the same masking. + unmasked_channel_indices (`list`, *optional*, defaults to None): + Channels are not masked during pretraining. + mask_value (`int`, *optional*, defaults to 0): + Mask value to set. + + pooling (`str`, *optional*, defaults to `"mean"`): + Pooling in the latent representation. `"mean"`, `"max"` and None are supported. + num_classes (`int`, *optional*, defaults to 1): + Number of classes is defined for classification task. + head_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for head. + prediction_length (`int`): + The prediction length for the encoder. In other words, the prediction horizon of the model. + prediction_length (`int`): + The prediction length for the encoder. 
In other words, the prediction horizon of the model. + num_output_channels (`int`, *optional*, defaults to 1): + Number of output channels. + prediction_range (`list`, *optional*, defaults to None): + The range of prediction values can be set to enforce the model to produce values within a range. + + Example: ```python @@ -122,12 +148,12 @@ def __init__( num_input_channels: int = 1, context_length: int = 32, # PatchTST arguments - patch_length: int = 8, - stride: int = 8, + patch_length: int = 1, + stride: int = 1, # Transformer architecture configuration encoder_layers: int = 3, - d_model: int = 128, - encoder_attention_heads: int = 16, + d_model: int = 64, + encoder_attention_heads: int = 4, shared_embedding: bool = True, channel_attention: bool = False, encoder_ffn_dim: int = 256, @@ -144,13 +170,13 @@ def __init__( learn_pe: bool = False, use_cls_token: bool = False, init_std: float = 0.02, - individual: bool = False, + shared_projection: bool = True, seed_number: int = None, revin: Optional[bool] = True, # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", - mask_ratio=0.5, + mask_ratio: float = 0.5, mask_patches: List[int] = [2, 3], mask_patch_ratios: List[int] = [1, 1], channel_consistent_masking: bool = False, @@ -161,8 +187,8 @@ def __init__( num_classes: int = 1, head_dropout: float = 0.0, prediction_length: int = 24, - prediction_range: List = [0, 1], num_output_channels: int = 1, + prediction_range: List = None, **kwargs, ): # time series specific configuration @@ -208,10 +234,12 @@ def __init__( self.mask_value = mask_value # general head params - self.individual = individual self.pooling = pooling self.head_dropout = head_dropout + # Forecast head + self.shared_projection = shared_projection + # Classification self.num_classes = num_classes From 5c240ddcecf64ac4bd4731e455ca4095b4b91a63 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 13 Sep 2023 16:20:42 -0400 Subject: [PATCH 047/189] change individual option to shared_projection --- .../models/patchtst/modeling_patchtst.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 87582e874dc563..74d66f7d81638b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -682,6 +682,7 @@ def __init__(self, config: PatchTSTConfig): self.w_p.append(nn.Linear(config.patch_length, config.d_model)) else: self.w_p = nn.Linear(config.patch_length, config.d_model) + # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) @@ -694,7 +695,7 @@ def __init__(self, config: PatchTSTConfig): ) # Positional dropout - self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() + self.positional_dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() # Encoder self.encoder = ChannelAttentionTSTEncoder(config) @@ -731,13 +732,13 @@ def forward( past_values = self.w_p(past_values) # x: [bs x nvars x num_patches x d_model] if self.use_cls_token: - past_values = self.dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patches x d_model] + past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patches x d_model] # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens 
= cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patches+1) x d_model] else: - past_values = self.dropout(past_values + self.w_pos) # x: [bs x nvars x num_patches x d_model] + past_values = self.positional_dropout(past_values + self.w_pos) # x: [bs x nvars x num_patches x d_model] # Encoder past_values, hidden_states = self.encoder( @@ -1228,13 +1229,13 @@ class ForecastHead(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.individual = config.individual + self.shared_projection = config.shared_projection self.num_input_channels = config.num_input_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling head_dim = config.d_model if self.pooling else config.d_model * config.num_patches - if self.individual: + if not self.shared_projection: self.linears = nn.ModuleList() self.dropouts = nn.ModuleList() self.flattens = nn.ModuleList() @@ -1264,7 +1265,7 @@ def forward(self, x: torch.Tensor): else: y = x # y: [bs x nvars x num_patches x d_model] - if self.individual: + if not self.shared_projection: x_out = [] for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] From 2916ec09de53f308b9d447ede630bd46dda65566 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 14 Sep 2023 19:51:09 +0200 Subject: [PATCH 048/189] style --- .../models/patchtst/configuration_patchtst.py | 36 +++++++++---------- .../models/patchtst/modeling_patchtst.py | 7 ++-- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 11b320ef85e866..c044692824d95a 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -43,17 +43,15 @@ class PatchTSTConfig(PretrainedConfig): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. context_length (`int`, defaults to 32): - The context length for the encoder. - + The context length for the encoder. patch_length (`int`, *optional*, defaults to 1): - Define the patch length of the patchification process. Default to 1 + Define the patch length of the patchification process. Default to 1. stride (`int`, *optional*, defaults to 1): - define the stride of the patchification process. Default to 1 - + define the stride of the patchification process. Default to 1. encoder_layers (`int`, *optional*, defaults to 2): Number of encoder layers. d_model (`int`, *optional*, defaults to 64): - Dimensionality of the transformer layers. + Dimensionality of the transformer layers. encoder_attention_heads (`int`, *optional*, defaults to 4): Number of attention heads for each attention layer in the Transformer encoder. shared_embedding (`bool`, *optional*, defaults to True): @@ -69,9 +67,9 @@ class PatchTSTConfig(PretrainedConfig): dropout (`float`, *optional*, defaults to 0.0): The dropout probability for all fully connected layers in the encoder, and decoder. positional_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability in the positional embedding layer. + The dropout probability in the positional embedding layer. dropout_path (`float`, *optional*, defaults to 0.0): - The dropout path in the residual block. 
+ The dropout path in the residual block. ff_dropout (`float`, *optional*, defaults to 0.0): The dropout probability used between the two layers of the feed-forward networks. bias (`bool`, *optional*, defaults to True): @@ -79,49 +77,47 @@ class PatchTSTConfig(PretrainedConfig): activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. positional_encoding (`str`, *optional*, defaults to `"sincos"`): - Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. + Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. learn_pe (`bool`, *optional*, defaults to False): - Whether the positional encoding is updated during training. + Whether the positional encoding is updated during training. use_cls_token (`bool`, *optional*, defaults to False): Whether cls token is used. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. shared_projection (`bool`, *optional*, defaults to True): - Sharing the projection layer across different channels in the forecast head. + Sharing the projection layer across different channels in the forecast head. seed_number (`int`, *optional*, defaults to None): Use seed number for random masking. revin (`bool`, *optional*, defaults to True): Apply reverse instance normalization on each input batch. - mask_input (`bool`, *optional*, defaults to False): Apply masking during the pretraining. mask_type (`str`, *optional*, defaults to `"random"`): Masking type. Only `"random"` is currently supported. mask_ratio (`float`, *optional*, defaults to 0.5): - Masking ratio is applied to mask the input data during pretraining. + Masking ratio is applied to mask the input data during pretraining. channel_consistent_masking (`bool`, *optional*, defaults to False): If channel consistent masking is True, all the channels will have the same masking. unmasked_channel_indices (`list`, *optional*, defaults to None): Channels are not masked during pretraining. mask_value (`int`, *optional*, defaults to 0): Mask value to set. - pooling (`str`, *optional*, defaults to `"mean"`): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. num_classes (`int`, *optional*, defaults to 1): Number of classes is defined for classification task. head_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for head. + The dropout probability for head. prediction_length (`int`): - The prediction length for the encoder. In other words, the prediction horizon of the model. + The prediction length for the encoder. In other words, the prediction horizon of the model. prediction_length (`int`): - The prediction length for the encoder. In other words, the prediction horizon of the model. + The prediction length for the encoder. In other words, the prediction horizon of the model. num_output_channels (`int`, *optional*, defaults to 1): Number of output channels. prediction_range (`list`, *optional*, defaults to None): - The range of prediction values can be set to enforce the model to produce values within a range. - - + The range of prediction values can be set to enforce the model to produce values within a range. 
+ + Example: ```python diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 74d66f7d81638b..ce4c7e9be773b3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -695,7 +695,9 @@ def __init__(self, config: PatchTSTConfig): ) # Positional dropout - self.positional_dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() + self.positional_dropout = ( + nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() + ) # Encoder self.encoder = ChannelAttentionTSTEncoder(config) @@ -732,7 +734,8 @@ def forward( past_values = self.w_p(past_values) # x: [bs x nvars x num_patches x d_model] if self.use_cls_token: - past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patches x d_model] + # x: [bs x nvars x num_patches x d_model] + past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples From 208b83c1aef3308d7a66bba2b46f80a4b06f0911 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Thu, 14 Sep 2023 16:13:38 -0400 Subject: [PATCH 049/189] Rename integration test + import cleanup --- src/transformers/models/patchtst/modeling_patchtst.py | 7 +------ tests/models/patchtst/test_modeling_patchtst.py | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ce4c7e9be773b3..a776fc80886fe3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -22,7 +22,6 @@ import numpy as np import torch from torch import nn -from torch.nn.modules.activation import MultiheadAttention from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel @@ -35,7 +34,7 @@ _CONFIG_FOR_DOC = "PatchTSTConfig" PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "ibm/patchtst-base", + "ibm/patchtst-etth1-pretrain", # See all PatchTST models at https://huggingface.co/models?filter=patchtst ] @@ -654,10 +653,6 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=self.config.init_std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, MultiheadAttention): - module.in_proj_weight.data.normal_(mean=0.0, std=self.config.init_std) - module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) - module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 83c457d9c43fd1..fb8767390b3b33 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -328,7 +328,7 @@ def test_pretrain_head(self): self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
- def test_prediction_head(self): + def test_forecast_head(self): model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) batch = prepare_batch(file="test-batch.pt") From ba7290719322c256047fdd0b16a1c943aff2bc98 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Thu, 14 Sep 2023 18:11:15 -0400 Subject: [PATCH 050/189] Fix outpu_hidden_states return value --- src/transformers/models/patchtst/modeling_patchtst.py | 11 ++++++----- tests/models/patchtst/test_modeling_patchtst.py | 5 +++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a776fc80886fe3..24e42977ce2161 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -520,12 +520,13 @@ def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None """ all_hidden_states = [] - if output_hidden_states: - for mod in self.layers: - src = mod(src) + for mod in self.layers: + src = mod(src) + if output_hidden_states: all_hidden_states.append(src) - return src, all_hidden_states - return src, None + if output_hidden_states is None: + return src, None + return src, all_hidden_states class ChannelAttentionTSTEncoderLayer(nn.Module): diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index fb8767390b3b33..4f3cb2f1f465bc 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -323,7 +323,8 @@ def test_pretrain_head(self): self.assertEqual(output.shape, expected_shape) expected_slice = torch.tensor( - [[[0.0100]], [[0.0242]], [[0.0128]], [[0.0125]], [[-0.0160]], [[0.0395]], [[0.0135]]], device=torch_device + [[[-0.5409]], [[0.3093]], [[-0.3759]], [[0.5068]], [[-0.8387]], [[0.0937]], [[0.2809]]], + device=torch_device, ) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) @@ -343,7 +344,7 @@ def test_forecast_head(self): self.assertEqual(output.shape, expected_shape) expected_slice = torch.tensor( - [[0.2781, 0.4699, 0.4292, 0.4278, -0.2669, 0.4660, -0.8898]], + [[0.3228, 0.4320, 0.4591, 0.4066, -0.3461, 0.3094, -0.8426]], device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From eb96b0266346995fed79a4d501a30e8802b86b0f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 15 Sep 2023 13:35:25 +0200 Subject: [PATCH 051/189] removed unused mode --- src/transformers/models/patchtst/modeling_patchtst.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 24e42977ce2161..5bf7e64befab38 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -856,8 +856,6 @@ def forward(self, x, mode: str): x = self._normalize(x) elif mode == "denorm": x = self._denormalize(x) - elif mode == "transform": - x = self._normalize(x) else: raise NotImplementedError return x From 474e981e217cafa6e14a37be019cfc46f6a0b491 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 19 Sep 2023 10:07:21 +0200 Subject: [PATCH 052/189] added std, mean and nops scaler --- .../models/patchtst/configuration_patchtst.py | 11 +- .../models/patchtst/modeling_patchtst.py | 203 ++++++++++++------ 2 files changed, 141 insertions(+), 73 deletions(-) 
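The diff that follows replaces the RevIN module with scalers copied from the time series transformer model: a std scaler (subtract the per-series mean and divide by the standard deviation over the sequence dimension), a mean scaler (divide by the mean absolute value, with the location fixed at zero) and a no-op scaler, each driven by an observed-value mask. A compressed, function-level sketch of the first two is below; the real `PatchTSTStdScaler`/`PatchTSTMeanScaler` classes additionally handle `keepdim`, `default_scale` and minimum-scale corner cases, so the helper names and defaults here are illustrative assumptions only.

```python
# Simplified sketch of mask-aware mean/std scaling over the time dimension.
import torch


def std_scale(data: torch.Tensor, observed: torch.Tensor, dim: int = 1, eps: float = 1e-5):
    """Standardise along `dim` using only the observed positions."""
    denom = observed.sum(dim, keepdim=True).clamp_min(1.0)
    loc = (data * observed).sum(dim, keepdim=True) / denom
    var = (((data - loc) * observed) ** 2).sum(dim, keepdim=True) / denom
    scale = torch.sqrt(var + eps)
    return (data - loc) / scale, loc, scale


def mean_scale(data: torch.Tensor, observed: torch.Tensor, dim: int = 1, eps: float = 1e-10):
    """Scale by the mean absolute value of the observed positions (loc stays zero)."""
    ts_sum = (data * observed).abs().sum(dim, keepdim=True)
    num_observed = observed.sum(dim, keepdim=True).clamp_min(1.0)
    scale = (ts_sum / num_observed).clamp_min(eps)
    return data / scale, torch.zeros_like(scale), scale


if __name__ == "__main__":
    past_values = torch.randn(4, 32, 3)      # [batch, sequence_length, num_channels]
    observed = torch.ones_like(past_values)  # 1 where a value is observed
    scaled, loc, scale = std_scale(past_values, observed)
    print(scaled.shape, loc.shape, scale.shape)
```

Switching to these scalers lets padded or missing time steps be excluded via `past_observed_mask`, and, as the `# Copied from transformers.models.time_series_transformer...` markers in the diff indicate, aligns PatchTST with the scaler implementations shared by the library's other time series models.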
diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index c044692824d95a..ea4d7382deb187 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -14,7 +14,7 @@ # limitations under the License. """PatchTST model configuration""" -from typing import List, Optional +from typing import List, Optional, Union from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -88,8 +88,9 @@ class PatchTSTConfig(PretrainedConfig): Sharing the projection layer across different channels in the forecast head. seed_number (`int`, *optional*, defaults to None): Use seed number for random masking. - revin (`bool`, *optional*, defaults to True): - Apply reverse instance normalization on each input batch. + scaling (`string` or `bool`, *optional* defaults to `"mean"`): + Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the + scaler is set to "mean". mask_input (`bool`, *optional*, defaults to False): Apply masking during the pretraining. mask_type (`str`, *optional*, defaults to `"random"`): @@ -168,7 +169,7 @@ def __init__( init_std: float = 0.02, shared_projection: bool = True, seed_number: int = None, - revin: Optional[bool] = True, + scaling: Optional[Union[str, bool]] = "mean", # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", @@ -211,7 +212,7 @@ def __init__( self.learn_pe = learn_pe self.use_cls_token = use_cls_token self.init_std = init_std - self.revin = revin + self.scaling = scaling # PatchTST self.patch_length = patch_length diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5bf7e64befab38..86e49ddc88befc 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -817,9 +817,9 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) Bool masked tensor indicating which patches are masked - revin_mean: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length - revin_stdev: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) std of the input data (batch_size, sequence_length, num_channels) over the sequence_length """ @@ -827,70 +827,137 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): hidden_states: Optional[Tuple[torch.FloatTensor]] = None patched_input: torch.FloatTensor = None mask: torch.FloatTensor = None - revin_mean: torch.FloatTensor = None - revin_stdev: torch.FloatTensor = None + loc: torch.FloatTensor = None + scale: torch.FloatTensor = None -class RevIN(nn.Module): - def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): - """ - :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x - num_input_channels] :denorm_channels if the denorm input shape has less number of channels, mention the - channels in the denorm input here. 
- """ - super(RevIN, self).__init__() - self.stdev = None - self.mean = None - self.start_dim = start_dim - self.denorm_channels = denorm_channels - self.eps = eps - - def set_statistics(self, mean, stdev): - # get statistics - self.mean = mean - self.stdev = stdev - - def forward(self, x, mode: str): - if mode == "norm": - self._get_statistics(x) - x = self._normalize(x) - elif mode == "denorm": - x = self._denormalize(x) - else: - raise NotImplementedError - return x +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST +class PatchTSTStdScaler(nn.Module): + """ + Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it + by subtracting from the mean and dividing by the standard deviation. - def _get_statistics(self, x): - dim2reduce = tuple(range(self.start_dim, x.ndim - 1)) - self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() - self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() + Args: + dim (`int`): + Dimension along which to calculate the mean and standard deviation. + keepdim (`bool`, *optional*, defaults to `False`): + Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + minimum_scale (`float`, *optional*, defaults to 1e-5): + Default scale that is used for elements that are constantly zero along dimension `dim`. + """ - def _normalize(self, x): - x = x - self.mean - x = x / self.stdev - return x + def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + super().__init__() + if not dim > 0: + raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") + self.dim = dim + self.keepdim = keepdim + self.minimum_scale = minimum_scale + + @torch.no_grad() + def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + denominator = weights.sum(self.dim, keepdim=self.keepdim) + denominator = denominator.clamp_min(1.0) + loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + + variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + scale = torch.sqrt(variance + self.minimum_scale) + return (data - loc) / scale, loc, scale + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->PatchTST +class PatchTSTMeanScaler(nn.Module): + """ + Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + accordingly. + + Args: + dim (`int`): + Dimension along which to compute the scale. + keepdim (`bool`, *optional*, defaults to `False`): + Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + default_scale (`float`, *optional*, defaults to `None`): + Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. + minimum_scale (`float`, *optional*, defaults to 1e-10): + Default minimum possible scale that is used for any item. 
+ """ + + def __init__( + self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 + ): + super().__init__() + self.dim = dim + self.keepdim = keepdim + self.minimum_scale = minimum_scale + self.default_scale = default_scale - def _denormalize(self, x): - # denormalize the data - if self.denorm_channels is None: - x = x * self.stdev - x = x + self.mean + @torch.no_grad() + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # shape: (N, [C], T=1) + ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) + num_observed = observed_indicator.sum(self.dim, keepdim=True) + + scale = ts_sum / torch.clamp(num_observed, min=1) + + # If `default_scale` is provided, we use it, otherwise we use the scale + # of the batch. + if self.default_scale is None: + batch_sum = ts_sum.sum(dim=0) + batch_observations = torch.clamp(num_observed.sum(0), min=1) + default_scale = torch.squeeze(batch_sum / batch_observations) else: - x = x * self.stdev[..., self.denorm_channels] - x = x + self.mean[..., self.denorm_channels] + default_scale = self.default_scale * torch.ones_like(scale) - return x + # apply default scale where there are no observations + scale = torch.where(num_observed > 0, scale, default_scale) + + # ensure the scale is at least `self.minimum_scale` + scale = torch.clamp(scale, min=self.minimum_scale) + scaled_data = data / scale + + if not self.keepdim: + scale = scale.squeeze(dim=self.dim) + + return scaled_data, torch.zeros_like(scale), scale + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->PatchTST +class PatchTSTNOPScaler(nn.Module): + """ + Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. + + Args: + dim (`int`): + Dimension along which to compute the scale. + keepdim (`bool`, *optional*, defaults to `False`): + Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. 
+ """ + + def __init__(self, dim: int, keepdim: bool = False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + return data, loc, scale class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.use_revin = config.revin - if self.use_revin: - self.revin = RevIN() + if config.scaling == "mean" or config.scaling is True: + self.scaler = PatchTSTMeanScaler(dim=1, keepdim=True) + elif config.scaling == "std": + self.scaler = PatchTSTStdScaler(dim=1, keepdim=True) else: - self.revin = nn.Identity() + self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) self.patching = Patchify( config.context_length, @@ -920,6 +987,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ): @@ -927,11 +995,14 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] + if past_observed_mask is None: + past_observed_mask = torch.ones_like(past_values) - patched_values = self.patching( - past_values - ) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain + # x: tensor [bs x seq_len x in_channels] + scaled_past_values, loc, scale = self.scaler(past_values, past_observed_mask) + + # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain + patched_values = self.patching(scaled_past_values) if self.mask_input: masked_values, mask = self.masking(patched_values) else: @@ -942,8 +1013,8 @@ def forward( hidden_states=encoder_output.hidden_states, patched_input=patched_values, mask=mask, - revin_mean=self.revin.mean if self.use_revin else None, - revin_stdev=self.revin.stdev if self.use_revin else None, + loc=loc, + scale=scale, ) @@ -1287,11 +1358,6 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction="mean") - self.use_revin = config.revin - if self.use_revin: - self.revin = RevIN() - else: - self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1300,18 +1366,19 @@ def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor], + past_observed_mask: Optional[torch.Tensor] = None, + future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model(past_values, output_hidden_states=output_hidden_states) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) y_hat = self.head(model_output.last_hidden_state) - - if self.use_revin: - self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) - y_hat = self.revin(y_hat, mode="denorm") + y_hat = y_hat * model_output.scale + model_output.loc loss_val = None if 
future_values is not None: From 46e89d6b3e0b4caa570b10dc0ce0791fcd9e59b2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 20 Sep 2023 13:23:01 +0200 Subject: [PATCH 053/189] add initial distributional loss for predition --- .../models/patchtst/configuration_patchtst.py | 9 ++++ .../models/patchtst/modeling_patchtst.py | 53 ++++++++++++++++--- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index ea4d7382deb187..3b306268785f74 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -44,6 +44,11 @@ class PatchTSTConfig(PretrainedConfig): multivariate targets. context_length (`int`, defaults to 32): The context length for the encoder. + distribution_output (`string`, *optional*, defaults to `"student_t"`): + The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or "negative_binomial". + loss (`string`, *optional*, defaults to `"mse"`): + The loss function for the model corresponding to the `distribution_output` head. For parametric + distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". patch_length (`int`, *optional*, defaults to 1): Define the patch length of the patchification process. Default to 1. stride (`int`, *optional*, defaults to 1): @@ -144,6 +149,8 @@ def __init__( # time series specific configuration num_input_channels: int = 1, context_length: int = 32, + distribution_output: str = "student_t", + loss: str = "mse", # PatchTST arguments patch_length: int = 1, stride: int = 1, @@ -191,6 +198,8 @@ def __init__( # time series specific configuration self.context_length = context_length self.num_input_channels = num_input_channels # n_vars + self.loss = loss + self.distribution_output = distribution_output # Transformer architecture configuration self.d_model = d_model diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 86e49ddc88befc..c6397ecbc2c57a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -25,6 +25,7 @@ from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel +from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import ModelOutput, add_start_docstrings, logging from .configuration_patchtst import PatchTSTConfig @@ -831,6 +832,14 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): scale: torch.FloatTensor = None +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll +def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor: + """ + Computes the negative log likelihood loss from input distribution with respect to target. 
+ """ + return -input.log_prob(target) + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ @@ -1194,17 +1203,23 @@ class PatchTSTForClassificationOutput(ModelOutput): class PredictionHead(nn.Module): - def __init__(self, config: PatchTSTConfig): + def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.num_output_channels = config.num_output_channels + self.dist_output_size = config.num_output_channels * config.d_model // config.encoder_attention_heads self.use_cls_token = config.use_cls_token self.pooling = config.pooling head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) - self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + if distribution_output is None: + self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + self.args_proj = None + else: + self.linear = nn.Linearr(head_dim, config.prediction_length * self.dist_output_size) + self.args_proj = distribution_output.get_parameter_projection(self.dist_output_size) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x): @@ -1226,9 +1241,13 @@ def forward(self, x): # flatten the input x = self.flatten(x) # x: bs x (nvars * d_model) y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) + if self.args_proj is None: + # reshape the data + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + else: + # reshape and project prarameters of distribution + y = self.args_proj(y.reshape(batch_size, -1, self.dist_output_size)) - # reshape the data - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y @@ -1238,8 +1257,21 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) - self.head = PredictionHead(config) - self.loss = nn.MSELoss(reduction="mean") + if config.loss == "mse": + self.loss = nn.MSELoss(reduction="mean") + self.distribution_output = None + else: + self.loss = nll + if config.distribution_output == "student_t": + self.distribution_output = StudentTOutput(dim=config.num_output_channels) + elif config.distribution_output == "normal": + self.distribution_output = NormalOutput(dim=config.num_output_channels) + elif config.distribution_output == "negative_binomial": + self.distribution_output = NegativeBinomialOutput(dim=config.num_output_channels) + else: + raise ValueError(f"Unknown distribution output {config.distribution_output}") + + self.head = PredictionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1255,10 +1287,15 @@ def forward( ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) - loss_val = None if future_values is not None: - loss_val = self.loss(y_hat, future_values) + if self.distribution_output: + distribution = self.distribution_output.distribution( + y_hat, loc=model_output.loc, scale=model_output.scale + ) + loss_val = self.loss(distribution, future_values) + else: + loss_val = self.loss(y_hat * model_output.scale + model_output.loc, future_values) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) From 
48b8621c0419e9c1009501f01451de5e3e00af47 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 25 Sep 2023 12:06:18 +0200 Subject: [PATCH 054/189] fix typo in docs --- src/transformers/models/patchtst/modeling_patchtst.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c6397ecbc2c57a..6d93d7177242c2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1049,11 +1049,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @dataclass class PatchTSTOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForPrediction`]. Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. + MSE loss or nll loss. prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): @@ -1302,7 +1302,7 @@ def forward( @dataclass class PatchTSTForForecastingOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForForecasting`]. Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): From e5cf09d6270289544618a05e68e76ee780680c1f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 25 Sep 2023 12:12:27 +0200 Subject: [PATCH 055/189] add generate function --- .../models/patchtst/modeling_patchtst.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6d93d7177242c2..9019a2beea0854 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -23,7 +23,7 @@ import torch from torch import nn -from ...modeling_outputs import BaseModelOutputWithNoAttention +from ...modeling_outputs import BaseModelOutputWithNoAttention, SampleTSPredictionOutput from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import ModelOutput, add_start_docstrings, logging @@ -1297,6 +1297,25 @@ def forward( else: loss_val = self.loss(y_hat * model_output.scale + model_output.loc, future_values) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + + @torch.no_grad() + def generate(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> SampleTSPredictionOutput: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + model_output = self.model(past_values, output_hidden_states=output_hidden_states) + y_hat = self.head(model_output.last_hidden_state) + if self.distribution_output: + distribution = self.distribution_output.distribution( + y_hat, loc=model_output.loc, scale=model_output.scale + ) + y_hat = distribution.sample(sample_shape=(self.config.num_parallel_samples,)) + else: + y_hat = y_hat * model_output.scale + model_output.loc + + return SampleTSPredictionOutput(sequences=y_hat) + + @dataclass From 5a7fb303e9fa8c6a7538a2ff904a9c9306620c2b Mon Sep 17 
00:00:00 2001 From: Kashif Rasul Date: Mon, 25 Sep 2023 12:13:13 +0200 Subject: [PATCH 056/189] formatting --- .../models/patchtst/modeling_patchtst.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9019a2beea0854..587626a7e455b1 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1297,25 +1297,23 @@ def forward( else: loss_val = self.loss(y_hat * model_output.scale + model_output.loc, future_values) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) - + @torch.no_grad() - def generate(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> SampleTSPredictionOutput: + def generate( + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + ) -> SampleTSPredictionOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) if self.distribution_output: - distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale - ) + distribution = self.distribution_output.distribution(y_hat, loc=model_output.loc, scale=model_output.scale) y_hat = distribution.sample(sample_shape=(self.config.num_parallel_samples,)) else: y_hat = y_hat * model_output.scale + model_output.loc - - return SampleTSPredictionOutput(sequences=y_hat) - + return SampleTSPredictionOutput(sequences=y_hat) @dataclass From 18a43f56ba4c5b8faa9bf60dd5d3f4db2c229937 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 25 Sep 2023 13:58:30 +0200 Subject: [PATCH 057/189] add num_parallel_samples --- src/transformers/models/patchtst/configuration_patchtst.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 3b306268785f74..81a438bc2703f6 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -122,7 +122,8 @@ class PatchTSTConfig(PretrainedConfig): Number of output channels. prediction_range (`list`, *optional*, defaults to None): The range of prediction values can be set to enforce the model to produce values within a range. - + num_parallel_samples (`int`, *optional*, defaults to 100): + The number of samples to generate in parallel for probablistic forecast. 
Example: @@ -173,6 +174,7 @@ def __init__( positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, + num_parallel_samples: int = 100, init_std: float = 0.02, shared_projection: bool = True, seed_number: int = None, @@ -200,6 +202,7 @@ def __init__( self.num_input_channels = num_input_channels # n_vars self.loss = loss self.distribution_output = distribution_output + self.num_parallel_samples = num_parallel_samples # Transformer architecture configuration self.d_model = d_model From b54047ba7698cd5de7acbe69d3de7d80735876ba Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 27 Sep 2023 23:06:16 -0400 Subject: [PATCH 058/189] Fix a typo --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 587626a7e455b1..0ce3bbebf72db0 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1218,7 +1218,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) self.args_proj = None else: - self.linear = nn.Linearr(head_dim, config.prediction_length * self.dist_output_size) + self.linear = nn.Linear(head_dim, config.prediction_length * self.dist_output_size) self.args_proj = distribution_output.get_parameter_projection(self.dist_output_size) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() From 406cf00e3eddc984efdac629cbf60701a0daafff Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 1 Oct 2023 21:44:12 -0400 Subject: [PATCH 059/189] copy weighted_average function, edit PredictionHead --- .../models/patchtst/modeling_patchtst.py | 98 ++++++++++--------- 1 file changed, 54 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 0ce3bbebf72db0..c5ebdafb103297 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -23,7 +23,7 @@ import torch from torch import nn -from ...modeling_outputs import BaseModelOutputWithNoAttention, SampleTSPredictionOutput +from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import ModelOutput, add_start_docstrings, logging @@ -840,6 +840,31 @@ def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch. return -input.log_prob(target) +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average +def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: + """ + Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero, + meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. + + Args: + input_tensor (`torch.FloatTensor`): + Input tensor, of which the average must be computed. + weights (`torch.FloatTensor`, *optional*): + Weights tensor, of the same shape as `input_tensor`. + dim (`int`, *optional*): + The dim along which to average `input_tensor`. 
+ + Returns: + `torch.FloatTensor`: The tensor with values averaged along the specified `dim`. + """ + if weights is not None: + weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor)) + sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0) + return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights + else: + return input_tensor.mean(dim=dim) + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ @@ -1049,11 +1074,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @dataclass class PatchTSTOutput(ModelOutput): """ - Output type of [`PatchTSTForPrediction`]. + Output type of [`PatchTSTForPredictiontion`]. Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss or nll loss. + MSE loss. prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): @@ -1206,8 +1231,7 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.num_output_channels = config.num_output_channels - self.dist_output_size = config.num_output_channels * config.d_model // config.encoder_attention_heads + self.num_output_channels = config.num_output_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling @@ -1217,9 +1241,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): if distribution_output is None: self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) self.args_proj = None - else: - self.linear = nn.Linear(head_dim, config.prediction_length * self.dist_output_size) - self.args_proj = distribution_output.get_parameter_projection(self.dist_output_size) + else: + self.args_proj = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x): @@ -1238,15 +1261,18 @@ def forward(self, x): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - # flatten the input - x = self.flatten(x) # x: bs x (nvars * d_model) - y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) + # flatten the input + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) if self.args_proj is None: - # reshape the data - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + y = self.linear(x) # y: bs x (pred_len * num_output_channels) + # reshape the data to [bs x pred_len x num_output_channels] + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] else: - # reshape and project prarameters of distribution - y = self.args_proj(y.reshape(batch_size, -1, self.dist_output_size)) + # project prarameters of distribution + y = self.args_proj(x) + # reshape the data to be a tuple of [bs x pred_len x num_output_channels] + y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) return y @@ -1263,11 +1289,11 @@ def __init__(self, config: 
PatchTSTConfig): else: self.loss = nll if config.distribution_output == "student_t": - self.distribution_output = StudentTOutput(dim=config.num_output_channels) + self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "normal": - self.distribution_output = NormalOutput(dim=config.num_output_channels) + self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput(dim=config.num_output_channels) + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1286,40 +1312,24 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) + y_hat = self.head(model_output.last_hidden_state) + loss_val = None - if future_values is not None: - if self.distribution_output: - distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale - ) + if future_values is not None: + if self.distribution_output: + distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, future_values) - else: - loss_val = self.loss(y_hat * model_output.scale + model_output.loc, future_values) + # take average of the loss + loss_val = weighted_average(loss_val) + else: + loss_val = self.loss(y_hat, future_values) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) - @torch.no_grad() - def generate( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None - ) -> SampleTSPredictionOutput: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) - if self.distribution_output: - distribution = self.distribution_output.distribution(y_hat, loc=model_output.loc, scale=model_output.scale) - y_hat = distribution.sample(sample_shape=(self.config.num_parallel_samples,)) - else: - y_hat = y_hat * model_output.scale + model_output.loc - - return SampleTSPredictionOutput(sequences=y_hat) - @dataclass class PatchTSTForForecastingOutput(ModelOutput): """ - Output type of [`PatchTSTForForecasting`]. + Output type of [`PatchTSTForPredictiontion`]. 
Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): From e89477c194f8ff77019e3efd8a2a32cb720e1992 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 1 Oct 2023 21:53:13 -0400 Subject: [PATCH 060/189] edit PredictionHead --- .../models/patchtst/modeling_patchtst.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c5ebdafb103297..04c23a0a67a24a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1238,11 +1238,16 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) + # if distribution_output is None: + # self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + # self.args_proj = None + # else: + # self.args_proj = distribution_output.get_parameter_projection(head_dim) if distribution_output is None: - self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - self.args_proj = None + self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) else: - self.args_proj = distribution_output.get_parameter_projection(head_dim) + self.projection = distribution_output.get_parameter_projection(head_dim) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x): @@ -1263,17 +1268,13 @@ def forward(self, x): # flatten the input x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) - if self.args_proj is None: - y = self.linear(x) # y: bs x (pred_len * num_output_channels) - # reshape the data to [bs x pred_len x num_output_channels] + # projection + y = self.projection(x) + # reshape y + if isinstance(y, tuple): # for distribution head + y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) # tuple of [bs x pred_len x num_output_channels] + else: # for linear head y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] - else: - # project prarameters of distribution - y = self.args_proj(x) - # reshape the data to be a tuple of [bs x pred_len x num_output_channels] - y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) - return y From a0815ee1d9ccb3746561c1124b7aeaf89c77c398 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 2 Oct 2023 14:28:06 -0400 Subject: [PATCH 061/189] add distribution head to forecasting --- .../models/patchtst/modeling_patchtst.py | 96 ++++++++++++++----- 1 file changed, 71 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 04c23a0a67a24a..e695a6224fb9ed 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1238,11 +1238,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) - # if distribution_output is None: - # self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - # self.args_proj = None - # else: - # self.args_proj = 
distribution_output.get_parameter_projection(head_dim) + if distribution_output is None: self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) else: @@ -1324,7 +1320,10 @@ def forward( loss_val = weighted_average(loss_val) else: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) @dataclass @@ -1359,7 +1358,7 @@ class PatchTSTForForecastingOutput(ModelOutput): class ForecastHead(nn.Module): - def __init__(self, config: PatchTSTConfig): + def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.shared_projection = config.shared_projection @@ -1369,16 +1368,32 @@ def __init__(self, config: PatchTSTConfig): head_dim = config.d_model if self.pooling else config.d_model * config.num_patches if not self.shared_projection: - self.linears = nn.ModuleList() + # if each channel has its own head + self.projections = nn.ModuleList() self.dropouts = nn.ModuleList() self.flattens = nn.ModuleList() for i in range(self.num_input_channels): self.flattens.append(nn.Flatten(start_dim=2)) - self.linears.append(nn.Linear(head_dim, config.prediction_length)) + if distribution_output is None: + # use linear head + self.projections.append( + nn.Linear(head_dim, config.prediction_length) + ) + else: + # use distribution head + self.projections.append( + distribution_output.get_parameter_projection(head_dim) + ) self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: + # all the channels share the same head self.flatten = nn.Flatten(start_dim=2) - self.linear = nn.Linear(head_dim, config.prediction_length) + if distribution_output is None: + # use linear head + self.projection = nn.Linear(head_dim, config.prediction_length) + else: + # use distribution head + self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x: torch.Tensor): @@ -1402,16 +1417,19 @@ def forward(self, x: torch.Tensor): x_out = [] for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] - z = self.linears[i](z) # z: [bs x forecast_len] z = self.dropouts[i](z) + z = self.projections[i](z) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) - x = self.linear(z) # x: [bs x nvars x forecast_len] + x = self.projection(z) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head - x = x.transpose(2, 1) # [bs x forecast_len x nvars] + if isinstance(x, tuple): + x = (z.transpose(2,1) for z in x) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + else: + x = x.transpose(2, 1) # [bs x forecast_len x nvars] return x @@ -1421,8 +1439,22 @@ class PatchTSTForForecasting(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) - self.head = ForecastHead(config) - self.loss = nn.MSELoss(reduction="mean") + + if config.loss == "mse": + 
self.loss = nn.MSELoss(reduction="mean") + self.distribution_output = None + else: + self.loss = nll + if config.distribution_output == "student_t": + self.distribution_output = StudentTOutput(dim=config.prediction_length) + elif config.distribution_output == "normal": + self.distribution_output = NormalOutput(dim=config.prediction_length) + elif config.distribution_output == "negative_binomial": + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length) + else: + raise ValueError(f"Unknown distribution output {config.distribution_output}") + + self.head = ForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1438,19 +1470,30 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states - ) + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states + ) y_hat = self.head(model_output.last_hidden_state) - y_hat = y_hat * model_output.scale + model_output.loc - + loss_val = None if future_values is not None: - loss_val = self.loss(y_hat, future_values) - return PatchTSTForForecastingOutput( - loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states - ) + if self.distribution_output: + distribution = self.distribution_output.distribution(y_hat, + loc=model_output.loc, + scale=model_output.scale) + loss_val = self.loss(distribution, future_values) + # take average of the loss + loss_val = weighted_average(loss_val) + else: + y_hat = y_hat * model_output.scale + model_output.loc + loss_val = self.loss(y_hat, future_values) + + return PatchTSTForForecastingOutput(loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states + ) class RegressionHead(nn.Module): @@ -1514,4 +1557,7 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) From d727ef7e69d98cb54cf9810f04b1a7357be7b729 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 4 Oct 2023 09:38:08 +0200 Subject: [PATCH 062/189] formatting --- .../models/patchtst/modeling_patchtst.py | 88 +++++++++---------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e695a6224fb9ed..e988af3992b586 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -863,7 +863,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights else: return input_tensor.mean(dim=dim) - + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): @@ -1231,7 +1231,7 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.num_output_channels = config.num_output_channels + self.num_output_channels = config.num_output_channels 
self.use_cls_token = config.use_cls_token self.pooling = config.pooling @@ -1240,8 +1240,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flatten = nn.Flatten(start_dim=1) if distribution_output is None: - self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - else: + self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + else: self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() @@ -1262,15 +1262,17 @@ def forward(self, x): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # projection + # flatten the input + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection y = self.projection(x) # reshape y - if isinstance(y, tuple): # for distribution head - y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) # tuple of [bs x pred_len x num_output_channels] - else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + if isinstance(y, tuple): # for distribution head + y = ( + z.reshape(batch_size, -1, self.num_output_channels) for z in y + ) # tuple of [bs x pred_len x num_output_channels] + else: # for linear head + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y @@ -1290,7 +1292,9 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = NegativeBinomialOutput( + dim=config.prediction_length * config.num_output_channels + ) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1309,21 +1313,18 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) + y_hat = self.head(model_output.last_hidden_state) loss_val = None - if future_values is not None: - if self.distribution_output: + if future_values is not None: + if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) - else: + else: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) @dataclass @@ -1376,14 +1377,10 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flattens.append(nn.Flatten(start_dim=2)) if distribution_output is None: # use linear head - self.projections.append( - nn.Linear(head_dim, config.prediction_length) - ) + self.projections.append(nn.Linear(head_dim, config.prediction_length)) else: # use distribution head - self.projections.append( - distribution_output.get_parameter_projection(head_dim) - ) + 
self.projections.append(distribution_output.get_parameter_projection(head_dim)) self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: # all the channels share the same head @@ -1418,16 +1415,20 @@ def forward(self, x: torch.Tensor): for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.dropouts[i](z) - z = self.projections[i](z) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + z = self.projections[i]( + z + ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) - x = self.projection(z) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head + x = self.projection( + z + ) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head if isinstance(x, tuple): - x = (z.transpose(2,1) for z in x) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + x = (z.transpose(2, 1) for z in x) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) else: x = x.transpose(2, 1) # [bs x forecast_len x nvars] @@ -1454,7 +1455,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = ForecastHead(config, self.distribution_output) + self.head = ForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1470,19 +1471,18 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states - ) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) y_hat = self.head(model_output.last_hidden_state) - + loss_val = None if future_values is not None: if self.distribution_output: - distribution = self.distribution_output.distribution(y_hat, - loc=model_output.loc, - scale=model_output.scale) + distribution = self.distribution_output.distribution( + y_hat, loc=model_output.loc, scale=model_output.scale + ) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) @@ -1490,10 +1490,9 @@ def forward( y_hat = y_hat * model_output.scale + model_output.loc loss_val = self.loss(y_hat, future_values) - return PatchTSTForForecastingOutput(loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTForForecastingOutput( + loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states + ) class RegressionHead(nn.Module): @@ -1557,7 +1556,4 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) From e391bd3933753abd52fa700dd6f41a56208f51bd Mon Sep 17 00:00:00 2001 From: 
nnguyen Date: Wed, 4 Oct 2023 10:51:33 -0400 Subject: [PATCH 063/189] Add generate function for forecasting --- .../models/patchtst/modeling_patchtst.py | 168 ++++++++++++------ 1 file changed, 116 insertions(+), 52 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e988af3992b586..f4ebfee875c5ae 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -863,7 +863,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights else: return input_tensor.mean(dim=dim) - + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): @@ -1138,7 +1138,10 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=masked_loss, + prediction_output=x_hat, + hidden_states=model_output.hidden_states + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1165,8 +1168,10 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states - ) + loss=loss_val, + prediction_logits=y_hat, + hidden_states=model_output.hidden_states + ) class ClassificationHead(nn.Module): @@ -1231,7 +1236,7 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.num_output_channels = config.num_output_channels + self.num_output_channels = config.num_output_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling @@ -1240,8 +1245,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flatten = nn.Flatten(start_dim=1) if distribution_output is None: - self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - else: + self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + else: self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() @@ -1262,17 +1267,15 @@ def forward(self, x): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # projection + # flatten the input + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection y = self.projection(x) # reshape y - if isinstance(y, tuple): # for distribution head - y = ( - z.reshape(batch_size, -1, self.num_output_channels) for z in y - ) # tuple of [bs x pred_len x num_output_channels] - else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + if isinstance(y, tuple): # for distribution head + y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) # tuple of [bs x pred_len x num_output_channels] + else: # for linear head + y = 
y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y @@ -1292,9 +1295,7 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput( - dim=config.prediction_length * config.num_output_channels - ) + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1313,18 +1314,21 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) + y_hat = self.head(model_output.last_hidden_state) loss_val = None - if future_values is not None: - if self.distribution_output: + if future_values is not None: + if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) - else: + else: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) @dataclass @@ -1356,6 +1360,22 @@ class PatchTSTForForecastingOutput(ModelOutput): forecast_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None + loc: torch.FloatTensor = None + scale: torch.FloatTensor = None + + +@dataclass +class SamplePatchTSTForecastOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Args: + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or + `(batch_size, num_samples, prediction_length, number_channels)`): + Sampled values from the chosen distribution. 
+ """ + sequences: torch.FloatTensor = None class ForecastHead(nn.Module): @@ -1377,10 +1397,14 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flattens.append(nn.Flatten(start_dim=2)) if distribution_output is None: # use linear head - self.projections.append(nn.Linear(head_dim, config.prediction_length)) + self.projections.append( + nn.Linear(head_dim, config.prediction_length) + ) else: # use distribution head - self.projections.append(distribution_output.get_parameter_projection(head_dim)) + self.projections.append( + distribution_output.get_parameter_projection(head_dim) + ) self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: # all the channels share the same head @@ -1415,24 +1439,20 @@ def forward(self, x: torch.Tensor): for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.dropouts[i](z) - z = self.projections[i]( - z - ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + z = self.projections[i](z) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) - x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] + output = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) - x = self.projection( - z - ) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head - - if isinstance(x, tuple): - x = (z.transpose(2, 1) for z in x) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + output = self.projection(z) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head + + if isinstance(output, tuple): + output = tuple(z.transpose(2,1) for z in output) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) else: - x = x.transpose(2, 1) # [bs x forecast_len x nvars] + output = output.transpose(2, 1) # [bs x forecast_len x nvars] - return x + return output class PatchTSTForForecasting(PatchTSTPreTrainedModel): @@ -1455,7 +1475,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = ForecastHead(config, self.distribution_output) + self.head = ForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1463,7 +1483,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor], + future_values: Optional[torch.Tensor] = None, past_observed_mask: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, @@ -1471,28 +1491,69 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states - ) - - y_hat = self.head(model_output.last_hidden_state) + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states + ) + y_hat = self.head(model_output.last_hidden_state) + loss_val = None + if 
future_values is not None: if self.distribution_output: - distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale - ) + distribution = self.distribution_output.distribution(y_hat, + loc=model_output.loc, + scale=model_output.scale) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) + # for testing + # loss_val = nn.MSELoss(reduction='none')(distribution.mean, future_values) + # loss_val = weighted_average(loss_val) else: y_hat = y_hat * model_output.scale + model_output.loc loss_val = self.loss(y_hat, future_values) - return PatchTSTForForecastingOutput( - loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states - ) + return PatchTSTForForecastingOutput(loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states, + loc=model_output.loc, + scale=model_output.scale + ) + + def generate(self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None + ): + """ + Return: + [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of + samples, prediction_length)` or `(batch_size, number of samples, prediction_length, number_channels)` for + multivariate predictions. + """ + # get number of samples + num_parallel_samples = self.config.num_parallel_samples + + # get model output + outputs = self(past_values=past_values, + future_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states + ) + + # get distribution + distribution = self.distribution_output.distribution( + outputs.forecast_outputs, + loc=outputs.loc, + scale=outputs.scale + ) + # get samples + samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x forecast_len x nvars] + # stack tensors + samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] + return samples class RegressionHead(nn.Module): @@ -1556,4 +1617,7 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) From a50f6c23dcaa3a4a8df6fdc9cc53045601cc3173 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 4 Oct 2023 23:08:15 -0400 Subject: [PATCH 064/189] Add generate function to prediction task --- .../models/patchtst/configuration_patchtst.py | 8 +- .../models/patchtst/modeling_patchtst.py | 418 ++++++++++++------ 2 files changed, 284 insertions(+), 142 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 81a438bc2703f6..e55737adcf4dda 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -38,7 +38,7 @@ class PatchTSTConfig(PretrainedConfig): Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: + Parameters: num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. 
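Put together, the forecasting pieces added in the last few patches are intended to be used roughly as follows. This is a sketch against the in-progress API in the diffs above (the class names, the `loss="nll"` switch that enables the distribution head, and the concrete sizes are taken from or assumed for this branch, not from a released version):

```python
import torch
from transformers import PatchTSTConfig
from transformers.models.patchtst.modeling_patchtst import PatchTSTForForecasting

config = PatchTSTConfig(
    num_input_channels=7,
    context_length=512,
    prediction_length=96,
    loss="nll",                      # anything other than "mse" selects the distribution head
    distribution_output="student_t",
    num_parallel_samples=100,
)
model = PatchTSTForForecasting(config)

past_values = torch.randn(4, config.context_length, config.num_input_channels)
future_values = torch.randn(4, config.prediction_length, config.num_input_channels)

# Training step: the forward pass returns the averaged negative log-likelihood.
outputs = model(past_values=past_values, future_values=future_values)
outputs.loss.backward()

# Inference: draw `num_parallel_samples` trajectories from the predicted distribution.
with torch.no_grad():
    samples = model.generate(past_values=past_values).sequences
# samples: [batch_size x num_parallel_samples x prediction_length x num_input_channels]
point_forecast = samples.median(dim=1).values
```

For the MSE configuration (`loss="mse"`) the `forecast_outputs` returned by the forward pass are already a point forecast, and `generate` does not apply because there is no distribution head to sample from.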
@@ -173,8 +173,7 @@ def __init__( pre_norm: bool = False, positional_encoding: str = "sincos", learn_pe: bool = False, - use_cls_token: bool = False, - num_parallel_samples: int = 100, + use_cls_token: bool = False, init_std: float = 0.02, shared_projection: bool = True, seed_number: int = None, @@ -195,6 +194,8 @@ def __init__( prediction_length: int = 24, num_output_channels: int = 1, prediction_range: List = None, + # distribution head + num_parallel_samples: int = 100, **kwargs, ): # time series specific configuration @@ -254,6 +255,7 @@ def __init__( # Forcasting and prediction self.prediction_length = prediction_length + self.num_parallel_samples = num_parallel_samples # Regression self.num_output_channels = num_output_channels diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f4ebfee875c5ae..46bd8fc8936635 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -266,7 +266,7 @@ def random_masking( ): """random_masking: Mask the input considering the control variables. - Args: + Parameters: xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. unmasked_channel_indices (list, optional): @@ -297,8 +297,8 @@ def random_masking( mask[:, :, :len_keep] = 0 # sort noise for each sample - ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove - ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + ids_shuffle = torch.Parametersort(noise, dim=-1) # ascend: small is keep, large is remove + ids_restore = torch.Parametersort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] mask = torch.gather(mask, dim=-1, index=ids_restore) mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] @@ -317,7 +317,7 @@ class Patchify(nn.Module): """ A class to patchify the time series sequence into different patches - Args: + Parameters: sequence_length (int, required): input sequence length. patch_length (int, required): patch length. stride (int, required): stride between patches. @@ -350,7 +350,7 @@ def __init__( def forward(self, past_values: torch.Tensor): """ - Args: + Parameters: past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] Returns: @@ -371,7 +371,7 @@ def forward(self, past_values: torch.Tensor): class PatchEmbeddings(nn.Module): """ - Args: + Parameters: A class to patchify the time series sequence into different patches sequence_length (int, required): input sequence length. patch_length (int, required): patch length. stride (int, required): stride between patches. @@ -409,7 +409,7 @@ def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_d def forward(self, past_values: torch.Tensor): """ - Args: + Parameters: past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] Returns: embeddings: output tensor data [bs x num_input_channels x num_patches x emb_dim] @@ -436,7 +436,7 @@ class PatchMasking(nn.Module): """ PatchMasking: Class to random or forcast masking. - Args: + Parameters: mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. mask_ratio (float, optional): Mask ratio. mask_patches (list, optional): List of patch lengths to mask in the end of the data. 
@@ -706,7 +706,7 @@ def forward( self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ - Args: + Parameters: past_values: tensor [bs x nvars x num_patches x patch_length]. output_hidden_states (bool, optional): Indicates if hidden states should be output. @@ -767,7 +767,7 @@ def forward( """ PATCHTST_INPUTS_DOCSTRING = r""" - Args: + Parameters: past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`): Past values of the time series, that serve as context in order to predict the future. The sequence size of this tensor must be larger than the `context_length` of the model, since the model will use the larger size @@ -807,7 +807,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): """ Base class for model's outputs, with potential hidden states. - Args: + Parameters: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): @@ -832,6 +832,183 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): scale: torch.FloatTensor = None +@dataclass +class PatchTSTForMaskPretrainingOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_output: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PatchTSTForPredictionOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_output: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PatchTSTOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_output: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PatchTSTForClassificationOutput(ModelOutput): + """ + Output type of [`PatchTSTForClassification`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class SamplePatchTSTPredictionOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Parameters: + sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): + Sampled values from the chosen distribution. + """ + sequences: torch.FloatTensor = None + + +@dataclass +class PatchTSTForForecastingOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Forecasting outputs of the time series modeling heads. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + forecast_outputs: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + loc: torch.FloatTensor = None + scale: torch.FloatTensor = None + + +@dataclass +class SamplePatchTSTForecastOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Parameters: + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or + `(batch_size, num_samples, prediction_length, number_channels)`): + Sampled values from the chosen distribution. + """ + sequences: torch.FloatTensor = None + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor: """ @@ -846,7 +1023,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero, meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. - Args: + Parameters: input_tensor (`torch.FloatTensor`): Input tensor, of which the average must be computed. weights (`torch.FloatTensor`, *optional*): @@ -871,7 +1048,7 @@ class PatchTSTStdScaler(nn.Module): Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - Args: + Parameters: dim (`int`): Dimension along which to calculate the mean and standard deviation. 
keepdim (`bool`, *optional*, defaults to `False`): @@ -905,7 +1082,7 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data accordingly. - Args: + Parameters: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -962,7 +1139,7 @@ class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - Args: + Parameters: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -1024,7 +1201,7 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ): + ) -> PatchTSTModelOutputWithNoAttention: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1071,35 +1248,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -@dataclass -class PatchTSTOutput(ModelOutput): - """ - Output type of [`PatchTSTForPredictiontion`]. - - Args: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - - loss: Optional[torch.FloatTensor] = None - prediction_output: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - class PatchTSTForMaskPretraining(PatchTSTPreTrainedModel): # PatchTSTModel + Pretraining Head def __init__(self, config: PatchTSTConfig): @@ -1118,7 +1266,7 @@ def forward( past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTOutput: + ) -> PatchTSTForMaskPretrainingOutput: """ past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels """ @@ -1138,10 +1286,10 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput(loss=masked_loss, - prediction_output=x_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTForMaskPretrainingOutput(loss=masked_loss, + prediction_output=x_hat, + hidden_states=model_output.hidden_states + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1156,7 +1304,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] = None): + def forward(self, + past_values: torch.Tensor, + labels: torch.Tensor = None, + output_hidden_states: Optional[bool] = None + ) -> PatchTSTForClassificationOutput: + output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1183,7 +1336,7 @@ def __init__(self, config: PatchTSTConfig): self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_classes) - def forward(self, x): + def forward(self, x: torch.Tensor): """ x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: [bs x n_classes] @@ -1202,36 +1355,6 @@ def forward(self, x): return y -@dataclass -class PatchTSTForClassificationOutput(ModelOutput): - """ - Output type of [`PatchTSTForClassification`]. - - Args: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1251,7 +1374,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - def forward(self, x): + def forward(self, x: torch.Tensor): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token @@ -1308,12 +1431,18 @@ def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, + past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ): + ) -> PatchTSTForPredictionOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model(past_values, output_hidden_states=output_hidden_states) + # get model output + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states) + + # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape y_hat = self.head(model_output.last_hidden_state) loss_val = None @@ -1325,57 +1454,54 @@ def forward( loss_val = weighted_average(loss_val) else: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - - -@dataclass -class PatchTSTForForecastingOutput(ModelOutput): - """ - Output type of [`PatchTSTForPredictiontion`]. - - Args: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - - forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Forecasting outputs of the time series modeling heads. - - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. + + return PatchTSTForPredictionOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ + def generate(self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTPredictionOutput: + """ + Generate sequences of sample predictions from a model with a probability distribution head. 
- loss: Optional[torch.FloatTensor] = None - forecast_outputs: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - loc: torch.FloatTensor = None - scale: torch.FloatTensor = None + Parameters: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Past values of the time series that serves as context in order to predict the future. + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: -@dataclass -class SamplePatchTSTForecastOutput(ModelOutput): - """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + Return: + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of + samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for + multivariate predictions. + """ + # get number of samples + num_parallel_samples = self.config.num_parallel_samples - Args: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or - `(batch_size, num_samples, prediction_length, number_channels)`): - Sampled values from the chosen distribution. - """ - sequences: torch.FloatTensor = None + # get model output + outputs = self(past_values=past_values, + future_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=None + ) + + # get distribution + distribution = self.distribution_output.distribution( + outputs.prediction_output + ) + # get samples + samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x pred_len x num_output_channels] + # stack tensors + samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] + return SamplePatchTSTPredictionOutput(sequences=samples) class ForecastHead(nn.Module): @@ -1487,15 +1613,16 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ): + ) -> PatchTSTForForecastingOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + # get model output model_output = self.model(past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states ) - + # get output head y_hat = self.head(model_output.last_hidden_state) loss_val = None @@ -1524,13 +1651,25 @@ def forward( def generate(self, past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None - ): + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTForecastOutput: """ + Generate sequences of sample predictions from a model with a probability distribution head. + + Parameters: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Past values of the time series that serves as context in order to predict the future. 
+ + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + Return: [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, prediction_length)` or `(batch_size, number of samples, prediction_length, number_channels)` for + samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. """ # get number of samples @@ -1540,7 +1679,7 @@ def generate(self, outputs = self(past_values=past_values, future_values=None, past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states + output_hidden_states=None ) # get distribution @@ -1553,7 +1692,7 @@ def generate(self, samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x forecast_len x nvars] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] - return samples + return SamplePatchTSTForecastOutput(sequences=samples) class RegressionHead(nn.Module): @@ -1621,3 +1760,4 @@ def forward( prediction_output=y_hat, hidden_states=model_output.hidden_states ) + From 8daf165c9fe6903a4e7365613f132a131a85da0d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 10:09:12 +0200 Subject: [PATCH 065/189] formatting --- src/transformers/models/auto/modeling_auto.py | 4 +- .../models/patchtst/configuration_patchtst.py | 2 +- .../models/patchtst/modeling_patchtst.py | 241 +++++++++--------- 3 files changed, 124 insertions(+), 123 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b82b189f5bb91a..22feb8a125ee83 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1126,8 +1126,8 @@ [ ("patchtst", "PatchTSTForRegression"), ] -) - +) + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = OrderedDict( [ ("swin2sr", "Swin2SRForImageSuperResolution"), diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e55737adcf4dda..71aef0b436bbd3 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -173,7 +173,7 @@ def __init__( pre_norm: bool = False, positional_encoding: str = "sincos", learn_pe: bool = False, - use_cls_token: bool = False, + use_cls_token: bool = False, init_std: float = 0.02, shared_projection: bool = True, seed_number: int = None, diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 46bd8fc8936635..5e13cc9fda7740 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -959,6 +959,7 @@ class SamplePatchTSTPredictionOutput(ModelOutput): sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): Sampled values from the chosen distribution. """ + sequences: torch.FloatTensor = None @@ -1002,10 +1003,11 @@ class SamplePatchTSTForecastOutput(ModelOutput): distribution. 
Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, number_channels)`): Sampled values from the chosen distribution. """ + sequences: torch.FloatTensor = None @@ -1023,7 +1025,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero, meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. - Parameters: + Args: input_tensor (`torch.FloatTensor`): Input tensor, of which the average must be computed. weights (`torch.FloatTensor`, *optional*): @@ -1040,7 +1042,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights else: return input_tensor.mean(dim=dim) - + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): @@ -1048,7 +1050,7 @@ class PatchTSTStdScaler(nn.Module): Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - Parameters: + Args: dim (`int`): Dimension along which to calculate the mean and standard deviation. keepdim (`bool`, *optional*, defaults to `False`): @@ -1082,7 +1084,7 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data accordingly. - Parameters: + Args: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -1139,7 +1141,7 @@ class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - Parameters: + Args: dim (`int`): Dimension along which to compute the scale. 
keepdim (`bool`, *optional*, defaults to `False`): @@ -1286,10 +1288,9 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTForMaskPretrainingOutput(loss=masked_loss, - prediction_output=x_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTForMaskPretrainingOutput( + loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1304,12 +1305,9 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - labels: torch.Tensor = None, - output_hidden_states: Optional[bool] = None - ) -> PatchTSTForClassificationOutput: - + def forward( + self, past_values: torch.Tensor, labels: torch.Tensor = None, output_hidden_states: Optional[bool] = None + ) -> PatchTSTForClassificationOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1321,10 +1319,8 @@ def forward(self, if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, - prediction_logits=y_hat, - hidden_states=model_output.hidden_states - ) + loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states + ) class ClassificationHead(nn.Module): @@ -1359,7 +1355,7 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.num_output_channels = config.num_output_channels + self.num_output_channels = config.num_output_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling @@ -1368,8 +1364,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flatten = nn.Flatten(start_dim=1) if distribution_output is None: - self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - else: + self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + else: self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() @@ -1390,15 +1386,17 @@ def forward(self, x: torch.Tensor): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # projection + # flatten the input + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection y = self.projection(x) # reshape y - if isinstance(y, tuple): # for distribution head - y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) # tuple of [bs x pred_len x num_output_channels] - else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + if isinstance(y, tuple): # for distribution head + y = ( + z.reshape(batch_size, -1, self.num_output_channels) for z in y + ) # tuple of [bs x pred_len x num_output_channels] + else: # for linear head + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y @@ -1418,7 +1416,9 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * 
config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = NegativeBinomialOutput( + dim=config.prediction_length * config.num_output_channels + ) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1438,37 +1438,37 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) # get model output - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states) - - # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape - y_hat = self.head(model_output.last_hidden_state) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) + + # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape + y_hat = self.head(model_output.last_hidden_state) loss_val = None - if future_values is not None: - if self.distribution_output: + if future_values is not None: + if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) - else: + else: loss_val = self.loss(y_hat, future_values) - - return PatchTSTForPredictionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - - def generate(self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTPredictionOutput: + + return PatchTSTForPredictionOutput( + loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states + ) + + def generate( + self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTPredictionOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. Parameters: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): Past values of the time series that serves as context in order to predict the future. past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): @@ -1477,30 +1477,31 @@ def generate(self, - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - + Return: [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. 
""" - # get number of samples + # get number of samples num_parallel_samples = self.config.num_parallel_samples - # get model output - outputs = self(past_values=past_values, - future_values=None, - past_observed_mask=past_observed_mask, - output_hidden_states=None - ) - + # get model output + outputs = self( + past_values=past_values, + future_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=None, + ) + # get distribution - distribution = self.distribution_output.distribution( - outputs.prediction_output - ) + distribution = self.distribution_output.distribution(outputs.prediction_output) # get samples - samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x pred_len x num_output_channels] + samples = [ + distribution.sample() for i in range(num_parallel_samples) + ] # samples: list of [bs x pred_len x num_output_channels] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] + samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] return SamplePatchTSTPredictionOutput(sequences=samples) @@ -1523,14 +1524,10 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flattens.append(nn.Flatten(start_dim=2)) if distribution_output is None: # use linear head - self.projections.append( - nn.Linear(head_dim, config.prediction_length) - ) + self.projections.append(nn.Linear(head_dim, config.prediction_length)) else: # use distribution head - self.projections.append( - distribution_output.get_parameter_projection(head_dim) - ) + self.projections.append(distribution_output.get_parameter_projection(head_dim)) self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: # all the channels share the same head @@ -1565,16 +1562,22 @@ def forward(self, x: torch.Tensor): for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.dropouts[i](z) - z = self.projections[i](z) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + z = self.projections[i]( + z + ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) output = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) - output = self.projection(z) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head - - if isinstance(output, tuple): - output = tuple(z.transpose(2,1) for z in output) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + output = self.projection( + z + ) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head + + if isinstance(output, tuple): + output = tuple( + z.transpose(2, 1) for z in output + ) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) else: output = output.transpose(2, 1) # [bs x forecast_len x nvars] @@ -1601,7 +1604,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = ForecastHead(config, self.distribution_output) + self.head = ForecastHead(config, self.distribution_output) # Initialize weights and apply final processing 
self.post_init() @@ -1618,46 +1621,47 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) # get model output - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states - ) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) # get output head - y_hat = self.head(model_output.last_hidden_state) - + y_hat = self.head(model_output.last_hidden_state) + loss_val = None - + if future_values is not None: if self.distribution_output: - distribution = self.distribution_output.distribution(y_hat, - loc=model_output.loc, - scale=model_output.scale) + distribution = self.distribution_output.distribution( + y_hat, loc=model_output.loc, scale=model_output.scale + ) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) - # for testing + # for testing # loss_val = nn.MSELoss(reduction='none')(distribution.mean, future_values) # loss_val = weighted_average(loss_val) else: y_hat = y_hat * model_output.scale + model_output.loc loss_val = self.loss(y_hat, future_values) - return PatchTSTForForecastingOutput(loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states, - loc=model_output.loc, - scale=model_output.scale - ) - - def generate(self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTForecastOutput: + return PatchTSTForForecastingOutput( + loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states, + loc=model_output.loc, + scale=model_output.scale, + ) + + def generate( + self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTForecastOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. Parameters: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): Past values of the time series that serves as context in order to predict the future. past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): @@ -1666,32 +1670,33 @@ def generate(self, - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - + Return: [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. 
""" - # get number of samples + # get number of samples num_parallel_samples = self.config.num_parallel_samples - # get model output - outputs = self(past_values=past_values, - future_values=None, - past_observed_mask=past_observed_mask, - output_hidden_states=None - ) - + # get model output + outputs = self( + past_values=past_values, + future_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=None, + ) + # get distribution distribution = self.distribution_output.distribution( - outputs.forecast_outputs, - loc=outputs.loc, - scale=outputs.scale - ) + outputs.forecast_outputs, loc=outputs.loc, scale=outputs.scale + ) # get samples - samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x forecast_len x nvars] + samples = [ + distribution.sample() for i in range(num_parallel_samples) + ] # samples: list of [bs x forecast_len x nvars] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] + samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] return SamplePatchTSTForecastOutput(sequences=samples) @@ -1756,8 +1761,4 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) From e2f8fd8d563e3a13a6f9da44b109eca19ebfbac0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 10:17:56 +0200 Subject: [PATCH 066/189] use argsort --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5e13cc9fda7740..a98a101abb411a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -297,8 +297,8 @@ def random_masking( mask[:, :, :len_keep] = 0 # sort noise for each sample - ids_shuffle = torch.Parametersort(noise, dim=-1) # ascend: small is keep, large is remove - ids_restore = torch.Parametersort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] mask = torch.gather(mask, dim=-1, index=ids_restore) mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] From 1c8ec9dfe3c54f97af23ddbb2ea1401e92e6581d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 10:28:52 +0200 Subject: [PATCH 067/189] add past_observed_mask ordering --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- tests/models/patchtst/test_modeling_patchtst.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a98a101abb411a..180eec91af9f0a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1430,8 +1430,8 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, past_observed_mask: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, 
output_hidden_states: Optional[bool] = None, ) -> PatchTSTForPredictionOutput: output_hidden_states = ( @@ -1612,8 +1612,8 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, past_observed_mask: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ) -> PatchTSTForForecastingOutput: diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 4f3cb2f1f465bc..a7c6c6e2186b19 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -277,6 +277,7 @@ def test_forward_signature(self): expected_arg_names = [ "past_values", + "past_observed_mask", "future_values", ] if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( From 7ebaa61ffc3b39ba8ecaeb130e528d1f09af1dd9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:09:15 +0200 Subject: [PATCH 068/189] fix arguments --- .../models/patchtst/modeling_patchtst.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 180eec91af9f0a..8f08bcf4edee5e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1266,6 +1266,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ) -> PatchTSTForMaskPretrainingOutput: @@ -1306,7 +1307,11 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, labels: torch.Tensor = None, output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + past_observed_mask: Optional[bool] = None, + labels: torch.Tensor = None, + output_hidden_states: Optional[bool] = None, ) -> PatchTSTForClassificationOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1614,7 +1619,6 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ) -> PatchTSTForForecastingOutput: output_hidden_states = ( @@ -1750,7 +1754,11 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, labels: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + past_observed_mask: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states From f3dca2530179991584ba77d4ec33ff78de3bcb84 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:09:58 +0200 Subject: [PATCH 069/189] docs --- docs/source/en/model_doc/patchtst.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 
209e50a6b12480..504be80c3e6c9a 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -19,7 +19,6 @@ rendered properly in your Markdown viewer. ## Overview The PatchTST model was proposed in [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. - The abstract from the paper is the following: @@ -27,9 +26,10 @@ The abstract from the paper is the following: Tips: - +The model also adds a time series classification pipeline and time series regression pipeline. This model was contributed by [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), and [kashif](https://huggingface.co/kashif). + The original code can be found [here](https://github.com/yuqinie98/PatchTST). From 5349bf439ee1c889f909b43e453290bd429832ab Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:42:09 +0200 Subject: [PATCH 070/189] add back test_model_outputs_equivalence test --- .../models/patchtst/modeling_patchtst.py | 71 +++++++++++++++---- .../models/patchtst/test_modeling_patchtst.py | 10 +-- 2 files changed, 59 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 8f08bcf4edee5e..ffac3bcf4db227 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -17,7 +17,7 @@ import math import random from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numpy as np import torch @@ -1203,10 +1203,12 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTModelOutputWithNoAttention: + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if past_observed_mask is None: past_observed_mask = torch.ones_like(past_values) @@ -1221,9 +1223,15 @@ def forward( else: masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) + + hidden_states = encoder_output.last_hidden_state + encoder_states = encoder_output.hidden_states + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, patched_values, mask, loc, scale] if v is not None) return PatchTSTModelOutputWithNoAttention( - last_hidden_state=encoder_output.last_hidden_state, - hidden_states=encoder_output.hidden_states, + last_hidden_state=hidden_states, + hidden_states=encoder_states, patched_input=patched_values, mask=mask, loc=loc, @@ -1269,13 +1277,15 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTForMaskPretrainingOutput: + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTForMaskPretrainingOutput]: """ past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels 
""" output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # past_values: [bs x nvars x num_patches x d_model] or # [bs x nvars x (num_patches+1) x d_model] if use cls_token @@ -1289,8 +1299,11 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) + encoder_states = model_output.hidden_states + if not return_dict: + return tuple(v for v in [masked_loss, x_hat, encoder_states] if v is not None) return PatchTSTForMaskPretrainingOutput( - loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states + loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states ) @@ -1312,10 +1325,12 @@ def forward( past_observed_mask: Optional[bool] = None, labels: torch.Tensor = None, output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ) -> PatchTSTForClassificationOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output[0]) @@ -1323,8 +1338,12 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) + + encoder_states = model_output.hidden_states + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) return PatchTSTForClassificationOutput( - loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states + loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states ) @@ -1438,10 +1457,13 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTForPredictionOutput: + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTForPredictionOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # get model output model_output = self.model( past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states @@ -1460,8 +1482,11 @@ def forward( else: loss_val = self.loss(y_hat, future_values) + encoder_states = model_output.hidden_states + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) return PatchTSTForPredictionOutput( - loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states + loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states ) def generate( @@ -1620,10 +1645,13 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTForForecastingOutput: + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTForForecastingOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # get model output model_output = self.model( past_values, 
past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states @@ -1648,12 +1676,18 @@ def forward( y_hat = y_hat * model_output.scale + model_output.loc loss_val = self.loss(y_hat, future_values) + encoder_states = model_output.hidden_states + loc = model_output.loc + scale = model_output.scale + + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states, loc, scale] if v is not None) return PatchTSTForForecastingOutput( loss=loss_val, forecast_outputs=y_hat, - hidden_states=model_output.hidden_states, - loc=model_output.loc, - scale=model_output.scale, + hidden_states=encoder_states, + loc=loc, + scale=scale, ) def generate( @@ -1759,14 +1793,21 @@ def forward( past_observed_mask: Optional[bool] = None, labels: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ): + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + + encoder_states = model_output.hidden_states + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index a7c6c6e2186b19..20ef536a9cb796 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -248,17 +248,13 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # - # # Ignore since we have no tokens embeddings + # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass - def test_model_outputs_equivalence(self): - pass - - def test_determinism(self): - pass + # def test_model_outputs_equivalence(self): + # pass def test_model_main_input_name(self): model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) From eb7f547f3c8a6763c73548e760a91e80c723d771 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:43:24 +0200 Subject: [PATCH 071/189] formatting --- src/transformers/models/patchtst/modeling_patchtst.py | 10 +++------- tests/models/patchtst/test_modeling_patchtst.py | 1 - 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ffac3bcf4db227..6f50c32d272070 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1342,9 +1342,7 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForClassificationOutput( - loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states - ) + return PatchTSTForClassificationOutput(loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states) class 
ClassificationHead(nn.Module): @@ -1485,9 +1483,7 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForPredictionOutput( - loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states - ) + return PatchTSTForPredictionOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) def generate( self, @@ -1809,5 +1805,5 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 20ef536a9cb796..d9756b3c55b76a 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -248,7 +248,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass From a1cf42cbab4b95e7b5ce7690e76d6aa94ecececb Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:50:48 +0200 Subject: [PATCH 072/189] cleanup --- tests/models/patchtst/test_modeling_patchtst.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index d9756b3c55b76a..7f453f707948ef 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -248,13 +248,10 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # Ignore since we have no tokens embeddings + @unittest.skip(reason="we have no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # def test_model_outputs_equivalence(self): - # pass - def test_model_main_input_name(self): model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) # The main input is the name of the argument after `self` From 6392f999610bbe8cc8aa9e8a9cac606e98463d0f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 13:02:22 +0200 Subject: [PATCH 073/189] formatting --- .../models/patchtst/configuration_patchtst.py | 6 ++++-- .../models/patchtst/modeling_patchtst.py | 12 ++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 71aef0b436bbd3..2ba1c808358cf7 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -45,10 +45,12 @@ class PatchTSTConfig(PretrainedConfig): context_length (`int`, defaults to 32): The context length for the encoder. distribution_output (`string`, *optional*, defaults to `"student_t"`): - The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or "negative_binomial". + The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or + "negative_binomial". 
loss (`string`, *optional*, defaults to `"mse"`): The loss function for the model corresponding to the `distribution_output` head. For parametric - distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". + distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared + error "mse". patch_length (`int`, *optional*, defaults to 1): Define the patch length of the patchification process. Default to 1. stride (`int`, *optional*, defaults to 1): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6f50c32d272070..ef0eefb9e1b300 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1505,9 +1505,9 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for - multivariate predictions. + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, + number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, + num_input_channels)` for multivariate predictions. """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -1706,9 +1706,9 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: - [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for - multivariate predictions. + [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number + of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, + num_input_channels)` for multivariate predictions. """ # get number of samples num_parallel_samples = self.config.num_parallel_samples From 8a91544f1ae6de73a368c6b7eeb9e37f75fd2797 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 16:01:48 +0200 Subject: [PATCH 074/189] use ACT2CLS --- src/transformers/models/patchtst/modeling_patchtst.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ef0eefb9e1b300..4a5608cd02fd01 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -23,6 +23,7 @@ import torch from torch import nn +from ...activations import ACT2CLS from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput @@ -195,14 +196,6 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value -def get_activation_fn(activation): - if callable(activation): - return activation() - elif activation.lower() == "relu": - return nn.ReLU() - elif activation.lower() == "gelu": - return nn.GELU() - raise ValueError(f'{activation} is not available. 
You can use "relu", "gelu", or a callable') class Transpose(nn.Module): @@ -562,7 +555,7 @@ def __init__(self, config: PatchTSTConfig): # Position-wise Feed-Forward self.ff = nn.Sequential( nn.Linear(config.d_model, config.encoder_ffn_dim, bias=config.bias), - get_activation_fn(config.activation_function), + ACT2CLS[config.activation_function](), nn.Dropout(config.ff_dropout) if config.ff_dropout > 0 else nn.Identity(), nn.Linear(config.encoder_ffn_dim, config.d_model, bias=config.bias), ) From dfbea052770425c6cdd59ec0e85b10ea424e9eac Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 16:15:07 +0200 Subject: [PATCH 075/189] formatting --- src/transformers/models/patchtst/modeling_patchtst.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4a5608cd02fd01..9b438601506a2d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -196,8 +196,6 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value - - class Transpose(nn.Module): def __init__(self, *dims, contiguous=False): super().__init__() From 1a0c55ee945855a4fb493f08bd7bfd72b6c3f311 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 16:33:07 +0200 Subject: [PATCH 076/189] fix add_start_docstrings decorator --- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9b438601506a2d..b49feb2ed23326 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -790,10 +790,6 @@ def forward( @dataclass -@add_start_docstrings( - "The bare PatchTST Model outputting raw hidden-states without any specific head.", - PATCHTST_START_DOCSTRING, -) class PatchTSTModelOutputWithNoAttention(ModelOutput): """ Base class for model's outputs, with potential hidden states. @@ -1152,6 +1148,10 @@ def forward( return data, loc, scale +@add_start_docstrings( + "The bare PatchTST Model outputting raw hidden-states without any specific head.", + PATCHTST_START_DOCSTRING, +) class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) From 0a4e58bedeb296c5df05a9a7b913b9e541fecb58 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 5 Oct 2023 13:55:08 -0400 Subject: [PATCH 077/189] add distribution head and generate function to regression task add distribution head and generate function to regression task. Also made add PatchTSTForForecastingOutput, PatchTSTForRegressionOutput. --- .../models/patchtst/modeling_patchtst.py | 228 ++++++++++++++---- 1 file changed, 177 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b49feb2ed23326..a3a5cf2b57993c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -822,7 +822,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): @dataclass class PatchTSTForMaskPretrainingOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForMaskPretraining`]. 
Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): @@ -878,9 +878,9 @@ class PatchTSTForPredictionOutput(ModelOutput): @dataclass -class PatchTSTOutput(ModelOutput): +class PatchTSTForRegressionOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForRegression`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): @@ -907,16 +907,17 @@ class PatchTSTOutput(ModelOutput): @dataclass -class PatchTSTForClassificationOutput(ModelOutput): +class PatchTSTForForecastingOutput(ModelOutput): """ - Output type of [`PatchTSTForClassification`]. + Output type of [`PatchTSTForForecasting`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + MSE loss. + + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Forecasting outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -931,37 +932,25 @@ class PatchTSTForClassificationOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None + forecast_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None + loc: torch.FloatTensor = None + scale: torch.FloatTensor = None -@dataclass -class SamplePatchTSTPredictionOutput(ModelOutput): - """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. - - Parameters: - sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): - Sampled values from the chosen distribution. - """ - - sequences: torch.FloatTensor = None - @dataclass -class PatchTSTForForecastingOutput(ModelOutput): +class PatchTSTForClassificationOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForClassification`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - - forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Forecasting outputs of the time series modeling heads. - + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
@@ -976,11 +965,23 @@ class PatchTSTForForecastingOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - forecast_outputs: torch.FloatTensor = None + prediction_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - loc: torch.FloatTensor = None - scale: torch.FloatTensor = None + + +@dataclass +class SamplePatchTSTPredictionOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Parameters: + sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): + Sampled values from the chosen distribution. + """ + + sequences: torch.FloatTensor = None @dataclass @@ -990,14 +991,31 @@ class SamplePatchTSTForecastOutput(ModelOutput): distribution. Parameters: +<<<<<<< Updated upstream sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, number_channels)`): +======= + sequences `(batch_size, num_samples, prediction_length, number_channels)`): +>>>>>>> Stashed changes Sampled values from the chosen distribution. """ sequences: torch.FloatTensor = None +@dataclass +class SamplePatchTSTRegressionOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Parameters: + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_output_channels)` + Sampled values from the chosen distribution. + """ + sequences: torch.FloatTensor = None + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor: """ @@ -1375,13 +1393,13 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() if distribution_output is None: self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) else: self.projection = distribution_output.get_parameter_projection(head_dim) - - self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + def forward(self, x: torch.Tensor): """ @@ -1454,12 +1472,21 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output +<<<<<<< Updated upstream model_output = self.model( past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states ) # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape y_hat = self.head(model_output.last_hidden_state) +======= + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states) + + # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape + y_hat = self.head(model_output.last_hidden_state) +>>>>>>> Stashed changes loss_val = None if future_values is not None: @@ -1496,9 +1523,14 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
Return: +<<<<<<< Updated upstream [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. +======= + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of + samples, prediction_length, num_output_channels)` +>>>>>>> Stashed changes """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -1726,38 +1758,43 @@ def generate( class RegressionHead(nn.Module): - def __init__(self, config: PatchTSTConfig): + def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.y_range = config.prediction_range self.use_cls_token = config.use_cls_token self.pooling = config.pooling - # self.is_flatten = is_flatten + self.distribution_output = distribution_output + + head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - head_dim = config.num_input_channels * config.d_model - # if is_flatten: head_dim *= num_patch - self.linear = nn.Linear(head_dim, config.num_output_channels) + + if distribution_output is None: + self.projection = nn.Linear(head_dim, config.num_output_channels) + else: + self.projection = distribution_output.get_parameter_projection(head_dim) - def forward(self, past_values): + def forward(self, x): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token output: [bs x output_dim] - """ + """ if self.use_cls_token: - past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] elif self.pooling == "mean": - past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] + x = x.mean(dim=2) # x: [bs x nvars x d_model] elif self.pooling == "max": - past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] + x = x.max(dim=2) # x: [bs x nvars x d_model] else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input - past_values = self.flatten(past_values) # x: bs x nvars * d_model - y = self.linear(self.dropout(past_values)) # y: bs x output_dim - - if self.y_range: + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection + y = self.projection(x) # y: bs x output_dim or a tuple of this shape for distribution head + # + if (self.distribution_output is None) & self.y_range: # linear head y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] return y @@ -1768,13 +1805,29 @@ class PatchTSTForRegression(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) - self.head = RegressionHead(config) - self.loss = nn.MSELoss(reduction="mean") + + self.model = PatchTSTModel(config) + if config.loss == "mse": + self.loss = nn.MSELoss(reduction="mean") + self.distribution_output = None + else: + self.loss = nll + if config.distribution_output == "student_t": + self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) + elif config.distribution_output == "normal": + self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) + elif config.distribution_output == 
"negative_binomial": + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) + else: + raise ValueError(f"Unknown distribution output {config.distribution_output}") + + self.head = RegressionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() def forward( +<<<<<<< Updated upstream self, past_values: torch.Tensor, past_observed_mask: Optional[bool] = None, @@ -1798,3 +1851,76 @@ def forward( if not return_dict: return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) +======= + self, past_values: torch.Tensor, + labels: Optional[torch.Tensor], + past_observed_mask: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None + ) -> PatchTSTForRegressionOutput: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states) + # get output head. y_hat is of shape [bs x num_output_channels] or tuple of this shape + y_hat = self.head(model_output.last_hidden_state) + + loss_val = None + if labels is not None: + if self.distribution_output: + distribution = self.distribution_output.distribution(y_hat) + loss_val = self.loss(distribution, labels) + # take average of the loss + loss_val = weighted_average(loss_val) + else: + loss_val = self.loss(y_hat, labels) + + return PatchTSTForRegressionOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) + + + def generate(self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTPredictionOutput: + """ + Generate sequences of sample predictions from a model with a probability distribution head. + + Parameters: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Past values of the time series that serves as context in order to predict the future. + + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + Return: + [`SamplePatchTSTRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of + samples, num_output_channels)`. 
+ """ + # get number of samples + num_parallel_samples = self.config.num_parallel_samples + + # get model output + outputs = self(past_values=past_values, + labels=None, + past_observed_mask=past_observed_mask, + output_hidden_states=None + ) + + # get distribution + distribution = self.distribution_output.distribution( + outputs.prediction_output + ) + # get samples + samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x num_output_channels] + # stack tensors + samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] + return SamplePatchTSTRegressionOutput(sequences=samples) +>>>>>>> Stashed changes From 72a6e1e5b29205ab6094c33aedb83e5fcb38079f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 5 Oct 2023 14:12:30 -0400 Subject: [PATCH 078/189] add distribution head and generate function to regression task add distribution head and generate function to regression task. Also made add PatchTSTForForecastingOutput, PatchTSTForRegressionOutput. --- .../models/patchtst/modeling_patchtst.py | 68 +++++-------------- 1 file changed, 16 insertions(+), 52 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a3a5cf2b57993c..b9dc656ef8d827 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1335,13 +1335,15 @@ def forward( labels: torch.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> PatchTSTForClassificationOutput: + ) -> Union[tuple, PatchTSTForClassificationOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - model_output = self.model(past_values, output_hidden_states=output_hidden_states) + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states) y_hat = self.head(model_output[0]) loss_val = None @@ -1472,21 +1474,12 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output -<<<<<<< Updated upstream - model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states - ) - - # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape - y_hat = self.head(model_output.last_hidden_state) -======= model_output = self.model(past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states) # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) ->>>>>>> Stashed changes loss_val = None if future_values is not None: @@ -1523,14 +1516,8 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: -<<<<<<< Updated upstream - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, - number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, - num_input_channels)` for multivariate predictions. 
-======= [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, num_output_channels)` ->>>>>>> Stashed changes """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -1547,7 +1534,7 @@ def generate( distribution = self.distribution_output.distribution(outputs.prediction_output) # get samples samples = [ - distribution.sample() for i in range(num_parallel_samples) + distribution.sample() for _ in range(num_parallel_samples) ] # samples: list of [bs x pred_len x num_output_channels] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] @@ -1750,7 +1737,7 @@ def generate( ) # get samples samples = [ - distribution.sample() for i in range(num_parallel_samples) + distribution.sample() for _ in range(num_parallel_samples) ] # samples: list of [bs x forecast_len x nvars] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] @@ -1827,36 +1814,13 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( -<<<<<<< Updated upstream self, past_values: torch.Tensor, - past_observed_mask: Optional[bool] = None, - labels: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, PatchTSTOutput]: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) - - loss_val = None - if labels is not None: - loss_val = self.loss(y_hat, labels) - - encoder_states = model_output.hidden_states - if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) -======= - self, past_values: torch.Tensor, labels: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None - ) -> PatchTSTForRegressionOutput: + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, PatchTSTForRegressionOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1876,16 +1840,17 @@ def forward( else: loss_val = self.loss(y_hat, labels) + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states, loc, scale] if v is not None) return PatchTSTForRegressionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) def generate(self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTPredictionOutput: + ) -> SamplePatchTSTRegressionOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. 
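# --- Editor's illustrative sketch (not part of this patch) -------------------
# The hunk above wires PatchTSTForRegression.generate() to a probability
# distribution head. The minimal, self-contained snippet below only mirrors the
# sampling pattern it uses (draw num_parallel_samples, then stack on a new
# dim=1); the StudentT parameters and tensor sizes are made-up values for
# illustration and are not taken from the model or its config.
import torch
from torch.distributions import StudentT

num_parallel_samples, batch_size, num_output_channels = 100, 8, 2
distribution = StudentT(
    df=torch.full((batch_size, num_output_channels), 3.0),
    loc=torch.zeros(batch_size, num_output_channels),
    scale=torch.ones(batch_size, num_output_channels),
)
samples = [distribution.sample() for _ in range(num_parallel_samples)]  # each: [bs x num_output_channels]
samples = torch.stack(samples, dim=1)  # [bs x num_samples x num_output_channels]
# samples.shape -> torch.Size([8, 100, 2]), the layout returned as
# SamplePatchTSTRegressionOutput.sequences in the diff above.
# -----------------------------------------------------------------------------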
@@ -1919,8 +1884,7 @@ def generate(self, outputs.prediction_output ) # get samples - samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x num_output_channels] + samples = [distribution.sample() for _ in range(num_parallel_samples)] # samples: list of [bs x num_output_channels] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] return SamplePatchTSTRegressionOutput(sequences=samples) ->>>>>>> Stashed changes From 9908c6a10b4ca4a469bf5234fe89c39dd41125a6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 20:32:31 +0200 Subject: [PATCH 079/189] fix typos --- .../models/patchtst/modeling_patchtst.py | 141 +++++++++--------- 1 file changed, 70 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b9dc656ef8d827..3f5102c0183160 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -939,7 +939,6 @@ class PatchTSTForForecastingOutput(ModelOutput): scale: torch.FloatTensor = None - @dataclass class PatchTSTForClassificationOutput(ModelOutput): """ @@ -987,17 +986,17 @@ class SamplePatchTSTPredictionOutput(ModelOutput): @dataclass class SamplePatchTSTForecastOutput(ModelOutput): """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. - Parameters: -<<<<<<< Updated upstream - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or - `(batch_size, num_samples, prediction_length, number_channels)`): -======= - sequences `(batch_size, num_samples, prediction_length, number_channels)`): ->>>>>>> Stashed changes - Sampled values from the chosen distribution. + Parameters: + <<<<<<< Updated upstream + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, + num_samples, prediction_length, number_channels)`): + ======= + sequences `(batch_size, num_samples, prediction_length, number_channels)`): + >>>>>>> Stashed changes + Sampled values from the chosen distribution. """ sequences: torch.FloatTensor = None @@ -1010,10 +1009,11 @@ class SamplePatchTSTRegressionOutput(ModelOutput): distribution. Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_output_channels)` + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_output_channels)` Sampled values from the chosen distribution. 
""" - sequences: torch.FloatTensor = None + + sequences: torch.FloatTensor = None # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll @@ -1341,9 +1341,9 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) y_hat = self.head(model_output[0]) loss_val = None @@ -1401,7 +1401,6 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) else: self.projection = distribution_output.get_parameter_projection(head_dim) - def forward(self, x: torch.Tensor): """ @@ -1474,12 +1473,12 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states) - - # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape - y_hat = self.head(model_output.last_hidden_state) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) + + # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape + y_hat = self.head(model_output.last_hidden_state) loss_val = None if future_values is not None: @@ -1516,8 +1515,8 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, prediction_length, num_output_channels)` + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, + number of samples, prediction_length, num_output_channels)` """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -1751,23 +1750,23 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.use_cls_token = config.use_cls_token self.pooling = config.pooling self.distribution_output = distribution_output - - head_dim = config.num_input_channels * config.d_model + + head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - + if distribution_output is None: self.projection = nn.Linear(head_dim, config.num_output_channels) - else: - self.projection = distribution_output.get_parameter_projection(head_dim) + else: + self.projection = distribution_output.get_parameter_projection(head_dim) def forward(self, x): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token output: [bs x output_dim] - """ + """ if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] elif self.pooling == "mean": @@ -1777,11 +1776,11 @@ def forward(self, x): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # projection + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection y = self.projection(x) # y: bs 
x output_dim or a tuple of this shape for distribution head - # - if (self.distribution_output is None) & self.y_range: # linear head + # + if (self.distribution_output is None) & self.y_range: # linear head y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] return y @@ -1804,7 +1803,9 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = NegativeBinomialOutput( + dim=config.prediction_length * config.num_output_channels + ) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1816,7 +1817,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - labels: Optional[torch.Tensor], + labels: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -1824,38 +1825,38 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states) - # get output head. y_hat is of shape [bs x num_output_channels] or tuple of this shape + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) + # get output head. y_hat is of shape [bs x num_output_channels] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) loss_val = None - if labels is not None: - if self.distribution_output: + if labels is not None: + if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, labels) # take average of the loss loss_val = weighted_average(loss_val) - else: + else: loss_val = self.loss(y_hat, labels) + encoder_states = model_output.hidden_states + if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states, loc, scale] if v is not None) - return PatchTSTForRegressionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - - def generate(self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTRegressionOutput: + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) + return PatchTSTForRegressionOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) + + def generate( + self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTRegressionOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. Parameters: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): Past values of the time series that serves as context in order to predict the future. 
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): @@ -1864,27 +1865,25 @@ def generate(self, - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - + Return: - [`SamplePatchTSTRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, num_output_channels)`. + [`SamplePatchTSTRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, + number of samples, num_output_channels)`. """ - # get number of samples + # get number of samples num_parallel_samples = self.config.num_parallel_samples - # get model output - outputs = self(past_values=past_values, - labels=None, - past_observed_mask=past_observed_mask, - output_hidden_states=None - ) - + # get model output + outputs = self( + past_values=past_values, labels=None, past_observed_mask=past_observed_mask, output_hidden_states=None + ) + # get distribution - distribution = self.distribution_output.distribution( - outputs.prediction_output - ) + distribution = self.distribution_output.distribution(outputs.prediction_output) # get samples - samples = [distribution.sample() for _ in range(num_parallel_samples)] # samples: list of [bs x num_output_channels] + samples = [ + distribution.sample() for _ in range(num_parallel_samples) + ] # samples: list of [bs x num_output_channels] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] + samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] return SamplePatchTSTRegressionOutput(sequences=samples) From 91a4c46eb03bdc2af6d41ee395bffdfcbfac71e0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 15:31:21 +0200 Subject: [PATCH 080/189] add forecast_masking --- .../models/patchtst/modeling_patchtst.py | 76 ++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3f5102c0183160..cb9360499ed863 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -300,6 +300,73 @@ def random_masking( return xb_mask, mask[..., 0] +def forecast_masking( + xb: torch.Tensor, + patch_lengths: list, + mix_ratio: list = None, + unmasked_channel_indices: list = None, + mask_value: int = 0, +): + """forecast_masking Mask last K patches where K is from the patch_lengths list. + For every batch, distribute the patch lengths based on mix_ratio Ignore masks for column indices mentioned in + cv_channel_indices + + Args: + xb (Tensor): + Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] + patch_lengths (list): List of patch lengths to mask in the end of the data. + mix_ratio (list, optional): List of weights to use for each patch length. For Ex. + if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to + None. + unmasked_channel_indices (list, optional): + Control Variable channel indices. These channels will not be masked. Defaults to None. + mask_value (int, optional): Value to use for masking. Defaults to 0. 
+ + Returns: + Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x + tsg2 x c x n] + """ + if mix_ratio is None: + mix_ratio = [1 for t in patch_lengths] + + bs, nvars, L, D = xb.shape + mask = torch.zeros(bs, nvars, L, device=xb.device) + + t_list = [] + total_length = 0 + total_ratio = sum(mix_ratio) + + for i, j in zip(patch_lengths, mix_ratio): + if i <= 0 or i >= L: + raise Exception("masked_patch_len should be greater than 0 and less than total patches.") + temp_len = int(bs * j / total_ratio) + t_list.append([i, j, temp_len]) + total_length += temp_len + + t_list = sorted(t_list, key=lambda x: x[2]) + + if total_length < bs: + t_list[0][2] = t_list[0][2] + (bs - total_length) + elif total_length > bs: + t_list[-1][2] = t_list[-1][2] + (total_length - bs) + + b1 = 0 + for p, r, l in t_list: + b2 = b1 + l + mask[b1:b2, :, -p:] = 1 + b1 = b2 + + perm = torch.randperm(mask.shape[0]) + mask = mask[perm] + + mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + if unmasked_channel_indices is not None: + mask[:, unmasked_channel_indices, :, :] = 0 + + xb_mask = xb.masked_fill(mask.bool(), mask_value) + return xb_mask, mask[..., 0] + + def compute_num_patches(sequence_length, patch_length, stride): return (max(sequence_length, patch_length) - patch_length) // stride + 1 @@ -490,7 +557,14 @@ def forward(self, x: torch.Tensor): mask_value=self.mask_value, seed_number=self.seed_number, ) - + elif self.mask_type == "forecast": + x_mask, mask = forecast_masking( + xb=x, + patch_lengths=self.mask_patches, + mix_ratio=self.mask_patch_ratios, + unmasked_channel_indices=self.unmasked_channel_indices, + mask_value=self.mask_value, + ) else: raise Exception("Invalid mask type") From 17c60a7aa69c5bf82d6ca842bba3b3b49e96bebb Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 16:36:48 +0200 Subject: [PATCH 081/189] fixed tests --- src/transformers/models/patchtst/modeling_patchtst.py | 8 +++++--- tests/models/patchtst/test_modeling_patchtst.py | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index cb9360499ed863..49c73ef20b639e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -707,7 +707,7 @@ class PatchTSTPreTrainedModel(PreTrainedModel): config_class = PatchTSTConfig base_model_prefix = "model" main_input_name = "past_values" - supports_gradient_checkpointing = True + supports_gradient_checkpointing = False def _init_weights(self, module): """Initialize weights""" @@ -1405,8 +1405,8 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - past_observed_mask: Optional[bool] = None, labels: torch.Tensor = None, + past_observed_mask: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: @@ -1854,7 +1854,7 @@ def forward(self, x): # projection y = self.projection(x) # y: bs x output_dim or a tuple of this shape for distribution head # - if (self.distribution_output is None) & self.y_range: # linear head + if (self.distribution_output is None) & (self.y_range is not None): # linear head y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] return y @@ -1899,6 +1899,8 @@ def forward( output_hidden_states = ( output_hidden_states if 
output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + model_output = self.model( past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states ) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 7f453f707948ef..69c8dad8e44fea 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -276,7 +276,9 @@ def test_forward_signature(self): MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING ): expected_arg_names.remove("future_values") + expected_arg_names.remove("past_observed_mask") expected_arg_names.append("labels") + expected_arg_names.append("past_observed_mask") expected_arg_names.extend( [ "output_hidden_states", From a61ac773cb5fd7127edba9de5c52dd9190a2b0f1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 18:12:48 +0200 Subject: [PATCH 082/189] use set_seed --- docs/source/en/index.md | 1 + .../models/patchtst/configuration_patchtst.py | 2 +- .../models/patchtst/modeling_patchtst.py | 19 ++++++++----------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index e20389a2ab49cb..fe841a1b43607a 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -209,6 +209,7 @@ Flax), PyTorch, and/or TensorFlow. | [OpenLlama](model_doc/open-llama) | ✅ | ❌ | ❌ | | [OPT](model_doc/opt) | ✅ | ✅ | ✅ | | [OWL-ViT](model_doc/owlvit) | ✅ | ❌ | ❌ | +| [PatchTST](model_doc/patchtst) | ✅ | ❌ | ❌ | | [Pegasus](model_doc/pegasus) | ✅ | ✅ | ✅ | | [PEGASUS-X](model_doc/pegasus_x) | ✅ | ❌ | ❌ | | [Perceiver](model_doc/perceiver) | ✅ | ❌ | ❌ | diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 2ba1c808358cf7..e6f140165f624b 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -178,7 +178,7 @@ def __init__( use_cls_token: bool = False, init_std: float = 0.02, shared_projection: bool = True, - seed_number: int = None, + seed_number: Optional[int] = None, scaling: Optional[Union[str, bool]] = "mean", # mask pretraining mask_input: Optional[bool] = None, diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 49c73ef20b639e..60ad66da70c7eb 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -15,11 +15,9 @@ """ PyTorch PatchTST model.""" import math -import random from dataclasses import dataclass from typing import Optional, Tuple, Union -import numpy as np import torch from torch import nn @@ -27,6 +25,7 @@ from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput +from ...trainer_utils import set_seed from ...utils import ModelOutput, add_start_docstrings, logging from .configuration_patchtst import PatchTSTConfig @@ -239,14 +238,6 @@ def positional_encoding(pe, learn_pe, q_len, d_model): return nn.Parameter(w_pos, requires_grad=learn_pe) -def set_seed(x=42): - random.seed(x) - np.random.seed(x) - torch.manual_seed(x) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(x) - - def random_masking( xb: 
torch.Tensor, mask_ratio: float, @@ -306,6 +297,7 @@ def forecast_masking( mix_ratio: list = None, unmasked_channel_indices: list = None, mask_value: int = 0, + seed_number: Optional[int] = None, ): """forecast_masking Mask last K patches where K is from the patch_lengths list. For every batch, distribute the patch lengths based on mix_ratio Ignore masks for column indices mentioned in @@ -321,11 +313,15 @@ def forecast_masking( unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. mask_value (int, optional): Value to use for masking. Defaults to 0. + seed_number (int, optional): Value to set for the random seed. Returns: Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x tsg2 x c x n] """ + if seed_number: + set_seed(seed_number) + if mix_ratio is None: mix_ratio = [1 for t in patch_lengths] @@ -564,6 +560,7 @@ def forward(self, x: torch.Tensor): mix_ratio=self.mask_patch_ratios, unmasked_channel_indices=self.unmasked_channel_indices, mask_value=self.mask_value, + seed_number=self.seed_number, ) else: raise Exception("Invalid mask type") @@ -1900,7 +1897,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + model_output = self.model( past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states ) From de7fb9e3dfc14b83b6f8814df78fe7998f80b9a1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 18:38:02 +0200 Subject: [PATCH 083/189] fix doc test --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 60ad66da70c7eb..abeb092baa5b4d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1061,12 +1061,8 @@ class SamplePatchTSTForecastOutput(ModelOutput): distribution. Parameters: - <<<<<<< Updated upstream sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, number_channels)`): - ======= - sequences `(batch_size, num_samples, prediction_length, number_channels)`): - >>>>>>> Stashed changes Sampled values from the chosen distribution. """ From 0fd0ce701894cf2316e1e7bda8ed288d1e6dd732 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 18:58:18 +0200 Subject: [PATCH 084/189] formatting --- .../models/patchtst/modeling_patchtst.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index abeb092baa5b4d..8b01612ae3b54d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1057,13 +1057,13 @@ class SamplePatchTSTPredictionOutput(ModelOutput): @dataclass class SamplePatchTSTForecastOutput(ModelOutput): """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. 
- Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, - num_samples, prediction_length, number_channels)`): - Sampled values from the chosen distribution. + Parameters: + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, + num_samples, prediction_length, number_channels)`): + Sampled values from the chosen distribution. """ sequences: torch.FloatTensor = None From cb52b6f87f150ac1520f2b12da5544cad43bc12d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sat, 7 Oct 2023 11:16:22 +0200 Subject: [PATCH 085/189] Update docs/source/en/model_doc/patchtst.md Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/model_doc/patchtst.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 504be80c3e6c9a..12542a8dc5206b 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -26,7 +26,7 @@ The abstract from the paper is the following: Tips: -The model also adds a time series classification pipeline and time series regression pipeline. +The model can also be used for time series classification and time series regression. See the respective [`PatchTSTForClassification`] and [`PatchTSTForRegression`] classes. This model was contributed by [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), and [kashif](https://huggingface.co/kashif). From 3daec960700190206e3dcae03d70a724dea6ad6e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 8 Oct 2023 13:31:53 +0200 Subject: [PATCH 086/189] better var names --- .../models/patchtst/modeling_patchtst.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 8b01612ae3b54d..a888a1a58ccbb4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -265,17 +265,17 @@ def random_masking( if seed_number: set_seed(seed_number) - bs, nvars, L, D = xb.shape + batch_size, nvars, seq_len, feat = xb.shape - len_keep = int(L * (1 - mask_ratio)) + len_keep = int(seq_len * (1 - mask_ratio)) if channel_consistent_masking: - noise = torch.rand(bs, 1, L, device=xb.device) # noise in [0, 1], bs x 1 x L - noise = noise.repeat(1, nvars, 1) # bs x nvars x L + noise = torch.rand(batch_size, 1, seq_len, device=xb.device) # noise in [0, 1], bs x 1 x L + noise = noise.repeat(1, nvars, 1) # bs x nvars x time else: - noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L + noise = torch.rand(batch_size, nvars, seq_len, device=xb.device) # noise in [0, 1], bs x nvars x L - mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] + mask = torch.ones(batch_size, nvars, seq_len, device=xb.device) # mask: [bs x nvars x num_patch] mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -283,7 +283,7 @@ def random_masking( ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] mask = torch.gather(mask, dim=-1, index=ids_restore) - mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] + mask = 
mask.unsqueeze(-1).repeat(1, 1, 1, feat) # mask: [bs x nvars x num_patches x patch_length] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -325,29 +325,29 @@ def forecast_masking( if mix_ratio is None: mix_ratio = [1 for t in patch_lengths] - bs, nvars, L, D = xb.shape - mask = torch.zeros(bs, nvars, L, device=xb.device) + batch_size, nvars, seq_len, feat = xb.shape + mask = torch.zeros(batch_size, nvars, seq_len, device=xb.device) t_list = [] total_length = 0 total_ratio = sum(mix_ratio) for i, j in zip(patch_lengths, mix_ratio): - if i <= 0 or i >= L: + if i <= 0 or i >= seq_len: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") - temp_len = int(bs * j / total_ratio) + temp_len = int(batch_size * j / total_ratio) t_list.append([i, j, temp_len]) total_length += temp_len t_list = sorted(t_list, key=lambda x: x[2]) - if total_length < bs: - t_list[0][2] = t_list[0][2] + (bs - total_length) - elif total_length > bs: - t_list[-1][2] = t_list[-1][2] + (total_length - bs) + if total_length < batch_size: + t_list[0][2] = t_list[0][2] + (batch_size - total_length) + elif total_length > batch_size: + t_list[-1][2] = t_list[-1][2] + (total_length - batch_size) b1 = 0 - for p, r, l in t_list: + for p, _, l in t_list: b2 = b1 + l mask[b1:b2, :, -p:] = 1 b1 = b2 @@ -355,7 +355,7 @@ def forecast_masking( perm = torch.randperm(mask.shape[0]) mask = mask[perm] - mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + mask = mask.unsqueeze(-1).repeat(1, 1, 1, feat) # mask: [bs x nvars x num_patch x patch_len] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 From c82022ded984a86e1c6b00415987eb763cc3d06d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 09:51:28 +0200 Subject: [PATCH 087/189] rename PatchTSTTranspose --- .../models/patchtst/modeling_patchtst.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a888a1a58ccbb4..d086276b03ef0f 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 TSFM team. All rights reserved. +# Copyright 2023 IBM & Hugging Face. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
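# --- Editor's illustrative sketch (not part of this patch) -------------------
# The following hunks rename Transpose to PatchTSTTranspose. This standalone
# snippet only demonstrates the pattern that wrapper enables inside the encoder:
# nn.BatchNorm1d normalizes over dim 1, so sequence-first activations of shape
# [batch, seq_len, d_model] are transposed around it, as in the "batch" norm
# branches further down in this diff. The class name, d_model and tensor sizes
# here are placeholders chosen for illustration only.
import torch
from torch import nn


class TransposeSketch(nn.Module):
    # minimal stand-in for the PatchTSTTranspose wrapper defined in this file
    def __init__(self, *dims):
        super().__init__()
        self.dims = dims

    def forward(self, inputs):
        return inputs.transpose(*self.dims)


d_model = 16
norm = nn.Sequential(TransposeSketch(1, 2), nn.BatchNorm1d(d_model), TransposeSketch(1, 2))
hidden = torch.randn(4, 10, d_model)  # [batch, seq_len, d_model]
print(norm(hidden).shape)  # torch.Size([4, 10, 16])
# -----------------------------------------------------------------------------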
@@ -195,34 +195,35 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value -class Transpose(nn.Module): +class PatchTSTTranspose(nn.Module): def __init__(self, *dims, contiguous=False): super().__init__() self.dims, self.contiguous = dims, contiguous - def forward(self, x): + def forward(self, inputs): if self.contiguous: - return x.transpose(*self.dims).contiguous() + return inputs.transpose(*self.dims).contiguous() else: - return x.transpose(*self.dims) + return inputs.transpose(*self.dims) -def positional_encoding(pe, learn_pe, q_len, d_model): +def positional_encoding(position_embedding_type, learned, q_len, d_model): # Positional encoding - if pe is None: - w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe + if position_embedding_type is None: + # position_embedding_type = None and learned = False can be used to measure impact of positional encoding + w_pos = torch.empty((q_len, d_model)) nn.init.uniform_(w_pos, -0.02, 0.02) - learn_pe = False - elif pe == "zeros": + learned = False + elif position_embedding_type == "zeros": w_pos = torch.empty((q_len, d_model)) nn.init.uniform_(w_pos, -0.02, 0.02) - elif pe == "normal": + elif position_embedding_type == "normal": w_pos = torch.zeros((q_len, 1)) torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) - elif pe == "uniform": + elif position_embedding_type == "uniform": w_pos = torch.zeros((q_len, 1)) nn.init.uniform_(w_pos, a=0.0, b=0.1) - elif pe == "sincos": + elif position_embedding_type == "sincos": pos_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) @@ -233,9 +234,9 @@ def positional_encoding(pe, learn_pe, q_len, d_model): w_pos = pos_enc else: raise ValueError( - f"{pe} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." + f"{position_embedding_type} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." 
) - return nn.Parameter(w_pos, requires_grad=learn_pe) + return nn.Parameter(w_pos, requires_grad=learned) def random_masking( @@ -609,7 +610,9 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of the sublayer 1 self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer1 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + self.norm_sublayer1 = nn.Sequential( + PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) + ) else: self.norm_sublayer1 = nn.LayerNorm(config.d_model) @@ -617,7 +620,9 @@ def __init__(self, config: PatchTSTConfig): if self.channel_attention: self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + self.norm_sublayer2 = nn.Sequential( + PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) + ) else: self.norm_sublayer2 = nn.LayerNorm(config.d_model) @@ -632,7 +637,9 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of sublayer 3 self.dropout_path3 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer3 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + self.norm_sublayer3 = nn.Sequential( + PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) + ) else: self.norm_sublayer3 = nn.LayerNorm(config.d_model) From 687e3c84f927456a156af518485d288dcb117dfc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:11:27 +0200 Subject: [PATCH 088/189] fix argument names and docs string --- .../models/patchtst/modeling_patchtst.py | 81 ++++++++++--------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d086276b03ef0f..10393875750527 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -240,7 +240,7 @@ def positional_encoding(position_embedding_type, learned, q_len, d_model): def random_masking( - xb: torch.Tensor, + inputs: torch.Tensor, mask_ratio: float, unmasked_channel_indices: list = None, channel_consistent_masking: bool = False, @@ -249,34 +249,40 @@ def random_masking( ): """random_masking: Mask the input considering the control variables. - Parameters: - xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] - mask_ratio (float): Mask ratio. - unmasked_channel_indices (list, optional): + Args: + inputs (`torch.Tensor` of shape `(batch_size, nvars, seq_len, feat)`): + The input tensor to mask. + mask_ratio (`float`): + Mask ratio. + unmasked_channel_indices (list, *optional*): indices of unmasked channels. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): + channel_consistent_masking (bool, *optional* defaults to False): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to True. - mask_value (int, optional): Value to use for masking. Defaults to 0. - seed_number (int, optional): Value to set for the random seed. + across channels. Defaults to False. 
+ mask_value (int, *optional* defaults to 0): + Value to use for masking. + seed_number (int, *optional*): + Value to set for the random seed. Returns: - Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] + `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x + n] """ if seed_number: set_seed(seed_number) - batch_size, nvars, seq_len, feat = xb.shape + batch_size, nvars, seq_len, feat = inputs.shape + device = inputs.device len_keep = int(seq_len * (1 - mask_ratio)) if channel_consistent_masking: - noise = torch.rand(batch_size, 1, seq_len, device=xb.device) # noise in [0, 1], bs x 1 x L + noise = torch.rand(batch_size, 1, seq_len, device=device) # noise in [0, 1], bs x 1 x L noise = noise.repeat(1, nvars, 1) # bs x nvars x time else: - noise = torch.rand(batch_size, nvars, seq_len, device=xb.device) # noise in [0, 1], bs x nvars x L + noise = torch.rand(batch_size, nvars, seq_len, device=device) # noise in [0, 1], bs x nvars x L - mask = torch.ones(batch_size, nvars, seq_len, device=xb.device) # mask: [bs x nvars x num_patch] + mask = torch.ones(batch_size, nvars, seq_len, device=device) # mask: [bs x nvars x num_patch] mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -288,37 +294,40 @@ def random_masking( if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 - xb_mask = xb.masked_fill(mask.bool(), mask_value) - return xb_mask, mask[..., 0] + inputs_mask = inputs.masked_fill(mask.bool(), mask_value) + return inputs_mask, mask[..., 0] def forecast_masking( - xb: torch.Tensor, + inputs: torch.Tensor, patch_lengths: list, mix_ratio: list = None, unmasked_channel_indices: list = None, mask_value: int = 0, seed_number: Optional[int] = None, ): - """forecast_masking Mask last K patches where K is from the patch_lengths list. - For every batch, distribute the patch lengths based on mix_ratio Ignore masks for column indices mentioned in - cv_channel_indices + """Forecast masking that masks the last K patches where K is from the patch_lengths list. + For every batch, distribute the patch lengths based on mix_ratio and ignore masks for column indices mentioned in + unmasked_channel_indices. Args: - xb (Tensor): + inputs (`torch.Tensor`): Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] - patch_lengths (list): List of patch lengths to mask in the end of the data. - mix_ratio (list, optional): List of weights to use for each patch length. For Ex. - if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to - None. - unmasked_channel_indices (list, optional): + patch_lengths (list): + List of patch lengths to mask in the end of the data. + mix_ratio (list, *optional* defaults to None): + List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + then equal weights to both patch lengths. Defaults to None. + unmasked_channel_indices (list, *optional* defaults to None): Control Variable channel indices. These channels will not be masked. Defaults to None. - mask_value (int, optional): Value to use for masking. Defaults to 0. - seed_number (int, optional): Value to set for the random seed. + mask_value (int, *optional* defaults to 0): + Value to use for masking. Defaults to 0. + seed_number (int, *optional*): + Value to set for the random seed. 
Returns: - Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x - tsg2 x c x n] + `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape [bs x c + x n] or [bs x tsg1 x tsg2 x c x n] """ if seed_number: set_seed(seed_number) @@ -326,8 +335,8 @@ def forecast_masking( if mix_ratio is None: mix_ratio = [1 for t in patch_lengths] - batch_size, nvars, seq_len, feat = xb.shape - mask = torch.zeros(batch_size, nvars, seq_len, device=xb.device) + batch_size, nvars, seq_len, feat = inputs.shape + mask = torch.zeros(batch_size, nvars, seq_len, device=inputs.device) t_list = [] total_length = 0 @@ -360,8 +369,8 @@ def forecast_masking( if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 - xb_mask = xb.masked_fill(mask.bool(), mask_value) - return xb_mask, mask[..., 0] + inputs_mask = inputs.masked_fill(mask.bool(), mask_value) + return inputs_mask, mask[..., 0] def compute_num_patches(sequence_length, patch_length, stride): @@ -547,7 +556,7 @@ def forward(self, x: torch.Tensor): if self.mask_type == "random": x_mask, mask = random_masking( - xb=x, + inputs=x, mask_ratio=self.mask_ratio, unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, @@ -556,7 +565,7 @@ def forward(self, x: torch.Tensor): ) elif self.mask_type == "forecast": x_mask, mask = forecast_masking( - xb=x, + inputs=x, patch_lengths=self.mask_patches, mix_ratio=self.mask_patch_ratios, unmasked_channel_indices=self.unmasked_channel_indices, From 5469748c5d2238ced75ef98c0a7998f1f78c80fe Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:26:21 +0200 Subject: [PATCH 089/189] remove compute_num_patches and unused class --- .../models/patchtst/modeling_patchtst.py | 75 +------------------ 1 file changed, 4 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 10393875750527..7aff6af096b962 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -373,11 +373,7 @@ def forecast_masking( return inputs_mask, mask[..., 0] -def compute_num_patches(sequence_length, patch_length, stride): - return (max(sequence_length, patch_length) - patch_length) // stride + 1 - - -class Patchify(nn.Module): +class PatchTSTPatchify(nn.Module): """ A class to patchify the time series sequence into different patches @@ -408,8 +404,8 @@ def __init__( self.stride = stride # get the number of patches - self.num_patches = compute_num_patches(sequence_length, patch_length, stride) - new_sequence_length = patch_length + stride * (self.num_patches - 1) + num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1 + new_sequence_length = patch_length + stride * (num_patches - 1) self.s_begin = sequence_length - new_sequence_length def forward(self, past_values: torch.Tensor): @@ -433,69 +429,6 @@ def forward(self, past_values: torch.Tensor): return x -class PatchEmbeddings(nn.Module): - """ - Parameters: - A class to patchify the time series sequence into different patches - sequence_length (int, required): input sequence length. patch_length (int, required): patch length. stride - (int, required): stride between patches. 
- - Returns: - embeddings: output tensor data [bs x num_input_channels x num_patches x embed_dim] - """ - - def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_dim: int): - super().__init__() - - assert ( - sequence_length > patch_length - ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" - - # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" - - self.sequence_length = sequence_length - self.patch_length = patch_length - self.stride = stride - self.embed_dim = embed_dim - - # get the number of patches - self.num_patches = compute_num_patches(sequence_length, patch_length, stride) - new_sequence_length = patch_length + stride * (self.num_patches - 1) - self.s_begin = sequence_length - new_sequence_length - - # Embedding - self.projection = nn.Conv1d( - in_channels=1, - out_channels=embed_dim, - kernel_size=patch_length, - stride=stride, - ) - - def forward(self, past_values: torch.Tensor): - """ - Parameters: - past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] - Returns: - embeddings: output tensor data [bs x num_input_channels x num_patches x emb_dim] - """ - bs, sequence_length, num_input_channels = past_values.shape - assert ( - sequence_length == self.sequence_length - ), f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." - - x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] - # convert past_values to shape [bs*num_input_channels x 1 x sequence_length ] - x = x.transpose(1, 2).reshape(bs * num_input_channels, 1, -1).contiguous() - # projection - embeddings = self.projection(x) # embeddings: [bs*num_input_channels x emb_dim x num_patches] - # reshape - embeddings = ( - embeddings.transpose(1, 2).view(bs, num_input_channels, -1, self.embed_dim).contiguous() - ) # embeddings: [bs x num_input_channels x num_patches x emb_dim] - # embeddings = embeddings.flatten(2).transpose(1, 2) - return embeddings - - class PatchMasking(nn.Module): """ PatchMasking: Class to random or forcast masking. 
@@ -1264,7 +1197,7 @@ def __init__(self, config: PatchTSTConfig): else: self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) - self.patching = Patchify( + self.patching = PatchTSTPatchify( config.context_length, patch_length=config.patch_length, stride=config.stride, From d5c83591ba846812f64fe9d38f3c8315078d4d8d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:31:00 +0200 Subject: [PATCH 090/189] remove assert --- .../models/patchtst/modeling_patchtst.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7aff6af096b962..13d3acd9ed50a6 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -395,9 +395,10 @@ def __init__( ): super().__init__() - assert ( - sequence_length > patch_length - ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" + if sequence_length <= patch_length: + raise ValueError( + f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" + ) self.sequence_length = sequence_length self.patch_length = patch_length @@ -417,9 +418,10 @@ def forward(self, past_values: torch.Tensor): x: output tensor data [bs x num_input_channels x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert ( - sequence_length == self.sequence_length - ), f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + if sequence_length != self.sequence_length: + raise ValueError( + f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + ) x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( From a25d433103a33d0f01b64e7486cb0a52f333d672 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:35:19 +0200 Subject: [PATCH 091/189] renamed to PatchTSTMasking --- src/transformers/models/patchtst/modeling_patchtst.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 13d3acd9ed50a6..7c10dac7069b53 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -431,11 +431,11 @@ def forward(self, past_values: torch.Tensor): return x -class PatchMasking(nn.Module): +class PatchTSTMasking(nn.Module): """ - PatchMasking: Class to random or forcast masking. + PatchTSTMasking: Class for random or forcast masking on inputs. - Parameters: + Args: mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. mask_ratio (float, optional): Mask ratio. mask_patches (list, optional): List of patch lengths to mask in the end of the data. @@ -448,6 +448,9 @@ class PatchMasking(nn.Module): across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. 
+ + Returns: + """ def __init__( @@ -1207,7 +1210,7 @@ def __init__(self, config: PatchTSTConfig): self.mask_input = config.mask_input if self.mask_input: - self.masking = PatchMasking( + self.masking = PatchTSTMasking( mask_type=config.mask_type, mask_ratio=config.mask_ratio, mask_patches=config.mask_patches, From db96ed830fb9986d77d840bccb2fa14121cc6166 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:37:26 +0200 Subject: [PATCH 092/189] use num_labels for classification --- .../models/patchtst/configuration_patchtst.py | 2 +- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- tests/models/patchtst/test_modeling_patchtst.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e6f140165f624b..337caf70cd0895 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -253,7 +253,7 @@ def __init__( self.shared_projection = shared_projection # Classification - self.num_classes = num_classes + self.num_labels = num_labels # Forcasting and prediction self.prediction_length = prediction_length diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7c10dac7069b53..b54f50346bb79c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1384,7 +1384,7 @@ def __init__(self, config: PatchTSTConfig): self.pooling = config.pooling self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_classes) + self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_labels) def forward(self, x: torch.Tensor): """ diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 69c8dad8e44fea..c669ef6f44959e 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -71,7 +71,7 @@ def __init__( lags_sequence=[1, 2, 3, 4, 5], distil=False, seed_number=42, - num_classes=2, + num_labels=2, num_output_channels=2, ): self.parent = parent @@ -93,7 +93,7 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.seed_number = seed_number - self.num_classes = num_classes + self.num_labels = num_labels self.num_output_channels = num_output_channels self.distil = distil self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 @@ -113,7 +113,7 @@ def get_config(self): context_length=self.context_length, activation_function=self.hidden_act, seed_number=self.seed_number, - num_classes=self.num_classes, + num_labels=self.num_labels, num_output_channels=self.num_output_channels, ) @@ -191,7 +191,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): # if classification model: if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): rng = random.Random(self.model_tester.seed_number) - labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_classes, rng=rng) + labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_labels, rng=rng) inputs_dict["labels"] = labels inputs_dict.pop("future_values") 
elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): From b6d3b4e526e80c6f00d1c6a14718f880c4ddd8a4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:38:28 +0200 Subject: [PATCH 093/189] use num_labels --- src/transformers/models/patchtst/configuration_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 337caf70cd0895..07524457d29678 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -112,7 +112,7 @@ class PatchTSTConfig(PretrainedConfig): Mask value to set. pooling (`str`, *optional*, defaults to `"mean"`): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. - num_classes (`int`, *optional*, defaults to 1): + num_labels (`int`, *optional*, defaults to 1): Number of classes is defined for classification task. head_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for head. @@ -191,7 +191,7 @@ def __init__( mask_value=0, # head pooling: str = "mean", - num_classes: int = 1, + num_labels: int = 1, head_dropout: float = 0.0, prediction_length: int = 24, num_output_channels: int = 1, From ca648ef01b7beb3dd69e466f5115807cbf0d0df5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 11:15:34 +0200 Subject: [PATCH 094/189] use default num_labels from super class --- src/transformers/models/patchtst/configuration_patchtst.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 07524457d29678..f3764b9e41e3c3 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -112,8 +112,6 @@ class PatchTSTConfig(PretrainedConfig): Mask value to set. pooling (`str`, *optional*, defaults to `"mean"`): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. - num_labels (`int`, *optional*, defaults to 1): - Number of classes is defined for classification task. head_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for head. 
prediction_length (`int`): @@ -191,7 +189,6 @@ def __init__( mask_value=0, # head pooling: str = "mean", - num_labels: int = 1, head_dropout: float = 0.0, prediction_length: int = 24, num_output_channels: int = 1, @@ -252,9 +249,6 @@ def __init__( # Forecast head self.shared_projection = shared_projection - # Classification - self.num_labels = num_labels - # Forcasting and prediction self.prediction_length = prediction_length self.num_parallel_samples = num_parallel_samples From e56de1134de39c18a9505ea3679ac563f4fe6ba5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 11:21:06 +0200 Subject: [PATCH 095/189] move model_type after docstring --- src/transformers/models/patchtst/configuration_patchtst.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index f3764b9e41e3c3..d2144cd65eca44 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -29,7 +29,6 @@ class PatchTSTConfig(PretrainedConfig): - model_type = "patchtst" r""" This is the configuration class to store the configuration of an [`PatchTSTModel`]. It is used to instantiate an PatchTST model according to the specified arguments, defining the model architecture. @@ -139,6 +138,8 @@ class PatchTSTConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + + model_type = "patchtst" attribute_map = { "hidden_size": "d_model", "num_attention_heads": "encoder_attention_heads", From fcfa103b5211a2d8005e6f2e8c6f7824b8e34fc5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 11:29:11 +0200 Subject: [PATCH 096/189] renamed PatchTSTForMaskPretraining --- docs/source/en/model_doc/patchtst.md | 4 +-- src/transformers/__init__.py | 4 +-- src/transformers/models/patchtst/__init__.py | 4 +-- .../models/patchtst/configuration_patchtst.py | 33 ++++++++++--------- .../models/patchtst/modeling_patchtst.py | 10 +++--- src/transformers/utils/dummy_pt_objects.py | 2 +- .../models/patchtst/test_modeling_patchtst.py | 8 ++--- utils/check_repo.py | 2 +- 8 files changed, 34 insertions(+), 33 deletions(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 12542a8dc5206b..88094385c1500d 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -62,9 +62,9 @@ The original code can be found [here](https://github.com/yuqinie98/PatchTST). 
- forward -## PatchTSTForMaskPretraining +## PatchTSTForPretraining -[[autodoc]] PatchTSTForMaskPretraining +[[autodoc]] PatchTSTForPretraining - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index de83c9c45c978f..d94188d42f5e8a 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2439,7 +2439,7 @@ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTForClassification", "PatchTSTForForecasting", - "PatchTSTForMaskPretraining", + "PatchTSTForPretraining", "PatchTSTForPrediction", "PatchTSTForRegression", "PatchTSTModel", @@ -6269,7 +6269,7 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForMaskPretraining, + PatchTSTForPretraining, PatchTSTForPrediction, PatchTSTForRegression, PatchTSTModel, diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 8979bed2341ab2..8ca9b1f88eb8c4 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -36,7 +36,7 @@ "PatchTSTPreTrainedModel", "PatchTSTForPrediction", "PatchTSTForForecasting", - "PatchTSTForMaskPretraining", + "PatchTSTForPretraining", "PatchTSTForRegression", "PatchTSTForClassification", ] @@ -55,7 +55,7 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForMaskPretraining, + PatchTSTForPretraining, PatchTSTForPrediction, PatchTSTForRegression, PatchTSTModel, diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index d2144cd65eca44..f8ee3f75a9530a 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -41,7 +41,7 @@ class PatchTSTConfig(PretrainedConfig): num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. - context_length (`int`, defaults to 32): + context_length (`int`, defaults to 32, *optional*, defaults to 32): The context length for the encoder. distribution_output (`string`, *optional*, defaults to `"student_t"`): The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or @@ -54,15 +54,15 @@ class PatchTSTConfig(PretrainedConfig): Define the patch length of the patchification process. Default to 1. stride (`int`, *optional*, defaults to 1): define the stride of the patchification process. Default to 1. - encoder_layers (`int`, *optional*, defaults to 2): + encoder_layers (`int`, *optional*, defaults to 3): Number of encoder layers. d_model (`int`, *optional*, defaults to 64): Dimensionality of the transformer layers. encoder_attention_heads (`int`, *optional*, defaults to 4): Number of attention heads for each attention layer in the Transformer encoder. - shared_embedding (`bool`, *optional*, defaults to True): + shared_embedding (`bool`, *optional*, defaults to `True`): Sharing the input embedding across all channels. - channel_attention (`bool`, *optional*, defaults to False): + channel_attention (`bool`, *optional*, defaults to `False`): Activate channel attention block in the Transformer to allow channels to attend each other. encoder_ffn_dim (`int`, *optional*, defaults to 256): Dimension of the "intermediate" (often named feed-forward) layer in encoder. 
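For orientation, the patching hyperparameters documented in this config (context length, patch length, stride) fix how many tokens the encoder sees per channel. A minimal sketch of that relationship, assuming the argument names listed in the docstring above; the values are made up for illustration and the formula mirrors the helper used in the test file earlier in this series:

```python
from transformers import PatchTSTConfig

# Example values only; any real checkpoint defines its own hyperparameters.
config = PatchTSTConfig(
    num_input_channels=7,
    context_length=32,
    patch_length=8,
    stride=8,
    d_model=64,
    encoder_layers=3,
    encoder_attention_heads=4,
)

# One patch every `stride` steps over the context window,
# the same computation as in the test helper.
num_patches = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1
print(num_patches)  # 4 with these example values
```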
@@ -78,23 +78,24 @@ class PatchTSTConfig(PretrainedConfig): The dropout path in the residual block. ff_dropout (`float`, *optional*, defaults to 0.0): The dropout probability used between the two layers of the feed-forward networks. - bias (`bool`, *optional*, defaults to True): + bias (`bool`, *optional*, defaults to `True`): Consider bias in the feed-forward networks. activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. + pre_norm (`bool`, *optional*, defaults to `False`): positional_encoding (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. - learn_pe (`bool`, *optional*, defaults to False): + learn_pe (`bool`, *optional*, defaults to `False`): Whether the positional encoding is updated during training. - use_cls_token (`bool`, *optional*, defaults to False): + use_cls_token (`bool`, *optional*, defaults to `False`): Whether cls token is used. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. - shared_projection (`bool`, *optional*, defaults to True): + shared_projection (`bool`, *optional*, defaults to `True`): Sharing the projection layer across different channels in the forecast head. - seed_number (`int`, *optional*, defaults to None): + seed_number (`int`, *optional*): Use seed number for random masking. - scaling (`string` or `bool`, *optional* defaults to `"mean"`): + scaling (`string` or `bool`, *optional*, defaults to `"mean"`): Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the scaler is set to "mean". mask_input (`bool`, *optional*, defaults to False): @@ -103,9 +104,11 @@ class PatchTSTConfig(PretrainedConfig): Masking type. Only `"random"` is currently supported. mask_ratio (`float`, *optional*, defaults to 0.5): Masking ratio is applied to mask the input data during pretraining. - channel_consistent_masking (`bool`, *optional*, defaults to False): + mask_patches (`List`, *optional*, defaults to `[2, 3]`): + mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): + channel_consistent_masking (`bool`, *optional*, defaults to `False`): If channel consistent masking is True, all the channels will have the same masking. - unmasked_channel_indices (`list`, *optional*, defaults to None): + unmasked_channel_indices (`list`, *optional*): Channels are not masked during pretraining. mask_value (`int`, *optional*, defaults to 0): Mask value to set. @@ -113,13 +116,11 @@ class PatchTSTConfig(PretrainedConfig): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. head_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for head. - prediction_length (`int`): - The prediction length for the encoder. In other words, the prediction horizon of the model. - prediction_length (`int`): + prediction_length (`int`, *optional*, defaults to 24): The prediction length for the encoder. In other words, the prediction horizon of the model. num_output_channels (`int`, *optional*, defaults to 1): Number of output channels. - prediction_range (`list`, *optional*, defaults to None): + prediction_range (`list`, *optional*): The range of prediction values can be set to enforce the model to produce values within a range. 
num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for probablistic forecast. diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b54f50346bb79c..e7ecad05adf5c7 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -845,9 +845,9 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): @dataclass -class PatchTSTForMaskPretrainingOutput(ModelOutput): +class PatchTSTForPretrainingOutput(ModelOutput): """ - Output type of [`PatchTSTForMaskPretraining`]. + Output type of [`PatchTSTForPretraining`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): @@ -1288,7 +1288,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class PatchTSTForMaskPretraining(PatchTSTPreTrainedModel): +class PatchTSTForPretraining(PatchTSTPreTrainedModel): # PatchTSTModel + Pretraining Head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1308,7 +1308,7 @@ def forward( future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, PatchTSTForMaskPretrainingOutput]: + ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels """ @@ -1332,7 +1332,7 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: return tuple(v for v in [masked_loss, x_hat, encoder_states] if v is not None) - return PatchTSTForMaskPretrainingOutput( + return PatchTSTForPretrainingOutput( loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 45522f0c8da893..d60b81511deda0 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5921,7 +5921,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PatchTSTForMaskPretraining(metaclass=DummyObject): +class PatchTSTForPretraining(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index c669ef6f44959e..14a47cd8ad523b 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -41,7 +41,7 @@ PatchTSTConfig, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForMaskPretraining, + PatchTSTForPretraining, PatchTSTForPrediction, PatchTSTForRegression, PatchTSTModel, @@ -149,7 +149,7 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, - PatchTSTForMaskPretraining, + PatchTSTForPretraining, PatchTSTForClassification, PatchTSTForRegression, ) @@ -157,7 +157,7 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase else () ) all_generative_model_classes = ( - (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () + (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () ) pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} test_pruning = False @@ -305,7 
+305,7 @@ def prepare_batch(repo_id="ibm/etth1-forecast-test", file="train-batch.pt"): class PatchTSTModelIntegrationTests(unittest.TestCase): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst-etth1-pretrain").to(torch_device) + model = PatchTSTForPretraining.from_pretrained("ibm/patchtst-etth1-pretrain").to(torch_device) batch = prepare_batch() torch.manual_seed(0) diff --git a/utils/check_repo.py b/utils/check_repo.py index 33cd397a627710..5358f4854b410e 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -179,7 +179,7 @@ "InformerForPrediction", "AutoformerForPrediction", "PatchTSTForForecasting", - "PatchTSTForMaskPretraining", + "PatchTSTForPretraining", "PatchTSTForPrediction", "JukeboxVQVAE", "JukeboxPrior", From cd0133f77b7590caa4f3a593210291a80dc27f69 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 13:36:08 +0200 Subject: [PATCH 097/189] bs -> batch_size --- .../models/patchtst/modeling_patchtst.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e7ecad05adf5c7..5a48e47ff6f63b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -594,14 +594,14 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x sequence_length x d_model] Return: - Tensor [bs x nvars x sequence_length x d_model] + src: tensor [batch_size x nvars x sequence_length x d_model] Return: + Tensor [batch_size x nvars x sequence_length x d_model] """ - bs, num_input_channels, sequence_length, d_model = src.shape + batch_size, num_input_channels, sequence_length, d_model = src.shape # First sublayer: attention across time src = src.view( - bs * num_input_channels, sequence_length, d_model + batch_size * num_input_channels, sequence_length, d_model ) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection @@ -613,7 +613,7 @@ def forward(self, src: torch.Tensor): src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src)[0]) ) # src: [(bs*nvars) x sequence_length x d_model] - src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = src.reshape(batch_size, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] @@ -632,13 +632,13 @@ def forward(self, src: torch.Tensor): src + self.dropout_path2(self.self_attn(src)[0]) ) # src: [(bs*sequence_length) x nvars x d_model] src = ( - src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() + src.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() ) # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden src = src.view( - bs * num_input_channels, sequence_length, d_model - ) # src: [(bs*nvars) x sequence_length x d_model] + batch_size * num_input_channels, sequence_length, d_model + ) # src: [(batch_size*nvars) x sequence_length x d_model] if 
self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( @@ -649,7 +649,7 @@ def forward(self, src: torch.Tensor): src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout - src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = src.reshape(batch_size, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src From bc2bf31f76567f3319541bb64d3580290688d004 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 15:09:55 +0200 Subject: [PATCH 098/189] more review fixes --- .../models/patchtst/modeling_patchtst.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5a48e47ff6f63b..d3da375028b995 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -545,9 +545,8 @@ def __init__(self, config: PatchTSTConfig): super().__init__() self.channel_attention = config.channel_attention - # Multi-Head attention - # self.self_attn = PatchTSTAttention(config) + # Multi-Head attention self.self_attn = PatchTSTAttention( embed_dim=config.d_model, num_heads=config.encoder_attention_heads, @@ -673,11 +672,11 @@ def _init_weights(self, module): module.bias.data.zero_() def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (ChannelAttentionPatchTSTEncoder)): + if isinstance(module, (PatchTSTEncoder)): module.gradient_checkpointing = value -class ChannelAttentionPatchTSTEncoder(PatchTSTPreTrainedModel): +class PatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -730,8 +729,7 @@ def forward( tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ - # bs, num_patches, num_input_channels, patch_length = x.shape - bs, num_input_channels, num_patches, patch_length = past_values.shape + _, num_input_channels, _, _ = past_values.shape output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1222,7 +1220,7 @@ def __init__(self, config: PatchTSTConfig): ) else: self.masking = nn.Identity() - self.encoder = ChannelAttentionPatchTSTEncoder(config) + self.encoder = PatchTSTEncoder(config) # Initialize weights and apply final processing self.post_init() From b8a8231781490d1fa66799288447dabe47a85ab1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 15:15:46 +0200 Subject: [PATCH 099/189] use hidden_state --- src/transformers/__init__.py | 4 +-- src/transformers/models/patchtst/__init__.py | 2 +- .../models/patchtst/modeling_patchtst.py | 26 ++++++++++--------- .../models/patchtst/test_modeling_patchtst.py | 2 +- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d94188d42f5e8a..e43c7ef59175cb 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2439,8 +2439,8 @@ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTForClassification", "PatchTSTForForecasting", - "PatchTSTForPretraining", "PatchTSTForPrediction", + "PatchTSTForPretraining", "PatchTSTForRegression", "PatchTSTModel", "PatchTSTPreTrainedModel", 
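The reshapes renamed in the hunk above fold either the channel dimension or the time dimension into the batch dimension, so the same attention module can attend across time and then across channels. A shape-only sketch of that pattern, using random tensors and example sizes rather than trained weights:

```python
import torch

batch_size, num_channels, sequence_length, d_model = 2, 7, 12, 64
src = torch.randn(batch_size, num_channels, sequence_length, d_model)

# Attention across time: fold channels into the batch dimension.
src_time = src.view(batch_size * num_channels, sequence_length, d_model)

# Attention across channels at each time step: fold time into the batch dimension.
src_channel = src.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_channels, d_model)

print(src_time.shape)     # torch.Size([14, 12, 64])
print(src_channel.shape)  # torch.Size([24, 7, 64])
```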
@@ -6269,8 +6269,8 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForPretraining, PatchTSTForPrediction, + PatchTSTForPretraining, PatchTSTForRegression, PatchTSTModel, PatchTSTPreTrainedModel, diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 8ca9b1f88eb8c4..e2ac594688d90e 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -55,8 +55,8 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForPretraining, PatchTSTForPrediction, + PatchTSTForPretraining, PatchTSTForRegression, PatchTSTModel, PatchTSTPreTrainedModel, diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d3da375028b995..74d92b4ec88081 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -524,20 +524,20 @@ def __init__(self, config: PatchTSTConfig): self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): + def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x sequence_length x d_model] Return: + hidden_state: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] for mod in self.layers: - src = mod(src) + hidden_state = mod(hidden_state) if output_hidden_states: - all_hidden_states.append(src) + all_hidden_states.append(hidden_state) if output_hidden_states is None: - return src, None - return src, all_hidden_states + return hidden_state, None + return hidden_state, all_hidden_states class ChannelAttentionTSTEncoderLayer(nn.Module): @@ -612,13 +612,15 @@ def forward(self, src: torch.Tensor): src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src)[0]) ) # src: [(bs*nvars) x sequence_length x d_model] - src = src.reshape(batch_size, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = src.reshape( + batch_size, num_input_channels, sequence_length, d_model + ) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] if self.channel_attention: src = ( - src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, d_model) + src.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) ) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection @@ -648,7 +650,9 @@ def forward(self, src: torch.Tensor): src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout - src = src.reshape(batch_size, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = src.reshape( + batch_size, num_input_channels, sequence_length, d_model + ) # [bs x nvars x sequence_length x d_model] return src @@ -1330,9 +1334,7 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: return tuple(v for v in [masked_loss, x_hat, encoder_states] if 
v is not None) - return PatchTSTForPretrainingOutput( - loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states - ) + return PatchTSTForPretrainingOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states) class PatchTSTForClassification(PatchTSTPreTrainedModel): diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 14a47cd8ad523b..d25cc525326ab5 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -41,8 +41,8 @@ PatchTSTConfig, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForPretraining, PatchTSTForPrediction, + PatchTSTForPretraining, PatchTSTForRegression, PatchTSTModel, ) From 8c3ab7f44585420f9f09b473b2fedb385ba3c98a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 15:25:11 +0200 Subject: [PATCH 100/189] rename encoder layer and block class --- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 74d92b4ec88081..faba4597a0f01a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -518,11 +518,11 @@ def forward(self, x: torch.Tensor): return x_mask, mask -class ChannelAttentionTSTEncoder(nn.Module): +class PatchTSTEncoderBlock(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) + self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ @@ -540,7 +540,7 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo return hidden_state, all_hidden_states -class ChannelAttentionTSTEncoderLayer(nn.Module): +class PatchTSTEncoderLayer(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -716,7 +716,7 @@ def __init__(self, config: PatchTSTConfig): ) # Encoder - self.encoder = ChannelAttentionTSTEncoder(config) + self.encoder = PatchTSTEncoderBlock(config) # Initialize weights and apply final processing self.post_init() From 2553965ad3f406282b844f12172e9c853513b682 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 9 Oct 2023 13:55:45 -0400 Subject: [PATCH 101/189] remove commented seed_number --- src/transformers/models/patchtst/modeling_patchtst.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index faba4597a0f01a..128f6e772f6739 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -464,8 +464,6 @@ def __init__( mask_value=0, seed_number: Optional[int] = None, ): - # if seed_number: - # set_seed(seed_number) self.mask_ratio = mask_ratio self.channel_consistent_masking = channel_consistent_masking self.mask_type = mask_type From 85538b163b7c32892f09d35036a0d7c87b16bd97 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 9 Oct 2023 15:11:12 -0400 Subject: [PATCH 102/189] edit docstring --- .../models/patchtst/modeling_patchtst.py | 66 ++++++++++++------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git 
a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 128f6e772f6739..87ad4673d9b81d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -310,7 +310,7 @@ def forecast_masking( For every batch, distribute the patch lengths based on mix_ratio and ignore masks for column indices mentioned in unmasked_channel_indices. - Args: + Parameters: inputs (`torch.Tensor`): Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] patch_lengths (list): @@ -383,7 +383,7 @@ class PatchTSTPatchify(nn.Module): stride (int, required): stride between patches. Returns: - z: output tensor data [bs x num_input_channels x num_patches x patch_length] + `torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)` """ def __init__( @@ -412,10 +412,10 @@ def __init__( def forward(self, past_values: torch.Tensor): """ Parameters: - past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] + past_values (`torch.Tensor` of shape `(batch_size, sequence_length, nvars)`, *required*): Returns: - x: output tensor data [bs x num_input_channels x num_patches x patch_length] + `torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)` """ sequence_length = past_values.shape[-2] if sequence_length != self.sequence_length: @@ -433,9 +433,9 @@ def forward(self, past_values: torch.Tensor): class PatchTSTMasking(nn.Module): """ - PatchTSTMasking: Class for random or forcast masking on inputs. + Class for random or forcast masking on inputs. - Args: + Parameters: mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. mask_ratio (float, optional): Mask ratio. mask_patches (list, optional): List of patch lengths to mask in the end of the data. @@ -450,6 +450,10 @@ class PatchTSTMasking(nn.Module): seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. 
Returns: + x_mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`) + Masked patched input + mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches)`) + Bool tensor indicating True on masked points """ @@ -479,15 +483,16 @@ def __init__( def forward(self, x: torch.Tensor): """ - Input: - x: patched input - 4D: [bs x num_input_channels x num_patches x patch_length] - - Output: - x_mask: Masked patched input - 4D: [bs x num_input_channels x num_patches x patch_length] - mask: bool tensor indicating True on masked points - 4D: [bs x num_input_channels x num_patch] + Parameters: + x (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`, *required*): + Patched input + + Return: + x_mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`) + Masked patched input + mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches)`) + Bool tensor indicating True on masked points + """ if self.mask_type == "random": @@ -517,6 +522,9 @@ def forward(self, x: torch.Tensor): class PatchTSTEncoderBlock(nn.Module): + """ + PatchTST encoder block + """ def __init__(self, config: PatchTSTConfig): super().__init__() @@ -524,8 +532,14 @@ def __init__(self, config: PatchTSTConfig): def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - hidden_state: tensor [bs x nvars x sequence_length x d_model] Return: - Tensor [bs x nvars x sequence_length x d_model] + Parameters: + hidden_state (`torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)`, *required*): + Past values of the time series + output_hidden_states (`bool`, *optional*): + output hidden state option + Return: + `torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)` + """ all_hidden_states = [] @@ -539,6 +553,9 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo class PatchTSTEncoderLayer(nn.Module): + """ + PatchTST encoder layer + """ def __init__(self, config: PatchTSTConfig): super().__init__() @@ -591,8 +608,12 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [batch_size x nvars x sequence_length x d_model] Return: - Tensor [batch_size x nvars x sequence_length x d_model] + Parameters: + src (`torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)`, *required*): + Past values of the time series + Return: + `torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)` + """ batch_size, num_input_channels, sequence_length, d_model = src.shape @@ -724,12 +745,13 @@ def forward( ) -> BaseModelOutputWithNoAttention: """ Parameters: - past_values: tensor [bs x nvars x num_patches x patch_length]. + past_values (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`, *required*): + Past values of the time series output_hidden_states (bool, optional): Indicates if hidden states should be output. 
return: - tensor [bs x nvars x num_patches x d_model] - or [bs x nvars x (num_patches+1) x d_model] if use cls_token + `torch.Tensor` of shape `(batch_size, nvars, num_patches, d_model)` + or `(batch_size, nvars, num_patches+1, d_model)` if cls_token is used """ _, num_input_channels, _, _ = past_values.shape From c36370deaa3d008c6f6254d113244ab8d75bd38c Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 9 Oct 2023 17:48:00 -0400 Subject: [PATCH 103/189] Add docstring --- .../models/patchtst/modeling_patchtst.py | 232 ++++++++++-------- 1 file changed, 130 insertions(+), 102 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 87ad4673d9b81d..3ebf831273f73c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -42,7 +42,14 @@ # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->PatchTST class PatchTSTAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" + """ + Multi-headed attention from 'Attention Is All You Need' paper + + Parameters: + hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): + Input to the multi-head attention block + + """ def __init__( self, @@ -71,8 +78,8 @@ def __init__( self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, sequence_length: int, bsz: int): + return tensor.view(bsz, sequence_length, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, @@ -196,11 +203,23 @@ def forward( class PatchTSTTranspose(nn.Module): + """ + Transpose the tensor to the dimension defined in **dims** + Parameters: + dims (`list`): list of dimensions to be transposed + contiguous (`bool`): if True, the transposed tensor is contiguous + """ def __init__(self, *dims, contiguous=False): super().__init__() self.dims, self.contiguous = dims, contiguous - def forward(self, inputs): + def forward(self, inputs: torch.Tensor): + """ + Parameters: + inputs (`torch.Tensor`): input to be transposed + Returns: + `torch.Tensor`: transposed tensor + """ if self.contiguous: return inputs.transpose(*self.dims).contiguous() else: @@ -244,13 +263,13 @@ def random_masking( mask_ratio: float, unmasked_channel_indices: list = None, channel_consistent_masking: bool = False, - mask_value=0, + mask_value: int = 0, seed_number: Optional[int] = None, ): """random_masking: Mask the input considering the control variables. Args: - inputs (`torch.Tensor` of shape `(batch_size, nvars, seq_len, feat)`): + inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`): The input tensor to mask. mask_ratio (`float`): Mask ratio. 
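The `random_masking` hunks above and below rank per-patch noise to decide which patches get hidden during pretraining. A small self-contained sketch of that masking logic, with example shapes and a hypothetical mask ratio:

```python
import torch

batch_size, num_channels, num_patches, patch_length = 2, 3, 5, 4
patches = torch.randn(batch_size, num_channels, num_patches, patch_length)

mask_ratio = 0.4
len_keep = int(num_patches * (1 - mask_ratio))             # patches kept visible per series

noise = torch.rand(batch_size, num_channels, num_patches)  # one score per patch
ids_shuffle = torch.argsort(noise, dim=-1)
ids_restore = torch.argsort(ids_shuffle, dim=-1)

mask = torch.ones(batch_size, num_channels, num_patches)
mask[:, :, :len_keep] = 0                                   # 0 = keep, 1 = mask
mask = torch.gather(mask, dim=-1, index=ids_restore)        # scatter the pattern back in random order
mask = mask.unsqueeze(-1).repeat(1, 1, 1, patch_length)

masked_patches = patches.masked_fill(mask.bool(), 0.0)      # mask_value of 0, as in the config default
```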
@@ -271,26 +290,26 @@ def random_masking( if seed_number: set_seed(seed_number) - batch_size, nvars, seq_len, feat = inputs.shape + batch_size, num_channels, sequence_length, num_features = inputs.shape device = inputs.device - len_keep = int(seq_len * (1 - mask_ratio)) + len_keep = int(sequence_length * (1 - mask_ratio)) if channel_consistent_masking: - noise = torch.rand(batch_size, 1, seq_len, device=device) # noise in [0, 1], bs x 1 x L - noise = noise.repeat(1, nvars, 1) # bs x nvars x time + noise = torch.rand(batch_size, 1, sequence_length, device=device) # noise in [0, 1], bs x 1 x L + noise = noise.repeat(1, num_channels, 1) # bs x num_channels x time else: - noise = torch.rand(batch_size, nvars, seq_len, device=device) # noise in [0, 1], bs x nvars x L + noise = torch.rand(batch_size, num_channels, sequence_length, device=device) # noise in [0, 1], bs x num_channels x L - mask = torch.ones(batch_size, nvars, seq_len, device=device) # mask: [bs x nvars x num_patch] + mask = torch.ones(batch_size, num_channels, sequence_length, device=device) # mask: [bs x num_channels x num_patch] mask[:, :, :len_keep] = 0 # sort noise for each sample ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove - ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x num_channels x L] mask = torch.gather(mask, dim=-1, index=ids_restore) - mask = mask.unsqueeze(-1).repeat(1, 1, 1, feat) # mask: [bs x nvars x num_patches x patch_length] + mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features) # mask: [bs x num_channels x num_patches x patch_length] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -312,7 +331,7 @@ def forecast_masking( Parameters: inputs (`torch.Tensor`): - Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] + Input to mask [ bs x num_channels x num_patch x patch_len] or [ bs x tsg1 x tag2 x num_channels x num_patch x patch_len] patch_lengths (list): List of patch lengths to mask in the end of the data. mix_ratio (list, *optional* defaults to None): @@ -335,15 +354,15 @@ def forecast_masking( if mix_ratio is None: mix_ratio = [1 for t in patch_lengths] - batch_size, nvars, seq_len, feat = inputs.shape - mask = torch.zeros(batch_size, nvars, seq_len, device=inputs.device) + batch_size, num_channels, sequence_length, num_features = inputs.shape + mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device) t_list = [] total_length = 0 total_ratio = sum(mix_ratio) for i, j in zip(patch_lengths, mix_ratio): - if i <= 0 or i >= seq_len: + if i <= 0 or i >= sequence_length: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") temp_len = int(batch_size * j / total_ratio) t_list.append([i, j, temp_len]) @@ -365,7 +384,7 @@ def forecast_masking( perm = torch.randperm(mask.shape[0]) mask = mask[perm] - mask = mask.unsqueeze(-1).repeat(1, 1, 1, feat) # mask: [bs x nvars x num_patch x patch_len] + mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features) # mask: [bs x num_channels x num_patch x patch_len] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -383,7 +402,7 @@ class PatchTSTPatchify(nn.Module): stride (int, required): stride between patches. 
Returns: - `torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)` + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` """ def __init__( @@ -412,10 +431,10 @@ def __init__( def forward(self, past_values: torch.Tensor): """ Parameters: - past_values (`torch.Tensor` of shape `(batch_size, sequence_length, nvars)`, *required*): + past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*): Returns: - `torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)` + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` """ sequence_length = past_values.shape[-2] if sequence_length != self.sequence_length: @@ -423,11 +442,11 @@ def forward(self, past_values: torch.Tensor): f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." ) - x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x num_channels] x = x.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # x: [bs x num_patches x num_input_channels x patch_length] - x = x.transpose(-2, -3).contiguous() # xb: [bs x num_input_channels x num_patches x patch_length] + x = x.transpose(-2, -3).contiguous() # x: [bs x num_input_channels x num_patches x patch_length] return x @@ -450,9 +469,9 @@ class PatchTSTMasking(nn.Module): seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. Returns: - x_mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`) + x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) Masked patched input - mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches)`) + mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`) Bool tensor indicating True on masked points """ @@ -484,13 +503,13 @@ def __init__( def forward(self, x: torch.Tensor): """ Parameters: - x (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`, *required*): + x (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Patched input Return: - x_mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`) + x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) Masked patched input - mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches)`) + mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`) Bool tensor indicating True on masked points """ @@ -533,12 +552,12 @@ def __init__(self, config: PatchTSTConfig): def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ Parameters: - hidden_state (`torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)`, *required*): + hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): Past values of the time series output_hidden_states (`bool`, *optional*): output hidden state option Return: - `torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)` + `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ all_hidden_states = [] @@ -609,10 +628,10 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ Parameters: - src (`torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)`, *required*): + src (`torch.Tensor` of shape 
`(batch_size, num_channels, sequence_length, d_model)`, *required*): Past values of the time series Return: - `torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)` + `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ batch_size, num_input_channels, sequence_length, d_model = src.shape @@ -620,7 +639,7 @@ def forward(self, src: torch.Tensor): # First sublayer: attention across time src = src.view( batch_size * num_input_channels, sequence_length, d_model - ) # src: [(bs*nvars) x sequence_length x d_model] + ) # src: [(bs*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( @@ -630,17 +649,18 @@ def forward(self, src: torch.Tensor): ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src)[0]) - ) # src: [(bs*nvars) x sequence_length x d_model] + ) # src: [(bs*num_channels) x sequence_length x d_model] src = src.reshape( batch_size, num_input_channels, sequence_length, d_model - ) # [bs x nvars x sequence_length x d_model] + ) # [bs x num_channels x sequence_length x d_model] # second sublayer: attention across variable at any given time - # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] + # [bs x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] + # -> [(bs*sequence_length) x num_channels x d_model] if self.channel_attention: src = ( src.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) - ) # [(bs*sequence_length) x nvars x d_model] + ) # [(bs*sequence_length) x num_channels x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( @@ -650,15 +670,15 @@ def forward(self, src: torch.Tensor): ## Multi-Head attention and Add residual connection and Norm src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src)[0]) - ) # src: [(bs*sequence_length) x nvars x d_model] + ) # src: [(bs*sequence_length) x num_channels x d_model] src = ( src.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() - ) # src: [bs x nvars x sequence_length x d_model] + ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden src = src.view( batch_size * num_input_channels, sequence_length, d_model - ) # src: [(batch_size*nvars) x sequence_length x d_model] + ) # src: [(batch_size*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( @@ -671,7 +691,7 @@ def forward(self, src: torch.Tensor): ) # Add: residual connection with residual dropout src = src.reshape( batch_size, num_input_channels, sequence_length, d_model - ) # [bs x nvars x sequence_length x d_model] + ) # [bs x num_channels x sequence_length x d_model] return src @@ -745,13 +765,13 @@ def forward( ) -> BaseModelOutputWithNoAttention: """ Parameters: - past_values (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`, *required*): + past_values (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Past values of the time series output_hidden_states (bool, optional): Indicates if hidden states should be output. 
return: - `torch.Tensor` of shape `(batch_size, nvars, num_patches, d_model)` - or `(batch_size, nvars, num_patches+1, d_model)` if cls_token is used + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` + or `(batch_size, num_channels, num_patches+1, d_model)` if cls_token is used """ _, num_input_channels, _, _ = past_values.shape @@ -766,23 +786,23 @@ def forward( x_out.append(z) past_values = torch.stack(x_out, dim=1) else: - past_values = self.w_p(past_values) # x: [bs x nvars x num_patches x d_model] + past_values = self.w_p(past_values) # x: [bs x num_channels x num_patches x d_model] if self.use_cls_token: - # x: [bs x nvars x num_patches x d_model] + # x: [bs x num_channels x num_patches x d_model] past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patches+1) x d_model] + past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x num_channels x (num_patches+1) x d_model] else: - past_values = self.positional_dropout(past_values + self.w_pos) # x: [bs x nvars x num_patches x d_model] + past_values = self.positional_dropout(past_values + self.w_pos) # x: [bs x num_channels x num_patches x d_model] # Encoder past_values, hidden_states = self.encoder( past_values, output_hidden_states - ) # x: [bs x nvars x num_patches x d_model] - # or [bs x nvars x (num_patches+1) x d_model] if use cls_token + ) # x: [bs x num_channels x num_patches x d_model] + # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) @@ -1098,7 +1118,7 @@ class PatchTSTStdScaler(nn.Module): Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - Args: + Parameters: dim (`int`): Dimension along which to calculate the mean and standard deviation. keepdim (`bool`, *optional*, defaults to `False`): @@ -1132,7 +1152,7 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data accordingly. - Args: + Parameters: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -1189,7 +1209,7 @@ class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - Args: + Parameters: dim (`int`): Dimension along which to compute the scale. 
keepdim (`bool`, *optional*, defaults to `False`): @@ -1265,7 +1285,7 @@ def forward( if past_observed_mask is None: past_observed_mask = torch.ones_like(past_values) - # x: tensor [bs x seq_len x in_channels] + # x: tensor [bs x sequence_length x num_input_channels] scaled_past_values, loc, scale = self.scaler(past_values, past_observed_mask) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain @@ -1300,11 +1320,11 @@ def __init__(self, config): def forward(self, x: torch.Tensor) -> torch.Tensor: """ - x: tensor [bs x nvars x num_patches x d_model] - or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: tensor [bs x nvars x num_patches x patch_length] + x: tensor [bs x num_channels x num_patches x d_model] + or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + output: tensor [bs x num_channels x num_patches x patch_length] """ - x = self.linear(self.dropout(x)) # [bs x nvars x num_patches x patch_length] + x = self.linear(self.dropout(x)) # [bs x num_channels x num_patches x patch_length] if self.use_cls_token: x = x[:, :, 1:, :] # remove the first cls token return x @@ -1339,12 +1359,12 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # past_values: [bs x nvars x num_patches x d_model] or - # [bs x nvars x (num_patches+1) x d_model] if use cls_token + # past_values: [bs x num_channels x num_patches x d_model] or + # [bs x num_channels x (num_patches+1) x d_model] if use cls_token model_output = self.model(past_values, output_hidden_states=output_hidden_states) - # model_output[0]: [bs x nvars x num_patches x patch_length] or - # [bs x nvars x (num_patches+1) x patch_length] if use cls_token + # model_output[0]: [bs x num_channels x num_patches x patch_length] or + # [bs x num_channels x (num_patches+1) x patch_length] if use cls_token x_hat = self.head(model_output[0]) # calculate masked_loss @@ -1406,21 +1426,22 @@ def __init__(self, config: PatchTSTConfig): self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_labels) - def forward(self, x: torch.Tensor): + def forward(self, embedding: torch.Tensor): """ - x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: + embedding: [bs x num_channels x num_patches x d_model] + or [bs x num_channels x (num_patches+1) x d_model] if use cls_token output: [bs x n_classes] """ if self.use_cls_token: - x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model + x = embedding[:, :, 0, :] # use the first output token, x: bs x num_channels x d_model elif self.pooling == "mean": - x = x.mean(dim=2) # x: [bs x nvars x d_model] + x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] elif self.pooling == "max": - x = x.max(dim=2) # x: [bs x nvars x d_model] + x = embedding.max(dim=2) # x: [bs x num_channels x d_model] else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - x = self.flatten(x) # x: bs x nvars * d_model + x = self.flatten(x) # x: bs x num_channels * d_model y = self.linear(self.dropout(x)) # y: bs x n_classes return y @@ -1443,37 +1464,40 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): else: self.projection = distribution_output.get_parameter_projection(head_dim) - def forward(self, x: torch.Tensor): + def forward(self, embedding: torch.Tensor): """ - x: [bs x nvars x num_patch x 
d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + embedding: [bs x num_channels x num_patch x d_model] + or [bs x num_channels x (num_patch+1) x d_model] if use cls_token output: [bs x pred_len x num_output_channels] """ - batch_size = x.shape[0] + batch_size = embedding.shape[0] if self.use_cls_token: - x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] elif self.pooling == "mean": - x = x.mean(dim=2) # x: [bs x nvars x d_model] + x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] elif self.pooling == "max": - x = x.max(dim=2) # x: [bs x nvars x d_model] + x = embedding.max(dim=2) # x: [bs x num_channels x d_model] else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + x = self.dropout(self.flatten(x)) # x: bs x (num_channels * d_model) # projection y = self.projection(x) # reshape y if isinstance(y, tuple): # for distribution head y = ( z.reshape(batch_size, -1, self.num_output_channels) for z in y - ) # tuple of [bs x pred_len x num_output_channels] + ) # tuple of [bs x prediction_len x num_output_channels] else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x prediction_len x num_output_channels] return y class PatchTSTForPrediction(PatchTSTPreTrainedModel): + """ + + """ # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1544,7 +1568,7 @@ def generate( """ Generate sequences of sample predictions from a model with a probability distribution head. - Parameters: + Args: past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): Past values of the time series that serves as context in order to predict the future. 
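The heads touched in these hunks all reduce the per-patch encoder output to one vector per channel (cls token, mean, or max pooling) before flattening and projecting. A shape-only sketch of the mean-pooling variant; the sizes and the output dimension of 2 are example values, not model defaults:

```python
import torch
import torch.nn as nn

batch_size, num_channels, num_patches, d_model = 2, 3, 5, 16
embedding = torch.randn(batch_size, num_channels, num_patches, d_model)

pooled = embedding.mean(dim=2)                             # [bs x num_channels x d_model]
flattened = nn.Flatten(start_dim=1)(pooled)                # [bs x (num_channels * d_model)]
logits = nn.Linear(num_channels * d_model, 2)(flattened)   # [bs x num_labels]
print(logits.shape)  # torch.Size([2, 2])
```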
@@ -1616,22 +1640,22 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - def forward(self, x: torch.Tensor): + def forward(self, embedding: torch.Tensor): """ - x: [bs x nvars x num_patches x d_model] - or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: [bs x forecast_len x nvars] + embedding: [bs x num_channels x num_patches x d_model] + or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + output: [bs x forecast_len x num_channels] """ if self.use_cls_token: - y = x[:, :, 0, :] # y: [bs x nvars x d_model] + y = embedding[:, :, 0, :] # y: [bs x num_channels x d_model] else: if self.pooling == "mean": - y = x.mean(dim=2) # y: [bs x nvars x d_model] + y = embedding.mean(dim=2) # y: [bs x num_channels x d_model] elif self.pooling == "max": - y = x.max(dim=2) # y: [bs x nvars x d_model] + y = embedding.max(dim=2) # y: [bs x num_channels x d_model] else: - y = x # y: [bs x nvars x num_patches x d_model] + y = embedding # y: [bs x num_channels x num_patches x d_model] if not self.shared_projection: x_out = [] @@ -1642,25 +1666,29 @@ def forward(self, x: torch.Tensor): z ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) - output = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] + output = torch.stack(x_out, dim=1) # x: [bs x num_channels x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model)] z = self.dropout(z) output = self.projection( z - ) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head + ) # output: [bs x num_channels x forecast_len] + # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head if isinstance(output, tuple): output = tuple( z.transpose(2, 1) for z in output - ) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + ) # ([bs x forecast_len x num_channels], [bs x forecast_len x num_channels]) else: - output = output.transpose(2, 1) # [bs x forecast_len x nvars] + output = output.transpose(2, 1) # [bs x forecast_len x num_channels] return output class PatchTSTForForecasting(PatchTSTPreTrainedModel): + """ + PatchTST for forecasting + """ # PatchTST model + Forecasting head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1778,9 +1806,9 @@ def generate( # get samples samples = [ distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x forecast_len x nvars] + ] # samples: list of [bs x forecast_len x num_channels] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] + samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x num_channels] return SamplePatchTSTForecastOutput(sequences=samples) @@ -1802,22 +1830,22 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): else: self.projection = distribution_output.get_parameter_projection(head_dim) - def forward(self, x): + def forward(self, embedding: torch.Tensor): """ - x: [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + embedding: [bs x 
num_channels x num_patch x d_model] + or [bs x num_channels x (num_patch+1) x d_model] if use cls_token output: [bs x output_dim] """ if self.use_cls_token: - x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] elif self.pooling == "mean": - x = x.mean(dim=2) # x: [bs x nvars x d_model] + x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] elif self.pooling == "max": - x = x.max(dim=2) # x: [bs x nvars x d_model] + x = embedding.max(dim=2) # x: [bs x num_channels x d_model] else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + x = self.dropout(self.flatten(x)) # x: bs x (num_channels * d_model) # projection y = self.projection(x) # y: bs x output_dim or a tuple of this shape for distribution head # From fe3f4d49da315f0c31f6d1b6be4c9359ab999978 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 11:35:35 +0200 Subject: [PATCH 104/189] formatting --- .../models/patchtst/modeling_patchtst.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3ebf831273f73c..55ff3446afe00b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -204,11 +204,12 @@ def forward( class PatchTSTTranspose(nn.Module): """ - Transpose the tensor to the dimension defined in **dims** Parameters: + Transpose the tensor to the dimension defined in **dims** dims (`list`): list of dimensions to be transposed contiguous (`bool`): if True, the transposed tensor is contiguous """ + def __init__(self, *dims, contiguous=False): super().__init__() self.dims, self.contiguous = dims, contiguous @@ -299,9 +300,13 @@ def random_masking( noise = torch.rand(batch_size, 1, sequence_length, device=device) # noise in [0, 1], bs x 1 x L noise = noise.repeat(1, num_channels, 1) # bs x num_channels x time else: - noise = torch.rand(batch_size, num_channels, sequence_length, device=device) # noise in [0, 1], bs x num_channels x L + noise = torch.rand( + batch_size, num_channels, sequence_length, device=device + ) # noise in [0, 1], bs x num_channels x L - mask = torch.ones(batch_size, num_channels, sequence_length, device=device) # mask: [bs x num_channels x num_patch] + mask = torch.ones( + batch_size, num_channels, sequence_length, device=device + ) # mask: [bs x num_channels x num_patch] mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -331,7 +336,8 @@ def forecast_masking( Parameters: inputs (`torch.Tensor`): - Input to mask [ bs x num_channels x num_patch x patch_len] or [ bs x tsg1 x tag2 x num_channels x num_patch x patch_len] + Input to mask [ bs x num_channels x num_patch x patch_len] or [ bs x tsg1 x tag2 x num_channels x num_patch + x patch_len] patch_lengths (list): List of patch lengths to mask in the end of the data. 
mix_ratio (list, *optional* defaults to None): @@ -505,10 +511,10 @@ def forward(self, x: torch.Tensor): Parameters: x (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Patched input - + Return: x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) - Masked patched input + Masked patched input mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`) Bool tensor indicating True on masked points @@ -544,6 +550,7 @@ class PatchTSTEncoderBlock(nn.Module): """ PatchTST encoder block """ + def __init__(self, config: PatchTSTConfig): super().__init__() @@ -575,6 +582,7 @@ class PatchTSTEncoderLayer(nn.Module): """ PatchTST encoder layer """ + def __init__(self, config: PatchTSTConfig): super().__init__() @@ -770,8 +778,8 @@ def forward( output_hidden_states (bool, optional): Indicates if hidden states should be output. return: - `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` - or `(batch_size, num_channels, num_patches+1, d_model)` if cls_token is used + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` or `(batch_size, num_channels, + num_patches+1, d_model)` if cls_token is used """ _, num_input_channels, _, _ = past_values.shape @@ -794,9 +802,13 @@ def forward( # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x num_channels x (num_patches+1) x d_model] + past_values = torch.cat( + (cls_tokens, past_values), dim=1 + ) # x: [bs x num_channels x (num_patches+1) x d_model] else: - past_values = self.positional_dropout(past_values + self.w_pos) # x: [bs x num_channels x num_patches x d_model] + past_values = self.positional_dropout( + past_values + self.w_pos + ) # x: [bs x num_channels x num_patches x d_model] # Encoder past_values, hidden_states = self.encoder( @@ -1495,9 +1507,8 @@ def forward(self, embedding: torch.Tensor): class PatchTSTForPrediction(PatchTSTPreTrainedModel): - """ + """ """ - """ # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1670,10 +1681,8 @@ def forward(self, embedding: torch.Tensor): else: z = self.flatten(y) # z: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model)] z = self.dropout(z) - output = self.projection( - z - ) # output: [bs x num_channels x forecast_len] - # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head + output = self.projection(z) # output: [bs x num_channels x forecast_len] + # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head if isinstance(output, tuple): output = tuple( @@ -1689,6 +1698,7 @@ class PatchTSTForForecasting(PatchTSTPreTrainedModel): """ PatchTST for forecasting """ + # PatchTST model + Forecasting head def __init__(self, config: PatchTSTConfig): super().__init__(config) From 11feb7c80d5211c974df856ae389b415174c26f1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 12:22:34 +0200 Subject: [PATCH 105/189] use past_observed_mask --- src/transformers/models/patchtst/modeling_patchtst.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py 
b/src/transformers/models/patchtst/modeling_patchtst.py index 55ff3446afe00b..26753be95f93d2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -206,8 +206,8 @@ class PatchTSTTranspose(nn.Module): """ Parameters: Transpose the tensor to the dimension defined in **dims** - dims (`list`): list of dimensions to be transposed - contiguous (`bool`): if True, the transposed tensor is contiguous + dims (`list`): list of dimensions to be transposed contiguous (`bool`): if True, the transposed tensor is + contiguous """ def __init__(self, *dims, contiguous=False): @@ -1373,7 +1373,9 @@ def forward( # past_values: [bs x num_channels x num_patches x d_model] or # [bs x num_channels x (num_patches+1) x d_model] if use cls_token - model_output = self.model(past_values, output_hidden_states=output_hidden_states) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) # model_output[0]: [bs x num_channels x num_patches x patch_length] or # [bs x num_channels x (num_patches+1) x patch_length] if use cls_token From 3af8567bb1a1c0fd199bdc8b8b9cbddf2bff4c1e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 14:30:54 +0200 Subject: [PATCH 106/189] doc suggestion --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 26753be95f93d2..48e732bacbd5a5 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -458,7 +458,7 @@ def forward(self, past_values: torch.Tensor): class PatchTSTMasking(nn.Module): """ - Class for random or forcast masking on inputs. + Class to perform random or forecast masking. Parameters: mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. 
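The `past_observed_mask` now threaded through the model follows the convention used throughout these docstrings: 1 for observed values, 0 for missing ones, with the missing points themselves replaced by zeros. A minimal sketch of building such a mask from a series containing NaNs, using plain PyTorch and toy shapes:

```python
import torch

# toy context window: [batch_size x sequence_length x num_input_channels]
past_values = torch.randn(2, 32, 3)
past_values[0, 5, 1] = float("nan")   # pretend two points are missing
past_values[1, 20, 0] = float("nan")

# 1.0 where a value was observed, 0.0 where it was missing
past_observed_mask = (~torch.isnan(past_values)).to(past_values.dtype)

# the missing points themselves are replaced by zeros before reaching the model
past_values = torch.nan_to_num(past_values, nan=0.0)

print(past_observed_mask.shape, past_observed_mask[0, 5, 1].item())  # torch.Size([2, 32, 3]) 0.0
```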
From ed5b26be04b2e0535e752c93e4ba3b97952390eb Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 14:47:57 +0200 Subject: [PATCH 107/189] make fix-copies --- .../models/patchtst/modeling_patchtst.py | 19 ++++++------------- src/transformers/utils/dummy_pt_objects.py | 4 ++-- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 48e732bacbd5a5..625f131187deb3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -42,14 +42,7 @@ # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->PatchTST class PatchTSTAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper - - Parameters: - hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): - Input to the multi-head attention block - - """ + """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( self, @@ -78,8 +71,8 @@ def __init__( self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def _shape(self, tensor: torch.Tensor, sequence_length: int, bsz: int): - return tensor.view(bsz, sequence_length, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, @@ -1130,7 +1123,7 @@ class PatchTSTStdScaler(nn.Module): Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - Parameters: + Args: dim (`int`): Dimension along which to calculate the mean and standard deviation. keepdim (`bool`, *optional*, defaults to `False`): @@ -1164,7 +1157,7 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data accordingly. - Parameters: + Args: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -1221,7 +1214,7 @@ class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - Parameters: + Args: dim (`int`): Dimension along which to compute the scale. 
keepdim (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index d60b81511deda0..3dbb01528a3b99 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5921,14 +5921,14 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PatchTSTForPretraining(metaclass=DummyObject): +class PatchTSTForPrediction(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PatchTSTForPrediction(metaclass=DummyObject): +class PatchTSTForPretraining(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): From ddffd717a54a8f415eca0a574ee2dadee507ffde Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 15:37:26 +0200 Subject: [PATCH 108/189] use Args: --- .../models/patchtst/configuration_patchtst.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index f8ee3f75a9530a..89872754623825 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -37,7 +37,7 @@ class PatchTSTConfig(PretrainedConfig): Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Parameters: + Args: num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. @@ -82,7 +82,8 @@ class PatchTSTConfig(PretrainedConfig): Consider bias in the feed-forward networks. activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. - pre_norm (`bool`, *optional*, defaults to `False`): + pre_norm (`bool`, *optional*, defaults to `False`): + TODO positional_encoding (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. learn_pe (`bool`, *optional*, defaults to `False`): @@ -104,8 +105,10 @@ class PatchTSTConfig(PretrainedConfig): Masking type. Only `"random"` is currently supported. mask_ratio (`float`, *optional*, defaults to 0.5): Masking ratio is applied to mask the input data during pretraining. - mask_patches (`List`, *optional*, defaults to `[2, 3]`): - mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): + mask_patches (`List`, *optional*, defaults to `[2, 3]`): + TODO + mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): + TODO channel_consistent_masking (`bool`, *optional*, defaults to `False`): If channel consistent masking is True, all the channels will have the same masking. unmasked_channel_indices (`list`, *optional*): @@ -125,7 +128,7 @@ class PatchTSTConfig(PretrainedConfig): num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for probablistic forecast. 
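The context length, patch length and stride referenced throughout these docstrings determine how many patches the encoder sees. The sketch below works through the standard sliding-window count and checks it against `torch.Tensor.unfold`; exact configuration field names and any padding or truncation handling are left aside.

```python
import torch

context_length, patch_length, stride = 32, 8, 8   # toy settings

# sliding-window count: floor((context_length - patch_length) / stride) + 1
num_patches = (context_length - patch_length) // stride + 1
print(num_patches)  # 4

# the same count falls out of unfolding a [batch x time x channels] tensor
past_values = torch.randn(2, context_length, 3)
patches = past_values.unfold(dimension=-2, size=patch_length, step=stride)
print(patches.shape)  # torch.Size([2, 4, 3, 8]) -> [batch x num_patches x channels x patch_length]
```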
- Example: + Example: ```python >>> from transformers import PatchTSTConfig, PatchTSTModel @@ -139,7 +142,6 @@ class PatchTSTConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "patchtst" attribute_map = { "hidden_size": "d_model", From ccdd0130ded6517e6338934274e37fb4d3c957be Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 10 Oct 2023 18:39:06 -0400 Subject: [PATCH 109/189] add docstring --- .../models/patchtst/modeling_patchtst.py | 88 +++++++++++-------- 1 file changed, 49 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 625f131187deb3..1b2ffab6523ab4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -329,29 +329,29 @@ def forecast_masking( Parameters: inputs (`torch.Tensor`): - Input to mask [ bs x num_channels x num_patch x patch_len] or [ bs x tsg1 x tag2 x num_channels x num_patch - x patch_len] - patch_lengths (list): + Input of shape `(bs, num_channels, num_patch, patch_len)` + or `(bs, tsg1, tag2, num_channels, num_patch, patch_len)` + patch_lengths (`list`): List of patch lengths to mask in the end of the data. - mix_ratio (list, *optional* defaults to None): + mix_ratio (`list`, *optional*): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, *optional* defaults to None): + unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked. Defaults to None. - mask_value (int, *optional* defaults to 0): + mask_value (`int`, *optional* defaults to 0): Value to use for masking. Defaults to 0. - seed_number (int, *optional*): + seed_number (`int`, *optional*): Value to set for the random seed. Returns: - `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape [bs x c - x n] or [bs x tsg1 x tsg2 x c x n] + `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs, num_channels + , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)` """ if seed_number: set_seed(seed_number) if mix_ratio is None: - mix_ratio = [1 for t in patch_lengths] + mix_ratio = [1 for _ in patch_lengths] batch_size, num_channels, sequence_length, num_features = inputs.shape mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device) @@ -396,9 +396,9 @@ class PatchTSTPatchify(nn.Module): A class to patchify the time series sequence into different patches Parameters: - sequence_length (int, required): input sequence length. - patch_length (int, required): patch length. - stride (int, required): stride between patches. + sequence_length (`int`, *required*): input sequence length. + patch_length (`int`, *required*): patch length. + stride (`int`, *required*): stride between patches. 
Returns: `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` @@ -409,7 +409,6 @@ def __init__( sequence_length: int, patch_length: int, stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() @@ -431,6 +430,7 @@ def forward(self, past_values: torch.Tensor): """ Parameters: past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*): + Input to be patchified Returns: `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` @@ -441,12 +441,12 @@ def forward(self, past_values: torch.Tensor): f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." ) - x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x num_channels] - x = x.unfold( + output = past_values[:, self.s_begin :, :] # output: [bs x new_sequence_length x num_channels] + output = output.unfold( dimension=-2, size=self.patch_length, step=self.stride - ) # x: [bs x num_patches x num_input_channels x patch_length] - x = x.transpose(-2, -3).contiguous() # x: [bs x num_input_channels x num_patches x patch_length] - return x + ) # output: [bs x num_patches x num_input_channels x patch_length] + output = output.transpose(-2, -3).contiguous() # output: [bs x num_input_channels x num_patches x patch_length] + return output class PatchTSTMasking(nn.Module): @@ -454,18 +454,18 @@ class PatchTSTMasking(nn.Module): Class to perform random or forecast masking. Parameters: - mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. - mask_ratio (float, optional): Mask ratio. - mask_patches (list, optional): List of patch lengths to mask in the end of the data. - mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. + mask_type (`str`, *optional*): Masking type. Allowed values are random, forecast. Defaults to random. + mask_ratio (`float`, *optional*): Mask ratio. + mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. + mask_patch_ratios (`list`, *optional*): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, optional): - Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): + unmasked_channel_indices (`list`, *optional*): + Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. + channel_consistent_masking (`bool`, *optional*): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. - mask_value (int, optional): Value to use for masking. Defaults to 0. - seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. + mask_value (`int`, *optional*): Value to use for masking. Defaults to 0. + seed_number (`int`, *optional*): Random seed, when None seed is not set. Defaults to None. 
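Random masking as described for `PatchTSTMasking` keeps a fraction of patches per channel and overwrites the rest with a mask value. Below is a simplified sketch in the spirit of the `random_masking` helper, ranking random noise per channel; channel-consistent masking, unmasked channel indices and seeding are omitted, and the actual helper may differ in detail.

```python
import torch

# patched input: [batch_size x num_channels x num_patches x patch_length]
patch_input = torch.randn(2, 3, 10, 8)
mask_ratio, mask_value = 0.4, 0.0

bs, num_channels, num_patches, _ = patch_input.shape
len_keep = int(num_patches * (1 - mask_ratio))

# rank random noise per channel: the lowest-noise patches are kept, the rest are masked
noise = torch.rand(bs, num_channels, num_patches)
ids_shuffle = torch.argsort(noise, dim=-1)
ids_restore = torch.argsort(ids_shuffle, dim=-1)

mask = torch.ones(bs, num_channels, num_patches)
mask[:, :, :len_keep] = 0
mask = torch.gather(mask, dim=-1, index=ids_restore)   # 1 = masked, 0 = kept

masked_input = patch_input.masked_fill(mask.unsqueeze(-1).bool(), mask_value)
print(mask.mean().item())  # 0.4 of the patches are masked
```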
Returns: x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) @@ -478,12 +478,12 @@ class PatchTSTMasking(nn.Module): def __init__( self, mask_type: str = "random", - mask_ratio=0.5, + mask_ratio: float = 0.5, mask_patches: list = [2, 3], mask_patch_ratios: list = [1, 1], channel_consistent_masking: bool = False, unmasked_channel_indices: list = None, - mask_value=0, + mask_value: int = 0, seed_number: Optional[int] = None, ): self.mask_ratio = mask_ratio @@ -503,7 +503,7 @@ def forward(self, x: torch.Tensor): """ Parameters: x (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): - Patched input + Patch input Return: x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) @@ -557,7 +557,9 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo output_hidden_states (`bool`, *optional*): output hidden state option Return: - `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` + hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`) + + all_hidden_states (*optional*, returned when `output_hidden_states` is set to True, tuple of `torch.Tensor` of shapes `(batch_size, num_channels, sequence_length, d_model)`) """ all_hidden_states = [] @@ -1317,22 +1319,30 @@ def forward( class MaskPretrainHead(nn.Module): + """ + Pretraining head for mask modelling + """ def __init__(self, config): super().__init__() self.dropout = nn.Dropout(config.dropout) self.linear = nn.Linear(config.d_model, config.patch_length) self.use_cls_token = config.use_cls_token - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embedding: torch.Tensor) -> torch.Tensor: """ - x: tensor [bs x num_channels x num_patches x d_model] - or [bs x num_channels x (num_patches+1) x d_model] if use cls_token - output: tensor [bs x num_channels x num_patches x patch_length] + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or + `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True + """ - x = self.linear(self.dropout(x)) # [bs x num_channels x num_patches x patch_length] + embedding = self.linear(self.dropout(embedding)) # [bs x num_channels x num_patches x patch_length] if self.use_cls_token: - x = x[:, :, 1:, :] # remove the first cls token - return x + embedding = embedding[:, :, 1:, :] # remove the first cls token + return embedding class PatchTSTForPretraining(PatchTSTPreTrainedModel): From c993a50f1f2dc0de7575720a70920b20061e233f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 10 Oct 2023 23:11:33 -0400 Subject: [PATCH 110/189] add docstring --- .../models/patchtst/modeling_patchtst.py | 161 +++++++++++++++--- 1 file changed, 136 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 1b2ffab6523ab4..f9b644ad6dd148 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1332,8 +1332,8 @@ def forward(self, embedding: torch.Tensor) -> torch.Tensor: """ Parameters: embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` - or `(bs, num_channels, num_patches+1, d_model)` 
if `cls_token` is set to True - Embedding from the model + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model Returns: `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True @@ -1346,7 +1346,9 @@ def forward(self, embedding: torch.Tensor) -> torch.Tensor: class PatchTSTForPretraining(PatchTSTPreTrainedModel): - # PatchTSTModel + Pretraining Head + """ + Mask pretrain model: PatchTST model + pretrain head + """ def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1362,12 +1364,25 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ - past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. + + Returns: + `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1395,7 +1410,9 @@ def forward( class PatchTSTForClassification(PatchTSTPreTrainedModel): - # PatchTST model + classification head + """ + PatchTST model for classification. The model contains PatchTST model + classification head + """ def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1414,6 +1431,24 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + labels (`torch.Tensor`, *optional*): labels associates with the `past_values` + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
+ + Returns: + `PatchTSTForClassificationOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1435,6 +1470,9 @@ def forward( class ClassificationHead(nn.Module): + """ + Classification head + """ def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token @@ -1445,9 +1483,13 @@ def __init__(self, config: PatchTSTConfig): def forward(self, embedding: torch.Tensor): """ - embedding: [bs x num_channels x num_patches x d_model] - or [bs x num_channels x (num_patches+1) x d_model] if use cls_token output: - [bs x n_classes] + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, num_labels)` + """ if self.use_cls_token: x = embedding[:, :, 0, :] # use the first output token, x: bs x num_channels x d_model @@ -1483,9 +1525,13 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): def forward(self, embedding: torch.Tensor): """ - embedding: [bs x num_channels x num_patch x d_model] - or [bs x num_channels x (num_patch+1) x d_model] if use cls_token - output: [bs x pred_len x num_output_channels] + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, pred_len, num_output_channels)` + """ batch_size = embedding.shape[0] if self.use_cls_token: @@ -1512,9 +1558,9 @@ def forward(self, embedding: torch.Tensor): class PatchTSTForPrediction(PatchTSTPreTrainedModel): - """ """ - - # PatchTST model + prediction head + """ + PatchTST model for prediction. The model contains PatchTST model + prediction head + """ def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1548,6 +1594,25 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPredictionOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): + future target values associates with the `past_values` + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
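All of these heads share the skeleton documented above: pool the encoder output over the patch dimension (cls token, mean or max), flatten channels and features, then project. A standalone sketch of that pipeline with mean pooling and toy dimensions follows; the dropout placement and layer composition are assumptions rather than the exact module layout.

```python
import torch
import torch.nn as nn

# encoder output: [batch_size x num_channels x num_patches x d_model]
embedding = torch.randn(2, 3, 10, 16)
bs, num_channels, num_patches, d_model = embedding.shape
prediction_length, num_output_channels = 24, 3

pooled = embedding.mean(dim=2)                       # [bs x num_channels x d_model]

head = nn.Sequential(
    nn.Flatten(start_dim=1),                         # [bs x (num_channels * d_model)]
    nn.Dropout(0.1),
    nn.Linear(num_channels * d_model, prediction_length * num_output_channels),
)
output = head(pooled).reshape(bs, -1, num_output_channels)
print(output.shape)  # torch.Size([2, 24, 3]) -> [bs x prediction_length x num_output_channels]
```

A classification or regression head only changes the size of the final projection (number of labels or output dimensions) and skips the reshape to a prediction horizon.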
+ + Returns: + `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1658,11 +1723,14 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): def forward(self, embedding: torch.Tensor): """ - embedding: [bs x num_channels x num_patches x d_model] - or [bs x num_channels x (num_patches+1) x d_model] if use cls_token - output: [bs x forecast_len x num_channels] - """ + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, forecast_len, num_channels)` + """ if self.use_cls_token: y = embedding[:, :, 0, :] # y: [bs x num_channels x d_model] else: @@ -1701,10 +1769,8 @@ def forward(self, embedding: torch.Tensor): class PatchTSTForForecasting(PatchTSTPreTrainedModel): """ - PatchTST for forecasting + PatchTST for forecasting. The model contains PatchTST model + Forecasting head """ - - # PatchTST model + Forecasting head def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) @@ -1736,6 +1802,25 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForForecastingOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): + future target values associates with the `past_values` + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
+ + Returns: + `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1828,6 +1913,9 @@ def generate( class RegressionHead(nn.Module): + """ + Regression head + """ def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.y_range = config.prediction_range @@ -1847,9 +1935,13 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): def forward(self, embedding: torch.Tensor): """ - embedding: [bs x num_channels x num_patch x d_model] - or [bs x num_channels x (num_patch+1) x d_model] if use cls_token - output: [bs x output_dim] + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, output_dim)` + """ if self.use_cls_token: x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] @@ -1906,6 +1998,25 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForRegressionOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): + target labels associates with the `past_values` + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
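With `loss="nll"` and a distribution head such as `"student_t"`, training minimizes the negative log-likelihood of the targets under the emitted distribution, in contrast to the `"mse"` point-estimate loss. A toy sketch of that loss computation; the parameter shapes and values are placeholders for what the head would actually emit.

```python
import torch
from torch.distributions import StudentT

# toy targets: [batch_size x num_targets]
targets = torch.randn(2, 3)

# pretend the distribution head emitted these Student-T parameters for the targets
df = torch.full_like(targets, 3.0)
loc = torch.zeros_like(targets)
scale = torch.ones_like(targets)
distribution = StudentT(df=df, loc=loc, scale=scale)

# "nll" loss: average negative log-likelihood of the observed targets
nll = -distribution.log_prob(targets).mean()
print(nll.item())
```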
+ + Returns: + `PatchTSTForRegressionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) From ddc7521d81949f8b26f3e3af9b56cd7ae602a59e Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 10 Oct 2023 23:59:15 -0400 Subject: [PATCH 111/189] change some variable names and add PatchTST before some class names --- .../models/patchtst/modeling_patchtst.py | 79 ++++++++++--------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f9b644ad6dd148..089837326d2fd0 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -224,32 +224,31 @@ def positional_encoding(position_embedding_type, learned, q_len, d_model): # Positional encoding if position_embedding_type is None: # position_embedding_type = None and learned = False can be used to measure impact of positional encoding - w_pos = torch.empty((q_len, d_model)) - nn.init.uniform_(w_pos, -0.02, 0.02) + position_enc = torch.empty((q_len, d_model)) + nn.init.uniform_(position_enc, -0.02, 0.02) learned = False elif position_embedding_type == "zeros": - w_pos = torch.empty((q_len, d_model)) - nn.init.uniform_(w_pos, -0.02, 0.02) + position_enc = torch.empty((q_len, d_model)) + nn.init.uniform_(position_enc, -0.02, 0.02) elif position_embedding_type == "normal": - w_pos = torch.zeros((q_len, 1)) - torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) + position_enc = torch.zeros((q_len, 1)) + torch.nn.init.normal_(position_enc, mean=0.0, std=0.1) elif position_embedding_type == "uniform": - w_pos = torch.zeros((q_len, 1)) - nn.init.uniform_(w_pos, a=0.0, b=0.1) + position_enc = torch.zeros((q_len, 1)) + nn.init.uniform_(position_enc, a=0.0, b=0.1) elif position_embedding_type == "sincos": - pos_enc = torch.zeros(q_len, d_model) + position_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) - pos_enc[:, 0::2] = torch.sin(position * div_term) - pos_enc[:, 1::2] = torch.cos(position * div_term) - pos_enc = pos_enc - pos_enc.mean() - pos_enc = pos_enc / (pos_enc.std() * 10) - w_pos = pos_enc + position_enc[:, 0::2] = torch.sin(position * div_term) + position_enc[:, 1::2] = torch.cos(position * div_term) + position_enc = position_enc - position_enc.mean() + position_enc = position_enc / (position_enc.std() * 10) else: raise ValueError( f"{position_embedding_type} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." 
) - return nn.Parameter(w_pos, requires_grad=learned) + return nn.Parameter(position_enc, requires_grad=learned) def random_masking( @@ -628,19 +627,19 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, src: torch.Tensor): + def forward(self, hidden_state: torch.Tensor): """ Parameters: - src (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): + hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): Past values of the time series Return: `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ - batch_size, num_input_channels, sequence_length, d_model = src.shape + batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape # First sublayer: attention across time - src = src.view( + src = hidden_state.view( batch_size * num_input_channels, sequence_length, d_model ) # src: [(bs*num_channels) x sequence_length x d_model] if self.pre_norm: @@ -723,6 +722,9 @@ def _set_gradient_checkpointing(self, module, value=False): class PatchTSTEncoder(PatchTSTPreTrainedModel): + """ + PatchTST Encoder + """ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -735,20 +737,20 @@ def __init__(self, config: PatchTSTConfig): # Input encoding: projection of feature vectors onto a d-dim vector space if not config.shared_embedding: - self.w_p = nn.ModuleList() + self.input_embedding = nn.ModuleList() for _ in range(self.num_input_channels): - self.w_p.append(nn.Linear(config.patch_length, config.d_model)) + self.input_embedding.append(nn.Linear(config.patch_length, config.d_model)) else: - self.w_p = nn.Linear(config.patch_length, config.d_model) + self.input_embedding = nn.Linear(config.patch_length, config.d_model) # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding( + self.position_enc = positional_encoding( config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model ) else: - self.w_pos = positional_encoding( + self.position_enc = positional_encoding( config.positional_encoding, config.learn_pe, config.num_patches, config.d_model ) @@ -773,8 +775,7 @@ def forward( output_hidden_states (bool, optional): Indicates if hidden states should be output. 
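The `"sincos"` branch of the positional encoding helper above builds the classic sinusoidal table, then mean-centers it and scales it down before it is added to the patch embeddings. A self-contained sketch of the same recipe with toy sizes:

```python
import math

import torch

num_patches, d_model = 12, 16   # toy sizes

position_enc = torch.zeros(num_patches, d_model)
position = torch.arange(0, num_patches).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
position_enc[:, 0::2] = torch.sin(position * div_term)
position_enc[:, 1::2] = torch.cos(position * div_term)
position_enc = (position_enc - position_enc.mean()) / (position_enc.std() * 10)

# added to the patch embeddings, broadcasting over batch and channel dimensions
patch_embeddings = torch.randn(2, 3, num_patches, d_model)   # [bs x channels x patches x d_model]
hidden_state = patch_embeddings + position_enc
print(hidden_state.shape)  # torch.Size([2, 3, 12, 16])
```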
return: - `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` or `(batch_size, num_channels, - num_patches+1, d_model)` if cls_token is used + `BaseModelOutputWithNoAttention` """ _, num_input_channels, _, _ = past_values.shape @@ -785,24 +786,24 @@ def forward( if not self.shared_embedding: x_out = [] for i in range(num_input_channels): - z = self.w_p[i](past_values[:, i, :, :]) + z = self.input_embedding[i](past_values[:, i, :, :]) x_out.append(z) past_values = torch.stack(x_out, dim=1) else: - past_values = self.w_p(past_values) # x: [bs x num_channels x num_patches x d_model] + past_values = self.input_embedding(past_values) # x: [bs x num_channels x num_patches x d_model] if self.use_cls_token: # x: [bs x num_channels x num_patches x d_model] - past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) + past_values = self.positional_dropout(past_values + self.position_enc[1:, :]) # append cls token - cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] + cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples past_values = torch.cat( (cls_tokens, past_values), dim=1 ) # x: [bs x num_channels x (num_patches+1) x d_model] else: past_values = self.positional_dropout( - past_values + self.w_pos + past_values + self.position_enc ) # x: [bs x num_channels x num_patches x d_model] # Encoder @@ -1417,7 +1418,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) - self.head = ClassificationHead(config) + self.head = PatchTSTClassificationHead(config) self.loss = nn.CrossEntropyLoss() # Initialize weights and apply final processing @@ -1469,7 +1470,7 @@ def forward( return PatchTSTForClassificationOutput(loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states) -class ClassificationHead(nn.Module): +class PatchTSTClassificationHead(nn.Module): """ Classification head """ @@ -1505,7 +1506,7 @@ def forward(self, embedding: torch.Tensor): return y -class PredictionHead(nn.Module): +class PatchTSTPredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1581,7 +1582,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = PredictionHead(config, self.distribution_output) + self.head = PatchTSTPredictionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1686,7 +1687,7 @@ def generate( return SamplePatchTSTPredictionOutput(sequences=samples) -class ForecastHead(nn.Module): +class PatchTSTForecastHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1789,7 +1790,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = ForecastHead(config, self.distribution_output) + self.head = PatchTSTForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1912,7 +1913,7 @@ def generate( return SamplePatchTSTForecastOutput(sequences=samples) -class RegressionHead(nn.Module): +class PatchTSTRegressionHead(nn.Module): """ Regression head """ @@ -1985,7 +1986,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown 
distribution output {config.distribution_output}") - self.head = RegressionHead(config, self.distribution_output) + self.head = PatchTSTRegressionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() From 0d7d92d5bd2f819b2ec27e6a0be2455814bae8b0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 11 Oct 2023 09:28:57 +0200 Subject: [PATCH 112/189] formatting --- .../models/patchtst/modeling_patchtst.py | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 089837326d2fd0..3b36c0a4850160 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -328,8 +328,8 @@ def forecast_masking( Parameters: inputs (`torch.Tensor`): - Input of shape `(bs, num_channels, num_patch, patch_len)` - or `(bs, tsg1, tag2, num_channels, num_patch, patch_len)` + Input of shape `(bs, num_channels, num_patch, patch_len)` or `(bs, tsg1, tag2, num_channels, num_patch, + patch_len)` patch_lengths (`list`): List of patch lengths to mask in the end of the data. mix_ratio (`list`, *optional*): @@ -343,8 +343,8 @@ def forecast_masking( Value to set for the random seed. Returns: - `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs, num_channels - , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)` + `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs, + num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)` """ if seed_number: set_seed(seed_number) @@ -444,7 +444,9 @@ def forward(self, past_values: torch.Tensor): output = output.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # output: [bs x num_patches x num_input_channels x patch_length] - output = output.transpose(-2, -3).contiguous() # output: [bs x num_input_channels x num_patches x patch_length] + output = output.transpose( + -2, -3 + ).contiguous() # output: [bs x num_input_channels x num_patches x patch_length] return output @@ -558,7 +560,8 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo Return: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`) - all_hidden_states (*optional*, returned when `output_hidden_states` is set to True, tuple of `torch.Tensor` of shapes `(batch_size, num_channels, sequence_length, d_model)`) + all_hidden_states (*optional*, returned when `output_hidden_states` is set to True, tuple of `torch.Tensor` + of shapes `(batch_size, num_channels, sequence_length, d_model)`) """ all_hidden_states = [] @@ -725,6 +728,7 @@ class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -1323,6 +1327,7 @@ class MaskPretrainHead(nn.Module): """ Pretraining head for mask modelling """ + def __init__(self, config): super().__init__() self.dropout = nn.Dropout(config.dropout) @@ -1350,6 +1355,7 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): """ Mask pretrain model: PatchTST model + pretrain head """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1382,7 +1388,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: - `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( @@ -1414,6 +1421,7 @@ class PatchTSTForClassification(PatchTSTPreTrainedModel): """ PatchTST model for classification. The model contains PatchTST model + classification head """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1447,7 +1455,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: - `PatchTSTForClassificationOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForClassificationOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( @@ -1474,6 +1483,7 @@ class PatchTSTClassificationHead(nn.Module): """ Classification head """ + def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token @@ -1562,6 +1572,7 @@ class PatchTSTForPrediction(PatchTSTPreTrainedModel): """ PatchTST model for prediction. The model contains PatchTST model + prediction head """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1611,7 +1622,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: - `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( @@ -1772,6 +1784,7 @@ class PatchTSTForForecasting(PatchTSTPreTrainedModel): """ PatchTST for forecasting. The model contains PatchTST model + Forecasting head """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) @@ -1819,7 +1832,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: - `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( @@ -1917,6 +1931,7 @@ class PatchTSTRegressionHead(nn.Module): """ Regression head """ + def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.y_range = config.prediction_range @@ -2015,7 +2030,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: - `PatchTSTForRegressionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForRegressionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( From 23819947610e5c3b479985babc4da73da70cf8b9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 11 Oct 2023 09:47:02 +0200 Subject: [PATCH 113/189] fix argument types --- .../models/patchtst/configuration_patchtst.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 89872754623825..25839adde23c7c 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -41,12 +41,12 @@ class PatchTSTConfig(PretrainedConfig): num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. - context_length (`int`, defaults to 32, *optional*, defaults to 32): + context_length (`int`, *optional*, defaults to 32): The context length for the encoder. - distribution_output (`string`, *optional*, defaults to `"student_t"`): + distribution_output (`str`, *optional*, defaults to `"student_t"`): The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or "negative_binomial". - loss (`string`, *optional*, defaults to `"mse"`): + loss (`str`, *optional*, defaults to `"mse"`): The loss function for the model corresponding to the `distribution_output` head. For parametric distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". @@ -94,9 +94,9 @@ class PatchTSTConfig(PretrainedConfig): The standard deviation of the truncated normal weight initialization distribution. shared_projection (`bool`, *optional*, defaults to `True`): Sharing the projection layer across different channels in the forecast head. - seed_number (`int`, *optional*): + seed_number (`Optional`, *optional*): Use seed number for random masking. - scaling (`string` or `bool`, *optional*, defaults to `"mean"`): + scaling (`Union`, *optional*, defaults to `"mean"`): Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the scaler is set to "mean". mask_input (`bool`, *optional*, defaults to False): @@ -128,7 +128,6 @@ class PatchTSTConfig(PretrainedConfig): num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for probablistic forecast. 
- Example: ```python >>> from transformers import PatchTSTConfig, PatchTSTModel From e79f0fd4f6e8acae249975cfca602017448aaf01 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 11 Oct 2023 12:07:38 +0200 Subject: [PATCH 114/189] fix tests --- .../models/patchtst/configuration_patchtst.py | 2 +- tests/models/patchtst/test_modeling_patchtst.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 25839adde23c7c..27f762459cd52b 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -46,7 +46,7 @@ class PatchTSTConfig(PretrainedConfig): distribution_output (`str`, *optional*, defaults to `"student_t"`): The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or "negative_binomial". - loss (`str`, *optional*, defaults to `"mse"`): + loss (`str`, *optional*, defaults to `"mse"`): The loss function for the model corresponding to the `distribution_output` head. For parametric distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index d25cc525326ab5..92bc76c375e0bd 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -188,8 +188,11 @@ def test_config(self): def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - # if classification model: - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): + # if PatchTSTForPretraining + if model_class == PatchTSTForPretraining: + inputs_dict.pop("future_values") + # else if classification model: + elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): rng = random.Random(self.model_tester.seed_number) labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_labels, rng=rng) inputs_dict["labels"] = labels @@ -272,7 +275,9 @@ def test_forward_signature(self): "past_observed_mask", "future_values", ] - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( + if model_class == PatchTSTForPretraining: + expected_arg_names.remove("future_values") + elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING ): expected_arg_names.remove("future_values") @@ -282,6 +287,7 @@ def test_forward_signature(self): expected_arg_names.extend( [ "output_hidden_states", + "return_dict", ] ) From b61bec0ce64b3456dd723765bb1f77f8261cb358 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 11 Oct 2023 12:27:04 -0400 Subject: [PATCH 115/189] change x variable to patch_input --- .../models/patchtst/modeling_patchtst.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3b36c0a4850160..426090bd4a73de 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -500,14 +500,14 @@ def __init__( super().__init__() - def forward(self, x: 
torch.Tensor): + def forward(self, patch_input: torch.Tensor): """ Parameters: - x (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): + patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Patch input Return: - x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) + masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) Masked patched input mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`) Bool tensor indicating True on masked points @@ -515,8 +515,8 @@ def forward(self, x: torch.Tensor): """ if self.mask_type == "random": - x_mask, mask = random_masking( - inputs=x, + masked_input, mask = random_masking( + inputs=patch_input, mask_ratio=self.mask_ratio, unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, @@ -524,8 +524,8 @@ def forward(self, x: torch.Tensor): seed_number=self.seed_number, ) elif self.mask_type == "forecast": - x_mask, mask = forecast_masking( - inputs=x, + masked_input, mask = forecast_masking( + inputs=patch_input, patch_lengths=self.mask_patches, mix_ratio=self.mask_patch_ratios, unmasked_channel_indices=self.unmasked_channel_indices, @@ -537,7 +537,7 @@ def forward(self, x: torch.Tensor): mask = mask.bool() # mask: [bs x num_input_channels x num_patch] - return x_mask, mask + return masked_input, mask class PatchTSTEncoderBlock(nn.Module): From e9088621b9bafcec5615463296e7362478b4079f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 11 Oct 2023 17:46:08 -0400 Subject: [PATCH 116/189] format --- .../models/patchtst/modeling_patchtst.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 426090bd4a73de..0db1ae5768b933 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -550,7 +550,10 @@ def __init__(self, config: PatchTSTConfig): self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): + def forward(self, + hidden_state: torch.Tensor, + output_hidden_states: Optional[bool] = None + ): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -770,7 +773,9 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ Parameters: @@ -1148,7 +1153,10 @@ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5) self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, + weights: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator @@ -1185,9 +1193,10 @@ def __init__( self.default_scale = default_scale @torch.no_grad() - def 
forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, + observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # shape: (N, [C], T=1) ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -1233,9 +1242,10 @@ def __init__(self, dim: int, keepdim: bool = False): self.dim = dim self.keepdim = keepdim - def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, + observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale From 25e669bcb1356d47558ec0ca75326b145323066f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 12 Oct 2023 12:33:08 +0200 Subject: [PATCH 117/189] formatting --- .../models/patchtst/modeling_patchtst.py | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 0db1ae5768b933..426090bd4a73de 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -550,10 +550,7 @@ def __init__(self, config: PatchTSTConfig): self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, - hidden_state: torch.Tensor, - output_hidden_states: Optional[bool] = None - ): + def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -773,9 +770,7 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, - past_values: torch.Tensor, - output_hidden_states: Optional[bool] = None + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ Parameters: @@ -1153,10 +1148,7 @@ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5) self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, - data: torch.Tensor, - weights: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator @@ -1193,10 +1185,9 @@ def __init__( self.default_scale = default_scale @torch.no_grad() - def forward(self, - data: torch.Tensor, - observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # shape: (N, [C], T=1) ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -1242,10 +1233,9 @@ def __init__(self, dim: int, keepdim: bool = False): self.dim = dim self.keepdim = keepdim - 
def forward(self, - data: torch.Tensor, - observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale From b9c01ff2de9e6a012b4d68ab7248c5cb9f465d37 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 18 Oct 2023 13:39:46 +0200 Subject: [PATCH 118/189] fix-copies --- README.md | 2 +- README_es.md | 2 +- README_hd.md | 2 +- README_ja.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.md | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ff96020aa400c1..f4da5aff298803 100644 --- a/README.md +++ b/README.md @@ -434,8 +434,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_es.md b/README_es.md index a78d20a42996a4..a20cb25f308e82 100644 --- a/README_es.md +++ b/README_es.md @@ -409,8 +409,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 
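The README rows being synced in this fix-copies pass point at the `PatchTSTConfig` and `PatchTSTModel` classes introduced earlier in the series. A minimal usage sketch of that pair, assuming config arguments named `num_input_channels`, `context_length` and `patch_length` and a `(batch_size, sequence_length, num_input_channels)` input layout — none of which are spelled out in the hunks shown here, so the names and sizes are illustrative only:

```python
import torch

from transformers import PatchTSTConfig, PatchTSTModel

# Assumed argument names and sizes, for illustration only.
config = PatchTSTConfig(num_input_channels=7, context_length=512, patch_length=16)
model = PatchTSTModel(config)

# Assumed input layout: (batch_size, sequence_length, num_input_channels)
past_values = torch.randn(4, 512, 7)

outputs = model(past_values=past_values, output_hidden_states=True)
# the first element is the encoder output, accessed as `model_output[0]` by the heads in this PR
print(outputs[0].shape)
```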
diff --git a/README_hd.md b/README_hd.md index 972a5e4c6eeeef..dee06e1a0ece3a 100644 --- a/README_hd.md +++ b/README_hd.md @@ -381,8 +381,8 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया। -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM से) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. द्वाराअनुसंधान पत्र [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) के साथ जारी किया गया 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (Google AI से) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. द्वाराअनुसंधान पत्र [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) के साथ जारी किया गया +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM से) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. द्वाराअनुसंधान पत्र [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) के साथ जारी किया गया 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा। 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index 1e77601d0fc7b3..dc259decdacb4e 100644 --- a/README_ja.md +++ b/README_ja.md @@ -443,8 +443,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. 
**[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM から) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. から公開された研究論文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. から公開された研究論文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM から) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. から公開された研究論文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) diff --git a/README_ko.md b/README_ko.md index 10162d8db9c582..71db05859b84e8 100644 --- a/README_ko.md +++ b/README_ko.md @@ -358,8 +358,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다. 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM 에서 제공)은 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.의 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf)논문과 함께 발표했습니다. 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (Google AI 에서 제공)은 Matthias Minderer, Alexey Gritsenko, Neil Houlsby.의 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)논문과 함께 발표했습니다. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM 에서 제공)은 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.의 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf)논문과 함께 발표했습니다. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 0c02049371dad5..380dacfde1fcf0 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -382,8 +382,8 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。 -1. 
**[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (来自 IBM) 伴随论文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 由 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam 发布。 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (来自 Google AI) 伴随论文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) 由 Matthias Minderer, Alexey Gritsenko, Neil Houlsby 发布。 +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (来自 IBM) 伴随论文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 由 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 60c72b34ed5e54..e7cec87615d14b 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -394,8 +394,8 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby. +1. 
**[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ebc740a7cc9466..dc2204b94f22c8 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -209,8 +209,8 @@ Flax), PyTorch, and/or TensorFlow. | [OpenLlama](model_doc/open-llama) | ✅ | ❌ | ❌ | | [OPT](model_doc/opt) | ✅ | ✅ | ✅ | | [OWL-ViT](model_doc/owlvit) | ✅ | ❌ | ❌ | -| [PatchTST](model_doc/patchtst) | ✅ | ❌ | ❌ | | [OWLv2](model_doc/owlv2) | ✅ | ❌ | ❌ | +| [PatchTST](model_doc/patchtst) | ✅ | ❌ | ❌ | | [Pegasus](model_doc/pegasus) | ✅ | ✅ | ✅ | | [PEGASUS-X](model_doc/pegasus_x) | ✅ | ❌ | ❌ | | [Perceiver](model_doc/perceiver) | ✅ | ❌ | ❌ | From 9955e4fea795abf43d6cb18ab5955a76722ad46e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:45:31 +0200 Subject: [PATCH 119/189] Update tests/models/patchtst/test_modeling_patchtst.py Co-authored-by: Patrick von Platen --- tests/models/patchtst/test_modeling_patchtst.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 92bc76c375e0bd..eaa1033cc0fc1a 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -214,7 +214,6 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - # def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) From 5dbe6193f86974fb197f5b6afab5031c3f0c40f0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:53:30 +0200 Subject: [PATCH 120/189] move loss to forward --- .../models/patchtst/modeling_patchtst.py | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 426090bd4a73de..36a5bc8e768ad8 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -232,7 +232,7 @@ def positional_encoding(position_embedding_type, learned, q_len, 
d_model): nn.init.uniform_(position_enc, -0.02, 0.02) elif position_embedding_type == "normal": position_enc = torch.zeros((q_len, 1)) - torch.nn.init.normal_(position_enc, mean=0.0, std=0.1) + nn.init.normal_(position_enc, mean=0.0, std=0.1) elif position_embedding_type == "uniform": position_enc = torch.zeros((q_len, 1)) nn.init.uniform_(position_enc, a=0.0, b=0.1) @@ -710,7 +710,7 @@ class PatchTSTPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize weights""" if self.config.use_cls_token: - torch.nn.init.normal_(self.config.cls_token, std=0.02) + nn.init.normal_(self.config.cls_token, std=0.02) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @@ -1362,7 +1362,6 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) self.head = MaskPretrainHead(config) - self.loss = torch.nn.MSELoss(reduction="none") # Initialize weights and apply final processing self.post_init() @@ -1408,7 +1407,8 @@ def forward( x_hat = self.head(model_output[0]) # calculate masked_loss - loss_val = self.loss(x_hat, model_output.patched_input) + loss = nn.MSELoss(reduction="none") + loss_val = loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) encoder_states = model_output.hidden_states @@ -1427,7 +1427,6 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PatchTSTClassificationHead(config) - self.loss = nn.CrossEntropyLoss() # Initialize weights and apply final processing self.post_init() @@ -1471,7 +1470,8 @@ def forward( loss_val = None if labels is not None: - loss_val = self.loss(y_hat, labels) + loss = nn.CrossEntropyLoss() + loss_val = loss(y_hat, labels) encoder_states = model_output.hidden_states if not return_dict: @@ -1578,10 +1578,8 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) if config.loss == "mse": - self.loss = nn.MSELoss(reduction="mean") self.distribution_output = None else: - self.loss = nll if config.distribution_output == "student_t": self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "normal": @@ -1643,11 +1641,12 @@ def forward( if future_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = self.loss(distribution, future_values) + loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) else: - loss_val = self.loss(y_hat, future_values) + loss = nn.MSELoss(reduction="mean") + loss_val = loss(y_hat, future_values) encoder_states = model_output.hidden_states if not return_dict: @@ -1790,10 +1789,8 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) if config.loss == "mse": - self.loss = nn.MSELoss(reduction="mean") self.distribution_output = None else: - self.loss = nll if config.distribution_output == "student_t": self.distribution_output = StudentTOutput(dim=config.prediction_length) elif config.distribution_output == "normal": @@ -1855,7 +1852,7 @@ def forward( distribution = self.distribution_output.distribution( y_hat, loc=model_output.loc, scale=model_output.scale ) - loss_val = self.loss(distribution, future_values) + loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) # for testing @@ -1863,7 +1860,8 @@ def 
forward( # loss_val = weighted_average(loss_val) else: y_hat = y_hat * model_output.scale + model_output.loc - loss_val = self.loss(y_hat, future_values) + loss = nn.MSELoss(reduction="mean") + loss_val = loss(y_hat, future_values) encoder_states = model_output.hidden_states loc = model_output.loc @@ -1986,10 +1984,8 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) if config.loss == "mse": - self.loss = nn.MSELoss(reduction="mean") self.distribution_output = None else: - self.loss = nll if config.distribution_output == "student_t": self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "normal": @@ -2049,11 +2045,12 @@ def forward( if labels is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = self.loss(distribution, labels) + loss_val = nll(distribution, labels) # take average of the loss loss_val = weighted_average(loss_val) else: - loss_val = self.loss(y_hat, labels) + loss = nn.MSELoss(reduction="mean") + loss_val = loss(y_hat, labels) encoder_states = model_output.hidden_states From 099b76cef747d355247e9208bab885eb2af8d310 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:57:07 +0200 Subject: [PATCH 121/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 36a5bc8e768ad8..7e7e5f4b0fb685 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -570,7 +570,7 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo hidden_state = mod(hidden_state) if output_hidden_states: all_hidden_states.append(hidden_state) - if output_hidden_states is None: + if output_hidden_states is False: return hidden_state, None return hidden_state, all_hidden_states From b9c935f60802240d15a4b48267a8dda6175d6cc2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:58:01 +0200 Subject: [PATCH 122/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7e7e5f4b0fb685..6679968c45ca95 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -550,7 +550,7 @@ def __init__(self, config: PatchTSTConfig): self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): + def forward(self, hidden_state: torch.Tensor, output_hidden_states: bool = False): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): From b8d59f8f281f3d5f897a891339ce412efa209458 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:58:12 +0200 Subject: [PATCH 123/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- 
src/transformers/models/patchtst/modeling_patchtst.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6679968c45ca95..a455358265b9da 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -205,7 +205,8 @@ class PatchTSTTranspose(nn.Module): def __init__(self, *dims, contiguous=False): super().__init__() - self.dims, self.contiguous = dims, contiguous + self.dims = dims + self.contiguous = dims def forward(self, inputs: torch.Tensor): """ From b7c04c746b0b577ac0eec701d0c44191feb046f0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:58:34 +0200 Subject: [PATCH 124/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- src/transformers/models/patchtst/modeling_patchtst.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a455358265b9da..d435287b59341e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -488,6 +488,7 @@ def __init__( mask_value: int = 0, seed_number: Optional[int] = None, ): + super().__init__() self.mask_ratio = mask_ratio self.channel_consistent_masking = channel_consistent_masking self.mask_type = mask_type From c920eee26aa4195cb17ad29d50fa59856bf297d9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:58:53 +0200 Subject: [PATCH 125/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- src/transformers/models/patchtst/modeling_patchtst.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d435287b59341e..4cbf2d2177dff1 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -500,7 +500,6 @@ def __init__( self.unmasked_channel_indices.sort() self.seed_number = seed_number - super().__init__() def forward(self, patch_input: torch.Tensor): """ From 6642ab937a76a2bb28b960b954d6efe23867c629 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 11:42:44 +0200 Subject: [PATCH 126/189] formatting --- src/transformers/models/patchtst/modeling_patchtst.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4cbf2d2177dff1..6be095e368a700 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -500,7 +500,6 @@ def __init__( self.unmasked_channel_indices.sort() self.seed_number = seed_number - def forward(self, patch_input: torch.Tensor): """ Parameters: From 78697674d890f5b62aa8768171a29a1bfd9793e9 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 19 Oct 2023 08:35:55 -0400 Subject: [PATCH 127/189] fix a bug when pre_norm is set to True --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6be095e368a700..d6414599ddd743 100755 --- 
a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -648,7 +648,7 @@ def forward(self, hidden_state: torch.Tensor): if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src)[0]) + self.self_attn(self.norm_sublayer1(src))[0] ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT @@ -669,7 +669,7 @@ def forward(self, hidden_state: torch.Tensor): if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src)[0]) + self.self_attn(self.norm_sublayer2(src))[0] ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm From 2fb741742c9b3741c56256c52aae77fbfcf0af4b Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 19 Oct 2023 14:53:43 -0400 Subject: [PATCH 128/189] output_hidden_states is set to False as default --- .../models/patchtst/modeling_patchtst.py | 50 +++++++------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d6414599ddd743..f528a80769f52b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -770,7 +770,7 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = False ) -> BaseModelOutputWithNoAttention: """ Parameters: @@ -783,9 +783,6 @@ def forward( """ _, num_input_channels, _, _ = past_values.shape - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) # Input encoding if not self.shared_embedding: x_out = [] @@ -1286,12 +1283,10 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if past_observed_mask is None: @@ -1370,7 +1365,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ @@ -1391,9 +1386,7 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # past_values: [bs x num_channels x num_patches x d_model] or @@ -1436,7 +1429,7 @@ def forward( past_values: torch.Tensor, labels: torch.Tensor = None, past_observed_mask: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: 
Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: """ @@ -1458,9 +1451,7 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model( @@ -1601,8 +1592,8 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: """ Parameters: @@ -1624,9 +1615,7 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output @@ -1684,7 +1673,7 @@ def generate( past_values=past_values, future_values=None, past_observed_mask=past_observed_mask, - output_hidden_states=None, + output_hidden_states=False, ) # get distribution @@ -1810,7 +1799,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForForecastingOutput]: """ @@ -1833,9 +1822,6 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output @@ -1909,7 +1895,7 @@ def generate( past_values=past_values, future_values=None, past_observed_mask=past_observed_mask, - output_hidden_states=None, + output_hidden_states=False, ) # get distribution @@ -2007,7 +1993,7 @@ def forward( past_values: torch.Tensor, labels: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForRegressionOutput]: """ @@ -2030,9 +2016,6 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model( @@ -2086,7 +2069,10 @@ def generate( # get model output outputs = self( - past_values=past_values, labels=None, past_observed_mask=past_observed_mask, output_hidden_states=None + past_values=past_values, + labels=None, + past_observed_mask=past_observed_mask, + output_hidden_states=False ) # get distribution From 9168ca28a46fbe474db696ac8d1a2f6c6e163969 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 19 Oct 2023 14:54:12 -0400 Subject: [PATCH 129/189] set pre_norm=True as default --- src/transformers/models/patchtst/configuration_patchtst.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 
27f762459cd52b..e5c78104d3262d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -82,8 +82,9 @@ class PatchTSTConfig(PretrainedConfig): Consider bias in the feed-forward networks. activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. - pre_norm (`bool`, *optional*, defaults to `False`): - TODO + pre_norm (`bool`, *optional*, defaults to `True`): + Normalization is applied before self-attention if pre_norm is set to True. Otherwise, normalization is + applied after residual block. positional_encoding (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. learn_pe (`bool`, *optional*, defaults to `False`): @@ -173,7 +174,7 @@ def __init__( ff_dropout: float = 0.0, bias: bool = True, activation_function: str = "gelu", - pre_norm: bool = False, + pre_norm: bool = True, positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, From 7829a57c9310c2051b68ddef76fc64e967745817 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 19 Oct 2023 22:15:12 -0400 Subject: [PATCH 130/189] format docstring --- .../models/patchtst/modeling_patchtst.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f528a80769f52b..b296b5d20072bc 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -555,7 +555,7 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: bool = False Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): Past values of the time series - output_hidden_states (`bool`, *optional*): + output_hidden_states (`bool`, *optional*, default to False): output hidden state option Return: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`) @@ -860,7 +860,7 @@ def forward( For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of variates in the time series per time step. - output_hidden_states (`bool`, *optional*): + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers. """ @@ -1378,7 +1378,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1443,7 +1443,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: @@ -1607,7 +1607,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1814,7 +1814,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -2008,7 +2008,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): target labels associates with the `past_values` - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: From 5cc98cba747f89c253308f244933e583853a780a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 20 Oct 2023 08:24:04 +0200 Subject: [PATCH 131/189] format --- .../models/patchtst/modeling_patchtst.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b296b5d20072bc..322fe6effca0b7 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1286,7 +1286,6 @@ def forward( output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if past_observed_mask is None: @@ -1378,7 +1377,8 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1443,7 +1443,8 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
- output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1607,7 +1608,8 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1814,7 +1816,8 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -2008,7 +2011,8 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): target labels associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: @@ -2069,10 +2073,7 @@ def generate( # get model output outputs = self( - past_values=past_values, - labels=None, - past_observed_mask=past_observed_mask, - output_hidden_states=False + past_values=past_values, labels=None, past_observed_mask=past_observed_mask, output_hidden_states=False ) # get distribution From 3a09a1e7cf267ad7c128658456e6856166b2cd23 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 20 Oct 2023 13:16:48 +0200 Subject: [PATCH 132/189] output_hidden_states is None by default --- .../models/patchtst/modeling_patchtst.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 322fe6effca0b7..6fb7d69b5b3aab 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -770,7 +770,7 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = False + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ Parameters: @@ -808,6 +808,9 @@ def forward( ) # x: [bs x num_channels x num_patches x d_model] # Encoder + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) past_values, hidden_states = self.encoder( past_values, output_hidden_states ) # x: [bs x num_channels x num_patches x d_model] @@ -1283,7 +1286,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1364,7 +1367,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ @@ -1429,7 +1432,7 @@ def forward( past_values: torch.Tensor, labels: torch.Tensor = None, past_observed_mask: Optional[bool] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: """ @@ -1593,7 +1596,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: """ @@ -1801,7 +1804,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForForecastingOutput]: """ @@ -1996,7 +1999,7 @@ def forward( past_values: torch.Tensor, labels: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, 
return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForRegressionOutput]: """ From 1777fc3b429542ce1f727b40a7eb23ec4cbd6403 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 20 Oct 2023 16:06:47 +0200 Subject: [PATCH 133/189] add missing docs --- src/transformers/models/patchtst/configuration_patchtst.py | 5 +++-- src/transformers/models/patchtst/modeling_patchtst.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e5c78104d3262d..d7336b1b0b5742 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -107,9 +107,10 @@ class PatchTSTConfig(PretrainedConfig): mask_ratio (`float`, *optional*, defaults to 0.5): Masking ratio is applied to mask the input data during pretraining. mask_patches (`List`, *optional*, defaults to `[2, 3]`): - TODO + List of patch lengths to mask in the end of the data. mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): - TODO + List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + then equal weights to both patch lengths. Defaults to None. channel_consistent_masking (`bool`, *optional*, defaults to `False`): If channel consistent masking is True, all the channels will have the same masking. unmasked_channel_indices (`list`, *optional*): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6fb7d69b5b3aab..9190f9b36d0dca 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -459,8 +459,9 @@ class PatchTSTMasking(nn.Module): mask_type (`str`, *optional*): Masking type. Allowed values are random, forecast. Defaults to random. mask_ratio (`float`, *optional*): Mask ratio. mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. - mask_patch_ratios (`list`, *optional*): List of weights to use for each patch length. For Ex. - if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. + mask_patch_ratios (`list`, *optional*): + List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. 
channel_consistent_masking (`bool`, *optional*): From 21803e0f5bc1a51c6bb47fe19cd50869ef9b419e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 20 Oct 2023 16:12:39 +0200 Subject: [PATCH 134/189] better var names --- .../models/patchtst/modeling_patchtst.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9190f9b36d0dca..8ce3a9b33cc9d9 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -360,11 +360,11 @@ def forecast_masking( total_length = 0 total_ratio = sum(mix_ratio) - for i, j in zip(patch_lengths, mix_ratio): - if i <= 0 or i >= sequence_length: + for patch_length, ratio in zip(patch_lengths, mix_ratio): + if patch_length <= 0 or patch_length >= sequence_length: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") - temp_len = int(batch_size * j / total_ratio) - t_list.append([i, j, temp_len]) + temp_len = int(batch_size * ratio / total_ratio) + t_list.append([patch_length, ratio, temp_len]) total_length += temp_len t_list = sorted(t_list, key=lambda x: x[2]) @@ -374,11 +374,11 @@ def forecast_masking( elif total_length > batch_size: t_list[-1][2] = t_list[-1][2] + (total_length - batch_size) - b1 = 0 - for p, _, l in t_list: - b2 = b1 + l - mask[b1:b2, :, -p:] = 1 - b1 = b2 + batch1 = 0 + for patch_len, _, temp_len in t_list: + batch2 = batch1 + temp_len + mask[batch1:batch2, :, -patch_len:] = 1 + batch1 = batch2 perm = torch.randperm(mask.shape[0]) mask = mask[perm] From 87068ca095df60343d46b15f0816dd73272b15db Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 20 Oct 2023 11:06:12 -0400 Subject: [PATCH 135/189] docstring: remove default to False in output_hidden_states --- src/transformers/models/patchtst/modeling_patchtst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 8ce3a9b33cc9d9..04cac532c0ed7f 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1381,7 +1381,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. @@ -1447,7 +1447,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. @@ -1612,7 +1612,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. @@ -1820,7 +1820,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. @@ -2015,7 +2015,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): target labels associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. From 4c8c7d0ccde947f7247e02396dca0f42896b29fa Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 22 Oct 2023 23:30:59 -0400 Subject: [PATCH 136/189] change labels name to target_values in regression task --- .../models/patchtst/modeling_patchtst.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 04cac532c0ed7f..5595d92e327b23 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1998,7 +1998,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - labels: Optional[torch.Tensor], + target_values: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -2013,8 +2013,8 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): - target labels associates with the `past_values` + target_values (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): + target values associates with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
@@ -2033,15 +2033,15 @@ def forward( y_hat = self.head(model_output.last_hidden_state) loss_val = None - if labels is not None: + if target_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = nll(distribution, labels) + loss_val = nll(distribution, target_values) # take average of the loss loss_val = weighted_average(loss_val) else: loss = nn.MSELoss(reduction="mean") - loss_val = loss(y_hat, labels) + loss_val = loss(y_hat, target_values) encoder_states = model_output.hidden_states @@ -2077,7 +2077,7 @@ def generate( # get model output outputs = self( - past_values=past_values, labels=None, past_observed_mask=past_observed_mask, output_hidden_states=False + past_values=past_values, target_values=None, past_observed_mask=past_observed_mask, output_hidden_states=False ) # get distribution From dccbc31583eed8f71ff6e1febd8f989f5ea42b3d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 23 Oct 2023 11:45:34 +0200 Subject: [PATCH 137/189] format --- src/transformers/models/patchtst/modeling_patchtst.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5595d92e327b23..3730eec898f67a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -2077,7 +2077,10 @@ def generate( # get model output outputs = self( - past_values=past_values, target_values=None, past_observed_mask=past_observed_mask, output_hidden_states=False + past_values=past_values, + target_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=False, ) # get distribution From 3d12866ace7b7ca75a42431f6c884a55037952f6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 23 Oct 2023 11:56:09 +0200 Subject: [PATCH 138/189] fix tests --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- tests/models/patchtst/test_modeling_patchtst.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3730eec898f67a..ced40e2d3c1f44 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1998,7 +1998,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - target_values: Optional[torch.Tensor], + target_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -2013,7 +2013,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
- target_values (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): + target_values (`torch.Tensor` of shape `(bs, num_input_channels)`): target values associates with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index eaa1033cc0fc1a..496cfc8301aa86 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -199,8 +199,10 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict.pop("future_values") elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): rng = random.Random(self.model_tester.seed_number) - labels = floats_tensor([self.model_tester.batch_size, self.model_tester.num_output_channels], rng=rng) - inputs_dict["labels"] = labels + target_values = floats_tensor( + [self.model_tester.batch_size, self.model_tester.num_output_channels], rng=rng + ) + inputs_dict["target_values"] = target_values inputs_dict.pop("future_values") return inputs_dict @@ -281,7 +283,9 @@ def test_forward_signature(self): ): expected_arg_names.remove("future_values") expected_arg_names.remove("past_observed_mask") - expected_arg_names.append("labels") + expected_arg_names.append("labels") if model_class in get_values( + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING + ) else expected_arg_names.append("target_values") expected_arg_names.append("past_observed_mask") expected_arg_names.extend( [ From c489972f864c98f76505ed49ade0bcda142a89ce Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 23 Oct 2023 12:32:28 -0400 Subject: [PATCH 139/189] change to forecast_mask_ratios and random_mask_ratio --- .../models/patchtst/configuration_patchtst.py | 8 ++++---- .../models/patchtst/modeling_patchtst.py | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index d7336b1b0b5742..bd4733f0da607d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -104,11 +104,11 @@ class PatchTSTConfig(PretrainedConfig): Apply masking during the pretraining. mask_type (`str`, *optional*, defaults to `"random"`): Masking type. Only `"random"` is currently supported. - mask_ratio (`float`, *optional*, defaults to 0.5): - Masking ratio is applied to mask the input data during pretraining. - mask_patches (`List`, *optional*, defaults to `[2, 3]`): + random_mask_ratio (`float`, *optional*, defaults to 0.5): + Masking ratio is applied to mask the input data during random pretraining. + forecast_mask_patches (`List`, *optional*, defaults to `[2, 3]`): List of patch lengths to mask in the end of the data. - mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): + forecast_mask_ratios (`List`, *optional*, defaults to `[1, 1]`): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. 
channel_consistent_masking (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5595d92e327b23..170ca4c264f2e2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -317,13 +317,13 @@ def random_masking( def forecast_masking( inputs: torch.Tensor, - patch_lengths: list, + forecast_mask_patches: list, mix_ratio: list = None, unmasked_channel_indices: list = None, mask_value: int = 0, seed_number: Optional[int] = None, ): - """Forecast masking that masks the last K patches where K is from the patch_lengths list. + """Forecast masking that masks the last K patches where K is from the forecast_mask_patches list. For every batch, distribute the patch lengths based on mix_ratio and ignore masks for column indices mentioned in unmasked_channel_indices. @@ -331,10 +331,10 @@ def forecast_masking( inputs (`torch.Tensor`): Input of shape `(bs, num_channels, num_patch, patch_len)` or `(bs, tsg1, tag2, num_channels, num_patch, patch_len)` - patch_lengths (`list`): + forecast_mask_patches (`list`): [2, 4] List of patch lengths to mask in the end of the data. - mix_ratio (`list`, *optional*): - List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + mix_ratio (`list`, *optional*): [0.7, 0.3] + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked. Defaults to None. @@ -351,7 +351,7 @@ def forecast_masking( set_seed(seed_number) if mix_ratio is None: - mix_ratio = [1 for _ in patch_lengths] + mix_ratio = [1 for _ in forecast_mask_patches] batch_size, num_channels, sequence_length, num_features = inputs.shape mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device) @@ -360,7 +360,7 @@ def forecast_masking( total_length = 0 total_ratio = sum(mix_ratio) - for patch_length, ratio in zip(patch_lengths, mix_ratio): + for patch_length, ratio in zip(forecast_mask_patches, mix_ratio): if patch_length <= 0 or patch_length >= sequence_length: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") temp_len = int(batch_size * ratio / total_ratio) @@ -460,7 +460,7 @@ class PatchTSTMasking(nn.Module): mask_ratio (`float`, *optional*): Mask ratio. mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. mask_patch_ratios (`list`, *optional*): - List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. 
@@ -527,7 +527,7 @@ def forward(self, patch_input: torch.Tensor): elif self.mask_type == "forecast": masked_input, mask = forecast_masking( inputs=patch_input, - patch_lengths=self.mask_patches, + forecast_mask_patches=self.forecast_mask_patches, mix_ratio=self.mask_patch_ratios, unmasked_channel_indices=self.unmasked_channel_indices, mask_value=self.mask_value, From 6318cd3d2741577149a1ebc56164800d86167a92 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 23 Oct 2023 16:08:51 -0400 Subject: [PATCH 140/189] change mask names --- .../models/patchtst/configuration_patchtst.py | 12 ++--- .../models/patchtst/modeling_patchtst.py | 46 +++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index bd4733f0da607d..b3fad61f911f5d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -186,9 +186,9 @@ def __init__( # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", - mask_ratio: float = 0.5, - mask_patches: List[int] = [2, 3], - mask_patch_ratios: List[int] = [1, 1], + random_mask_ratio: float = 0.5, + forecast_mask_patches: List[int] = [2, 3], + forecast_mask_ratios: List[int] = [1, 1], channel_consistent_masking: bool = False, unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, @@ -240,9 +240,9 @@ def __init__( self.seed_number = seed_number self.mask_input = mask_input self.mask_type = mask_type - self.mask_ratio = mask_ratio - self.mask_patches = mask_patches - self.mask_patch_ratios = mask_patch_ratios + self.random_mask_ratio = random_mask_ratio # for random masking + self.forecast_mask_patches = forecast_mask_patches # for forecast masking + self.forecast_mask_ratios = forecast_mask_ratios self.channel_consistent_masking = channel_consistent_masking self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 86f9cf9aeb0047..5bb1a5cb716815 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -318,13 +318,13 @@ def random_masking( def forecast_masking( inputs: torch.Tensor, forecast_mask_patches: list, - mix_ratio: list = None, + forecast_mask_ratios: list = None, unmasked_channel_indices: list = None, mask_value: int = 0, seed_number: Optional[int] = None, ): """Forecast masking that masks the last K patches where K is from the forecast_mask_patches list. - For every batch, distribute the patch lengths based on mix_ratio and ignore masks for column indices mentioned in + For every batch, distribute the patch lengths based on forecast_mask_ratios and ignore masks for column indices mentioned in unmasked_channel_indices. Parameters: @@ -333,8 +333,8 @@ def forecast_masking( patch_len)` forecast_mask_patches (`list`): [2, 4] List of patch lengths to mask in the end of the data. - mix_ratio (`list`, *optional*): [0.7, 0.3] - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and mix_ratio is [1,1], + forecast_mask_ratios (`list`, *optional*): [0.7, 0.3] + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. 
unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked. Defaults to None. @@ -350,17 +350,17 @@ def forecast_masking( if seed_number: set_seed(seed_number) - if mix_ratio is None: - mix_ratio = [1 for _ in forecast_mask_patches] + if forecast_mask_ratios is None: + forecast_mask_ratios = [1 for _ in forecast_mask_patches] batch_size, num_channels, sequence_length, num_features = inputs.shape mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device) t_list = [] total_length = 0 - total_ratio = sum(mix_ratio) + total_ratio = sum(forecast_mask_ratios) - for patch_length, ratio in zip(forecast_mask_patches, mix_ratio): + for patch_length, ratio in zip(forecast_mask_patches, forecast_mask_ratios): if patch_length <= 0 or patch_length >= sequence_length: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") temp_len = int(batch_size * ratio / total_ratio) @@ -457,10 +457,10 @@ class PatchTSTMasking(nn.Module): Parameters: mask_type (`str`, *optional*): Masking type. Allowed values are random, forecast. Defaults to random. - mask_ratio (`float`, *optional*): Mask ratio. - mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. - mask_patch_ratios (`list`, *optional*): - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and mix_ratio is [1,1], + random_mask_ratio (`float`, *optional*): Mask ratio for random pretraining. + forecast_mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. + forecast_mask_ratios (`list`, *optional*): + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. 
@@ -481,20 +481,20 @@ class PatchTSTMasking(nn.Module): def __init__( self, mask_type: str = "random", - mask_ratio: float = 0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], + random_mask_ratio: float = 0.5, + forecast_mask_patches: list = [2, 3], + forecast_mask_ratios: list = [1, 1], channel_consistent_masking: bool = False, unmasked_channel_indices: list = None, mask_value: int = 0, seed_number: Optional[int] = None, ): super().__init__() - self.mask_ratio = mask_ratio + self.random_mask_ratio = random_mask_ratio self.channel_consistent_masking = channel_consistent_masking self.mask_type = mask_type - self.mask_patches = mask_patches - self.mask_patch_ratios = mask_patch_ratios + self.forecast_mask_patches = forecast_mask_patches + self.forecast_mask_ratios = forecast_mask_ratios self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value if self.unmasked_channel_indices is not None: @@ -518,7 +518,7 @@ def forward(self, patch_input: torch.Tensor): if self.mask_type == "random": masked_input, mask = random_masking( inputs=patch_input, - mask_ratio=self.mask_ratio, + mask_ratio=self.random_mask_ratio, unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, @@ -528,7 +528,7 @@ def forward(self, patch_input: torch.Tensor): masked_input, mask = forecast_masking( inputs=patch_input, forecast_mask_patches=self.forecast_mask_patches, - mix_ratio=self.mask_patch_ratios, + forecast_mask_ratios=self.forecast_mask_ratios, unmasked_channel_indices=self.unmasked_channel_indices, mask_value=self.mask_value, seed_number=self.seed_number, @@ -1267,9 +1267,9 @@ def __init__(self, config: PatchTSTConfig): if self.mask_input: self.masking = PatchTSTMasking( mask_type=config.mask_type, - mask_ratio=config.mask_ratio, - mask_patches=config.mask_patches, - mask_patch_ratios=config.mask_patch_ratios, + random_mask_ratio=config.random_mask_ratio, + forecast_mask_patches=config.forecast_mask_patches, + forecast_mask_ratios=config.forecast_mask_ratios, channel_consistent_masking=config.channel_consistent_masking, unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, From 6734b654548baf24b41d6b8e8687315c41837aa9 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 23 Oct 2023 17:29:20 -0400 Subject: [PATCH 141/189] change future_values to target_values param in the prediction class --- .../models/patchtst/modeling_patchtst.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5bb1a5cb716815..e8324efadeeb1f 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1596,7 +1596,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - future_values: Optional[torch.Tensor] = None, + target_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: @@ -1610,7 +1610,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
- future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): + target_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers @@ -1633,15 +1633,15 @@ def forward( y_hat = self.head(model_output.last_hidden_state) loss_val = None - if future_values is not None: + if target_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = nll(distribution, future_values) + loss_val = nll(distribution, target_values) # take average of the loss loss_val = weighted_average(loss_val) else: loss = nn.MSELoss(reduction="mean") - loss_val = loss(y_hat, future_values) + loss_val = loss(y_hat, target_values) encoder_states = model_output.hidden_states if not return_dict: @@ -1677,7 +1677,7 @@ def generate( # get model output outputs = self( past_values=past_values, - future_values=None, + target_values=None, past_observed_mask=past_observed_mask, output_hidden_states=False, ) From 8a7f2a06255184a304782ae6fd3ae60263b294b7 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 23 Oct 2023 18:03:40 -0400 Subject: [PATCH 142/189] remove nn.Sequential and make PatchTSTBatchNorm class --- .../models/patchtst/modeling_patchtst.py | 51 ++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e8324efadeeb1f..f3ca8a0d57b60e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -196,11 +196,12 @@ def forward( class PatchTSTTranspose(nn.Module): - """ - Parameters: + """ Transpose the tensor to the dimension defined in **dims** - dims (`list`): list of dimensions to be transposed contiguous (`bool`): if True, the transposed tensor is - contiguous + + Parameters: + dims (`list`): list of dimensions to be transposed + contiguous (`bool`, default to False): if True, the transposed tensor is contiguous """ def __init__(self, *dims, contiguous=False): @@ -221,6 +222,32 @@ def forward(self, inputs: torch.Tensor): return inputs.transpose(*self.dims) +class PatchTSTBatchNorm(nn.Module): + """ + Compute batch normalization + Parameters: + d_model (`int`): model dimension + """ + def __init__(self, d_model): + super().__init__() + self.d_model = d_model + self.transpose = PatchTSTTranspose(1, 2) + self.batchnorm = nn.BatchNorm1d(self.d_model) + + def forward(self, inputs: torch.Tensor): + """ + Parameters: + inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): + input for Batch norm calculation + Returns: + `torch.Tensor`: tensor + """ + output = self.transpose(inputs) # output: (batch_size, d_model, sequence_length) + output = self.batchnorm(output) + output = self.transpose(output) # output: (batch_size, sequence_length, d_model) + return output + + def positional_encoding(position_embedding_type, learned, q_len, d_model): # Positional encoding if position_embedding_type is None: @@ -596,9 +623,7 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of the sublayer 1 self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer1 = nn.Sequential( - PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) 
- ) + self.norm_sublayer1 = PatchTSTBatchNorm(config.d_model) else: self.norm_sublayer1 = nn.LayerNorm(config.d_model) @@ -606,9 +631,7 @@ def __init__(self, config: PatchTSTConfig): if self.channel_attention: self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer2 = nn.Sequential( - PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) - ) + self.norm_sublayer2 = PatchTSTBatchNorm(config.d_model) else: self.norm_sublayer2 = nn.LayerNorm(config.d_model) @@ -623,9 +646,7 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of sublayer 3 self.dropout_path3 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer3 = nn.Sequential( - PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) - ) + self.norm_sublayer3 = PatchTSTBatchNorm(config.d_model) else: self.norm_sublayer3 = nn.LayerNorm(config.d_model) @@ -1476,10 +1497,6 @@ def forward( class PatchTSTClassificationHead(nn.Module): - """ - Classification head - """ - def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token From f23ff20dedc8c51f56afc7e4ee43c0902e269574 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 24 Oct 2023 09:48:10 +0200 Subject: [PATCH 143/189] black --- .../models/patchtst/modeling_patchtst.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f3ca8a0d57b60e..0d48cda7270938 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -196,11 +196,11 @@ def forward( class PatchTSTTranspose(nn.Module): - """ + """ Transpose the tensor to the dimension defined in **dims** Parameters: - dims (`list`): list of dimensions to be transposed + dims (`list`): list of dimensions to be transposed contiguous (`bool`, default to False): if True, the transposed tensor is contiguous """ @@ -224,10 +224,11 @@ def forward(self, inputs: torch.Tensor): class PatchTSTBatchNorm(nn.Module): """ + Parameters: Compute batch normalization - Parameters: d_model (`int`): model dimension """ + def __init__(self, d_model): super().__init__() self.d_model = d_model @@ -237,16 +238,16 @@ def __init__(self, d_model): def forward(self, inputs: torch.Tensor): """ Parameters: - inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): + inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): input for Batch norm calculation Returns: `torch.Tensor`: tensor """ - output = self.transpose(inputs) # output: (batch_size, d_model, sequence_length) + output = self.transpose(inputs) # output: (batch_size, d_model, sequence_length) output = self.batchnorm(output) - output = self.transpose(output) # output: (batch_size, sequence_length, d_model) + output = self.transpose(output) # output: (batch_size, sequence_length, d_model) return output - + def positional_encoding(position_embedding_type, learned, q_len, d_model): # Positional encoding @@ -351,8 +352,8 @@ def forecast_masking( seed_number: Optional[int] = None, ): """Forecast masking that masks the last K patches where K is from the forecast_mask_patches list. 
- For every batch, distribute the patch lengths based on forecast_mask_ratios and ignore masks for column indices mentioned in - unmasked_channel_indices. + For every batch, distribute the patch lengths based on forecast_mask_ratios and ignore masks for column indices + mentioned in unmasked_channel_indices. Parameters: inputs (`torch.Tensor`): @@ -361,8 +362,8 @@ def forecast_masking( forecast_mask_patches (`list`): [2, 4] List of patch lengths to mask in the end of the data. forecast_mask_ratios (`list`, *optional*): [0.7, 0.3] - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], - then equal weights to both patch lengths. Defaults to None. + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and + forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked. Defaults to None. mask_value (`int`, *optional* defaults to 0): @@ -487,8 +488,8 @@ class PatchTSTMasking(nn.Module): random_mask_ratio (`float`, *optional*): Mask ratio for random pretraining. forecast_mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. forecast_mask_ratios (`list`, *optional*): - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], - then equal weights to both patch lengths. Defaults to None. + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and + forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. channel_consistent_masking (`bool`, *optional*): From d6eebdb3b6f846da44acd2b7d874b25388939b40 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 24 Oct 2023 13:22:56 +0200 Subject: [PATCH 144/189] fix argument name for prediction --- src/transformers/models/patchtst/modeling_patchtst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 0d48cda7270938..e1cbc33511b6f4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1614,7 +1614,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - target_values: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: @@ -1628,7 +1628,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
- target_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): + future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers @@ -1651,15 +1651,15 @@ def forward( y_hat = self.head(model_output.last_hidden_state) loss_val = None - if target_values is not None: + if future_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = nll(distribution, target_values) + loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) else: loss = nn.MSELoss(reduction="mean") - loss_val = loss(y_hat, target_values) + loss_val = loss(y_hat, future_values) encoder_states = model_output.hidden_states if not return_dict: From 61b9da5b54150b41e71e7d8d89d1f9c19ba69cf9 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 24 Oct 2023 17:14:06 -0400 Subject: [PATCH 145/189] add output_attentions option --- .../models/patchtst/modeling_patchtst.py | 136 ++++++++++++------ 1 file changed, 92 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e1cbc33511b6f4..2f734b266b1afb 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -22,7 +22,7 @@ from torch import nn from ...activations import ACT2CLS -from ...modeling_outputs import BaseModelOutputWithNoAttention +from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...trainer_utils import set_seed @@ -653,7 +653,10 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, hidden_state: torch.Tensor): + def forward(self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = None + ): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -662,23 +665,31 @@ def forward(self, hidden_state: torch.Tensor): `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ - batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape + batch_size, num_input_channels, sequence_length, d_model = hidden_states.shape # First sublayer: attention across time - src = hidden_state.view( + hidden_states = hidden_states.view( batch_size * num_input_channels, sequence_length, d_model - ) # src: [(bs*num_channels) x sequence_length x d_model] + ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] + if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src))[0] - ) # Add: residual connection with residual dropout + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=self.norm_sublayer1(hidden_states), + output_attentions=output_attentions + ) + hidden_states = hidden_states + self.dropout_path1(hidden_states) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer1( - src + self.dropout_path1(self.self_attn(src)[0]) - ) # src: [(bs*num_channels) x 
sequence_length x d_model] - src = src.reshape( + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + output_attentions=output_attentions + ) + hidden_states = self.norm_sublayer1( + hidden_states + self.dropout_path1(hidden_states) + ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] + + hidden_states = hidden_states.reshape( batch_size, num_input_channels, sequence_length, d_model ) # [bs x num_channels x sequence_length x d_model] @@ -686,42 +697,55 @@ def forward(self, hidden_state: torch.Tensor): # [bs x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] # -> [(bs*sequence_length) x num_channels x d_model] if self.channel_attention: - src = ( - src.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) + hidden_states = ( + hidden_states.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) ) # [(bs*sequence_length) x num_channels x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src))[0] - ) # Add: residual connection with residual dropout + hidden_states, channel_attn_weights, _ = self.self_attn( + hidden_states=self.norm_sublayer2(hidden_states), + output_attentions=output_attentions + ) + hidden_states = hidden_states + self.dropout_path2(hidden_states) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src)[0]) - ) # src: [(bs*sequence_length) x num_channels x d_model] - src = ( - src.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() + hidden_states, channel_attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + output_attentions=output_attentions + ) + hidden_states = self.norm_sublayer2( + hidden_states + self.dropout_path2(hidden_states) + ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] + + hidden_states = ( + hidden_states.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view( + hidden_states = hidden_states.view( batch_size * num_input_channels, sequence_length, d_model ) # src: [(batch_size*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection - src = src + self.dropout_path3( - self.ff(self.norm_sublayer3(src)) + hidden_states = hidden_states + self.dropout_path3( + self.ff(self.norm_sublayer3(hidden_states)) ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - src = self.norm_sublayer3( - src + self.dropout_path3(self.ff(src)) + hidden_states = self.norm_sublayer3( + hidden_states + self.dropout_path3(self.ff(hidden_states)) ) # Add: residual connection with residual dropout - src = src.reshape( + + hidden_states = hidden_states.reshape( batch_size, num_input_channels, sequence_length, d_model ) # [bs x num_channels x sequence_length x d_model] - return src + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights, channel_attn_weights) + + return outputs class PatchTSTPreTrainedModel(PreTrainedModel): @@ -788,22 +812,30 @@ def __init__(self, config: PatchTSTConfig): # 
Encoder self.encoder = PatchTSTEncoderBlock(config) + self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) # Initialize weights and apply final processing self.post_init() def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None - ) -> BaseModelOutputWithNoAttention: + self, + past_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + ) -> BaseModelOutput: """ Parameters: past_values (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Past values of the time series - output_hidden_states (bool, optional): Indicates if hidden states should be output. + output_hidden_states (bool, optional): Indicates if hidden states should be outputted. + output_attentions (bool, optional): Indicates if attentions should be outputted. return: - `BaseModelOutputWithNoAttention` + `BaseModelOutput` """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + _, num_input_channels, _, _ = past_values.shape # Input encoding @@ -822,25 +854,38 @@ def forward( # append cls token cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - past_values = torch.cat( + hidden_states = torch.cat( (cls_tokens, past_values), dim=1 ) # x: [bs x num_channels x (num_patches+1) x d_model] else: - past_values = self.positional_dropout( + hidden_states = self.positional_dropout( past_values + self.position_enc ) # x: [bs x num_channels x num_patches x d_model] - # Encoder - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - past_values, hidden_states = self.encoder( - past_values, output_hidden_states - ) # x: [bs x num_channels x num_patches x d_model] - # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + layer_outputs = encoder_layer( + hidden_states=hidden_states, + output_attentions=output_attentions, + ) + # get hidden state + hidden_states = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] + # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + # append layer attention + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) + return BaseModelOutput( + last_hidden_state=past_values, + hidden_states=encoder_states, + attentions=all_attentions + ) PATCHTST_START_DOCSTRING = r""" @@ -1310,9 +1355,12 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if 
output_attentions is not None else self.config.output_attentions if past_observed_mask is None: past_observed_mask = torch.ones_like(past_values) From 2b84706813de8cec5e36482bd9a0c70f1051aedc Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 24 Oct 2023 23:31:02 -0400 Subject: [PATCH 146/189] add output_attentions to PatchTSTEncoder --- src/transformers/models/patchtst/modeling_patchtst.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 2f734b266b1afb..627a8383d6536e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -811,7 +811,7 @@ def __init__(self, config: PatchTSTConfig): ) # Encoder - self.encoder = PatchTSTEncoderBlock(config) + # self.encoder = PatchTSTEncoderBlock(config) self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) # Initialize weights and apply final processing @@ -1374,7 +1374,10 @@ def forward( masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None - encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) + encoder_output = self.encoder(masked_values, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions + ) hidden_states = encoder_output.last_hidden_state encoder_states = encoder_output.hidden_states From 0be64406146ba9076c0523cf81e76a64c44a29b6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 25 Oct 2023 08:29:37 +0200 Subject: [PATCH 147/189] formatting --- .../models/patchtst/modeling_patchtst.py | 57 +++++++++---------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 627a8383d6536e..a572b80dd7fa28 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -653,10 +653,7 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, - hidden_states: torch.Tensor, - output_attentions: Optional[bool] = None - ): + def forward(self, hidden_states: torch.Tensor, output_attentions: Optional[bool] = None): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -675,15 +672,15 @@ def forward(self, if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection hidden_states, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_states), - output_attentions=output_attentions - ) - hidden_states = hidden_states + self.dropout_path1(hidden_states) # Add: residual connection with residual dropout + hidden_states=self.norm_sublayer1(hidden_states), output_attentions=output_attentions + ) + hidden_states = hidden_states + self.dropout_path1( + hidden_states + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT hidden_states, attn_weights, _ = self.self_attn( - hidden_states=hidden_states, - output_attentions=output_attentions + hidden_states=hidden_states, output_attentions=output_attentions ) hidden_states = self.norm_sublayer1( hidden_states + self.dropout_path1(hidden_states) @@ -698,27 +695,31 @@ def forward(self, # -> [(bs*sequence_length) x 
num_channels x d_model] if self.channel_attention: hidden_states = ( - hidden_states.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) + hidden_states.transpose(2, 1) + .contiguous() + .view(batch_size * sequence_length, num_input_channels, d_model) ) # [(bs*sequence_length) x num_channels x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection hidden_states, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_states), - output_attentions=output_attentions + hidden_states=self.norm_sublayer2(hidden_states), output_attentions=output_attentions ) - hidden_states = hidden_states + self.dropout_path2(hidden_states) # Add: residual connection with residual dropout + hidden_states = hidden_states + self.dropout_path2( + hidden_states + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm hidden_states, channel_attn_weights, _ = self.self_attn( - hidden_states=hidden_states, - output_attentions=output_attentions + hidden_states=hidden_states, output_attentions=output_attentions ) hidden_states = self.norm_sublayer2( hidden_states + self.dropout_path2(hidden_states) ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] hidden_states = ( - hidden_states.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() + hidden_states.reshape(batch_size, sequence_length, num_input_channels, d_model) + .transpose(1, 2) + .contiguous() ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden @@ -834,7 +835,9 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) _, num_input_channels, _, _ = past_values.shape @@ -874,18 +877,14 @@ def forward( output_attentions=output_attentions, ) # get hidden state - hidden_states = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] - # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + hidden_states = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] + # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token # append layer attention if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return BaseModelOutput( - last_hidden_state=past_values, - hidden_states=encoder_states, - attentions=all_attentions - ) + return BaseModelOutput(last_hidden_state=past_values, hidden_states=encoder_states, attentions=all_attentions) PATCHTST_START_DOCSTRING = r""" @@ -1358,7 +1357,6 @@ def forward( output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1374,10 +1372,9 @@ def forward( masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None - encoder_output = self.encoder(masked_values, - output_hidden_states=output_hidden_states, 
- output_attentions=output_attentions - ) + encoder_output = self.encoder( + masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions + ) hidden_states = encoder_output.last_hidden_state encoder_states = encoder_output.hidden_states From 8972a920eb504b00e84d56e3657b0a2ff9f07f71 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 11:39:31 -0400 Subject: [PATCH 148/189] Add attention output option to all classes --- .../models/patchtst/modeling_patchtst.py | 106 ++++++++++++------ 1 file changed, 74 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a572b80dd7fa28..c61b47ae08e538 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -812,7 +812,6 @@ def __init__(self, config: PatchTSTConfig): ) # Encoder - # self.encoder = PatchTSTEncoderBlock(config) self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) # Initialize weights and apply final processing @@ -884,7 +883,9 @@ def forward( all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return BaseModelOutput(last_hidden_state=past_values, hidden_states=encoder_states, attentions=all_attentions) + return BaseModelOutput(last_hidden_state=past_values, + hidden_states=encoder_states, + attentions=all_attentions) PATCHTST_START_DOCSTRING = r""" @@ -936,7 +937,7 @@ def forward( @dataclass -class PatchTSTModelOutputWithNoAttention(ModelOutput): +class PatchTSTModelOutput(ModelOutput): """ Base class for model's outputs, with potential hidden states. @@ -959,6 +960,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): last_hidden_state: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None patched_input: torch.FloatTensor = None mask: torch.FloatTensor = None loc: torch.FloatTensor = None @@ -1356,7 +1358,7 @@ def forward( output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: + ) -> Union[Tuple, PatchTSTModelOutput]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1372,18 +1374,21 @@ def forward( masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None + encoder_output = self.encoder( - masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions + past_values=masked_values, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) - hidden_states = encoder_output.last_hidden_state - encoder_states = encoder_output.hidden_states - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, patched_values, mask, loc, scale] if v is not None) - return PatchTSTModelOutputWithNoAttention( - last_hidden_state=hidden_states, - hidden_states=encoder_states, + outputs = (encoder_output.last_hidden_state, encoder_output.hidden_states, encoder_output.attentions) + outputs = outputs + (patched_values, mask, loc, scale) + return tuple(v for v in outputs if v is not None) + return PatchTSTModelOutput( + last_hidden_state=encoder_output.last_hidden_state, + 
hidden_states=encoder_output.hidden_states, + attentions=encoder_output.attentions, patched_input=patched_values, mask=mask, loc=loc, @@ -1439,6 +1444,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ @@ -1466,7 +1472,10 @@ def forward( # past_values: [bs x num_channels x num_patches x d_model] or # [bs x num_channels x (num_patches+1) x d_model] if use cls_token model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) # model_output[0]: [bs x num_channels x num_patches x patch_length] or @@ -1480,8 +1489,13 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: - return tuple(v for v in [masked_loss, x_hat, encoder_states] if v is not None) - return PatchTSTForPretrainingOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states) + outputs = (masked_loss, x_hat, model_output.hidden_states, model_output.attentions) + return tuple(v for v in outputs if v is not None) + return PatchTSTForPretrainingOutput(loss=masked_loss, + prediction_output=x_hat, + hidden_states=encoder_states, + attentions=model_output.attentions + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1504,6 +1518,7 @@ def forward( labels: torch.Tensor = None, past_observed_mask: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: """ @@ -1530,7 +1545,10 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) y_hat = self.head(model_output[0]) @@ -1539,10 +1557,14 @@ def forward( loss = nn.CrossEntropyLoss() loss_val = loss(y_hat, labels) - encoder_states = model_output.hidden_states if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForClassificationOutput(loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states) + outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) + return tuple(v for v in outputs if v is not None) + return PatchTSTForClassificationOutput(loss=loss_val, + prediction_logits=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions + ) class PatchTSTClassificationHead(nn.Module): @@ -1664,6 +1686,7 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: """ @@ -1692,7 +1715,10 @@ def forward( # get model output model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + 
past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape @@ -1709,10 +1735,14 @@ def forward( loss = nn.MSELoss(reduction="mean") loss_val = loss(y_hat, future_values) - encoder_states = model_output.hidden_states if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForPredictionOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) + outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) + return tuple(v for v in outputs if v is not None) + return PatchTSTForPredictionOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions + ) def generate( self, @@ -1872,6 +1902,7 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForForecastingOutput]: """ @@ -1899,7 +1930,10 @@ def forward( # get model output model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) # get output head y_hat = self.head(model_output.last_hidden_state) @@ -1922,16 +1956,17 @@ def forward( loss = nn.MSELoss(reduction="mean") loss_val = loss(y_hat, future_values) - encoder_states = model_output.hidden_states loc = model_output.loc scale = model_output.scale if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states, loc, scale] if v is not None) + outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions, loc, scale) + return tuple(v for v in outputs if v is not None) return PatchTSTForForecastingOutput( loss=loss_val, forecast_outputs=y_hat, - hidden_states=encoder_states, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, loc=loc, scale=scale, ) @@ -2067,6 +2102,7 @@ def forward( target_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForRegressionOutput]: """ @@ -2093,7 +2129,10 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) # get output head. 
y_hat is of shape [bs x num_output_channels] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) @@ -2109,11 +2148,14 @@ def forward( loss = nn.MSELoss(reduction="mean") loss_val = loss(y_hat, target_values) - encoder_states = model_output.hidden_states - if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForRegressionOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) + outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) + return tuple(v for v in outputs if v is not None) + return PatchTSTForRegressionOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions + ) def generate( self, From a2ff8ef8b63c0ee3e3d391bbded4723ceaf02657 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 12:36:21 -0400 Subject: [PATCH 149/189] Remove PatchTSTEncoderBlock --- .../models/patchtst/modeling_patchtst.py | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c61b47ae08e538..1130d62fa34d7b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -569,41 +569,6 @@ def forward(self, patch_input: torch.Tensor): return masked_input, mask -class PatchTSTEncoderBlock(nn.Module): - """ - PatchTST encoder block - """ - - def __init__(self, config: PatchTSTConfig): - super().__init__() - - self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - - def forward(self, hidden_state: torch.Tensor, output_hidden_states: bool = False): - """ - Parameters: - hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): - Past values of the time series - output_hidden_states (`bool`, *optional*, default to False): - output hidden state option - Return: - hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`) - - all_hidden_states (*optional*, returned when `output_hidden_states` is set to True, tuple of `torch.Tensor` - of shapes `(batch_size, num_channels, sequence_length, d_model)`) - - """ - all_hidden_states = [] - - for mod in self.layers: - hidden_state = mod(hidden_state) - if output_hidden_states: - all_hidden_states.append(hidden_state) - if output_hidden_states is False: - return hidden_state, None - return hidden_state, all_hidden_states - - class PatchTSTEncoderLayer(nn.Module): """ PatchTST encoder layer From d11ea0eab442d33ad6b4f9cce7f69ed3b6425e46 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 16:50:38 -0400 Subject: [PATCH 150/189] create PatchTSTEmbedding class --- .../models/patchtst/modeling_patchtst.py | 158 ++++++++++-------- 1 file changed, 88 insertions(+), 70 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 1130d62fa34d7b..d2ea7d153069df 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -573,7 +573,6 @@ class PatchTSTEncoderLayer(nn.Module): """ PatchTST encoder layer """ - def __init__(self, config: PatchTSTConfig): super().__init__() @@ -618,7 +617,9 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, hidden_states: 
torch.Tensor, output_attentions: Optional[bool] = None): + def forward(self, + hidden_state: torch.Tensor, + output_attentions: Optional[bool] = None): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -627,31 +628,33 @@ def forward(self, hidden_states: torch.Tensor, output_attentions: Optional[bool] `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ - batch_size, num_input_channels, sequence_length, d_model = hidden_states.shape + batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape # First sublayer: attention across time - hidden_states = hidden_states.view( + hidden_state = hidden_state.view( batch_size * num_input_channels, sequence_length, d_model ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - hidden_states, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_states), output_attentions=output_attentions + attn_output, attn_weights, _ = self.self_attn( + hidden_states=self.norm_sublayer1(hidden_state), + output_attentions=output_attentions ) - hidden_states = hidden_states + self.dropout_path1( - hidden_states + hidden_state = hidden_state + self.dropout_path1( + attn_output ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - hidden_states, attn_weights, _ = self.self_attn( - hidden_states=hidden_states, output_attentions=output_attentions + attn_output, attn_weights, _ = self.self_attn( + hidden_states=hidden_state, + output_attentions=output_attentions ) - hidden_states = self.norm_sublayer1( - hidden_states + self.dropout_path1(hidden_states) + hidden_state = self.norm_sublayer1( + hidden_state + self.dropout_path1(attn_output) ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] - hidden_states = hidden_states.reshape( + hidden_state = hidden_state.reshape( batch_size, num_input_channels, sequence_length, d_model ) # [bs x num_channels x sequence_length x d_model] @@ -659,54 +662,55 @@ def forward(self, hidden_states: torch.Tensor, output_attentions: Optional[bool] # [bs x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] # -> [(bs*sequence_length) x num_channels x d_model] if self.channel_attention: - hidden_states = ( - hidden_states.transpose(2, 1) + hidden_state = ( + hidden_state.transpose(2, 1) .contiguous() .view(batch_size * sequence_length, num_input_channels, d_model) ) # [(bs*sequence_length) x num_channels x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - hidden_states, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_states), output_attentions=output_attentions + attn_output, channel_attn_weights, _ = self.self_attn( + hidden_states=self.norm_sublayer2(hidden_state), + output_attentions=output_attentions ) - hidden_states = hidden_states + self.dropout_path2( - hidden_states + hidden_state = hidden_state + self.dropout_path2( + attn_output ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - hidden_states, channel_attn_weights, _ = self.self_attn( - hidden_states=hidden_states, output_attentions=output_attentions + attn_output, channel_attn_weights, _ = self.self_attn( + hidden_states=hidden_state, 
output_attentions=output_attentions ) - hidden_states = self.norm_sublayer2( - hidden_states + self.dropout_path2(hidden_states) + hidden_state = self.norm_sublayer2( + hidden_state + self.dropout_path2(attn_output) ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] - hidden_states = ( - hidden_states.reshape(batch_size, sequence_length, num_input_channels, d_model) + hidden_state = ( + hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model) .transpose(1, 2) .contiguous() ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden - hidden_states = hidden_states.view( + hidden_state = hidden_state.view( batch_size * num_input_channels, sequence_length, d_model ) # src: [(batch_size*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection - hidden_states = hidden_states + self.dropout_path3( - self.ff(self.norm_sublayer3(hidden_states)) + hidden_state = hidden_state + self.dropout_path3( + self.ff(self.norm_sublayer3(hidden_state)) ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - hidden_states = self.norm_sublayer3( - hidden_states + self.dropout_path3(self.ff(hidden_states)) + hidden_state = self.norm_sublayer3( + hidden_state + self.dropout_path3(self.ff(hidden_state)) ) # Add: residual connection with residual dropout - hidden_states = hidden_states.reshape( + hidden_state = hidden_state.reshape( batch_size, num_input_channels, sequence_length, d_model ) # [bs x num_channels x sequence_length x d_model] - outputs = (hidden_states,) + outputs = (hidden_state,) if output_attentions: outputs += (attn_weights, channel_attn_weights) @@ -737,6 +741,35 @@ def _set_gradient_checkpointing(self, module, value=False): module.gradient_checkpointing = value +class PatchTSTEmbedding(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + # Input encoding: projection of feature vectors onto a d-dim vector space + if not config.shared_embedding: + self.input_embedding = nn.ModuleList() + for _ in range(config.num_input_channels): + self.input_embedding.append(nn.Linear(config.patch_length, config.d_model)) + else: + self.input_embedding = nn.Linear(config.patch_length, config.d_model) + + def forward(self, patch_input: torch.Tensor): + """ + Parameters: + patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): + Patch input for embedding + return: + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` + """ + # Input encoding + num_input_channels = patch_input.shape[1] + if isinstance(self.input_embedding, nn.ModuleList): + embeddings = [ self.input_embedding[i](patch_input[:, i, :, :]) for i in range(num_input_channels)] + embeddings = torch.stack(embeddings, dim=1) + else: + embeddings = self.input_embedding(patch_input) # x: [bs x num_channels x num_patches x d_model] + return embeddings + + class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder @@ -752,13 +785,8 @@ def __init__(self, config: PatchTSTConfig): self.use_cls_token = config.use_cls_token self.gradient_checkpointing = False - # Input encoding: projection of feature vectors onto a d-dim vector space - if not config.shared_embedding: - self.input_embedding = nn.ModuleList() - for _ in range(self.num_input_channels): - self.input_embedding.append(nn.Linear(config.patch_length, config.d_model)) - else: - 
self.input_embedding = nn.Linear(config.patch_length, config.d_model) + # Input embedding: projection of feature vectors onto a d-dim vector space + self.embedder = PatchTSTEmbedding(config) # Positional encoding if config.use_cls_token: @@ -784,13 +812,13 @@ def __init__(self, config: PatchTSTConfig): def forward( self, - past_values: torch.Tensor, + patch_input: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, ) -> BaseModelOutput: """ Parameters: - past_values (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): + patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Past values of the time series output_hidden_states (bool, optional): Indicates if hidden states should be outputted. output_attentions (bool, optional): Indicates if attentions should be outputted. @@ -799,34 +827,23 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - _, num_input_channels, _, _ = past_values.shape - - # Input encoding - if not self.shared_embedding: - x_out = [] - for i in range(num_input_channels): - z = self.input_embedding[i](past_values[:, i, :, :]) - x_out.append(z) - past_values = torch.stack(x_out, dim=1) - else: - past_values = self.input_embedding(past_values) # x: [bs x num_channels x num_patches x d_model] + # Input embedding + patch_input = self.embedder(patch_input) if self.use_cls_token: # x: [bs x num_channels x num_patches x d_model] - past_values = self.positional_dropout(past_values + self.position_enc[1:, :]) + patch_input = self.positional_dropout(patch_input + self.position_enc[1:, :]) # append cls token cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] - cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - hidden_states = torch.cat( - (cls_tokens, past_values), dim=1 + cls_tokens = cls_token.expand(patch_input.shape[0], -1, -1) # get the same copy for all the batch samples + hidden_state = torch.cat( + (cls_tokens, patch_input), dim=1 ) # x: [bs x num_channels x (num_patches+1) x d_model] else: - hidden_states = self.positional_dropout( - past_values + self.position_enc + hidden_state = self.positional_dropout( + patch_input + self.position_enc ) # x: [bs x num_channels x num_patches x d_model] encoder_states = () if output_hidden_states else None @@ -834,21 +851,21 @@ def forward( for encoder_layer in self.layers: if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + encoder_states = encoder_states + (hidden_state,) layer_outputs = encoder_layer( - hidden_states=hidden_states, + hidden_state=hidden_state, output_attentions=output_attentions, ) # get hidden state - hidden_states = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] + hidden_state = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token # append layer attention if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return 
BaseModelOutput(last_hidden_state=past_values, + return BaseModelOutput(last_hidden_state=hidden_state, hidden_states=encoder_states, attentions=all_attentions) @@ -913,7 +930,7 @@ class PatchTSTModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): + patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) Bool masked tensor indicating which patches are masked @@ -926,7 +943,7 @@ class PatchTSTModelOutput(ModelOutput): last_hidden_state: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - patched_input: torch.FloatTensor = None + patch_input: torch.FloatTensor = None mask: torch.FloatTensor = None loc: torch.FloatTensor = None scale: torch.FloatTensor = None @@ -1341,7 +1358,7 @@ def forward( masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder( - past_values=masked_values, + patch_input=masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions ) @@ -1350,11 +1367,12 @@ def forward( outputs = (encoder_output.last_hidden_state, encoder_output.hidden_states, encoder_output.attentions) outputs = outputs + (patched_values, mask, loc, scale) return tuple(v for v in outputs if v is not None) + return PatchTSTModelOutput( last_hidden_state=encoder_output.last_hidden_state, hidden_states=encoder_output.hidden_states, attentions=encoder_output.attentions, - patched_input=patched_values, + patch_input=patched_values, mask=mask, loc=loc, scale=scale, @@ -1449,7 +1467,7 @@ def forward( # calculate masked_loss loss = nn.MSELoss(reduction="none") - loss_val = loss(x_hat, model_output.patched_input) + loss_val = loss(x_hat, model_output.patch_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) encoder_states = model_output.hidden_states From 93b88cfb36d8fed74e84d79212c802eedeb0b273 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 22:21:37 -0400 Subject: [PATCH 151/189] use config in PatchTSTPatchify --- .../models/patchtst/modeling_patchtst.py | 40 ++++++------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d2ea7d153069df..f2b731778683a2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -423,36 +423,26 @@ class PatchTSTPatchify(nn.Module): """ A class to patchify the time series sequence into different patches - Parameters: - sequence_length (`int`, *required*): input sequence length. - patch_length (`int`, *required*): patch length. - stride (`int`, *required*): stride between patches. 
- Returns: `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` """ - def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - ): + def __init__(self, config: PatchTSTConfig): super().__init__() - if sequence_length <= patch_length: + self.sequence_length = config.context_length + self.patch_length = config.patch_length + self.stride = config.stride + + if self.sequence_length <= self.patch_length: raise ValueError( - f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" + f"Sequence length ({self.sequence_length}) has to be greater than the patch length ({self.patch_length})" ) - self.sequence_length = sequence_length - self.patch_length = patch_length - self.stride = stride - # get the number of patches - num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1 - new_sequence_length = patch_length + stride * (num_patches - 1) - self.s_begin = sequence_length - new_sequence_length + num_patches = (max(self.sequence_length, self.patch_length) - self.patch_length) // self.stride + 1 + new_sequence_length = self.patch_length + self.stride * (num_patches - 1) + self.sequence_start = self.sequence_length - new_sequence_length def forward(self, past_values: torch.Tensor): """ @@ -469,7 +459,7 @@ def forward(self, past_values: torch.Tensor): f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." ) - output = past_values[:, self.s_begin :, :] # output: [bs x new_sequence_length x num_channels] + output = past_values[:, self.sequence_start :, :] # output: [bs x new_sequence_length x num_channels] output = output.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # output: [bs x num_patches x num_input_channels x patch_length] @@ -1307,11 +1297,7 @@ def __init__(self, config: PatchTSTConfig): else: self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) - self.patching = PatchTSTPatchify( - config.context_length, - patch_length=config.patch_length, - stride=config.stride, - ) + self.patchifier = PatchTSTPatchify(config) self.mask_input = config.mask_input if self.mask_input: @@ -1351,7 +1337,7 @@ def forward( scaled_past_values, loc, scale = self.scaler(past_values, past_observed_mask) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain - patched_values = self.patching(scaled_past_values) + patched_values = self.patchifier(scaled_past_values) if self.mask_input: masked_values, mask = self.masking(patched_values) else: From 8175505fa97d98d4da5dd880718b18db37ffa021 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 22:25:14 -0400 Subject: [PATCH 152/189] Use config in PatchTSTMasking class --- .../models/patchtst/modeling_patchtst.py | 50 ++++--------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f2b731778683a2..f9fd23a93c47cf 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -474,19 +474,7 @@ class PatchTSTMasking(nn.Module): Class to perform random or forecast masking. Parameters: - mask_type (`str`, *optional*): Masking type. Allowed values are random, forecast. Defaults to random. - random_mask_ratio (`float`, *optional*): Mask ratio for random pretraining. - forecast_mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. 
- forecast_mask_ratios (`list`, *optional*): - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and - forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (`list`, *optional*): - Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. - channel_consistent_masking (`bool`, *optional*): - When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to True. - mask_value (`int`, *optional*): Value to use for masking. Defaults to 0. - seed_number (`int`, *optional*): Random seed, when None seed is not set. Defaults to None. + config (`PatchTSTConfig`): model config Returns: x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) @@ -498,26 +486,19 @@ class PatchTSTMasking(nn.Module): def __init__( self, - mask_type: str = "random", - random_mask_ratio: float = 0.5, - forecast_mask_patches: list = [2, 3], - forecast_mask_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value: int = 0, - seed_number: Optional[int] = None, + config: PatchTSTConfig ): super().__init__() - self.random_mask_ratio = random_mask_ratio - self.channel_consistent_masking = channel_consistent_masking - self.mask_type = mask_type - self.forecast_mask_patches = forecast_mask_patches - self.forecast_mask_ratios = forecast_mask_ratios - self.unmasked_channel_indices = unmasked_channel_indices - self.mask_value = mask_value + self.random_mask_ratio = config.random_mask_ratio + self.channel_consistent_masking = config.channel_consistent_masking + self.mask_type = config.mask_type + self.forecast_mask_patches = config.forecast_mask_patches + self.forecast_mask_ratios = config.forecast_mask_ratios + self.unmasked_channel_indices = config.unmasked_channel_indices + self.mask_value = config.mask_value if self.unmasked_channel_indices is not None: self.unmasked_channel_indices.sort() - self.seed_number = seed_number + self.seed_number = config.seed_number def forward(self, patch_input: torch.Tensor): """ @@ -1301,16 +1282,7 @@ def __init__(self, config: PatchTSTConfig): self.mask_input = config.mask_input if self.mask_input: - self.masking = PatchTSTMasking( - mask_type=config.mask_type, - random_mask_ratio=config.random_mask_ratio, - forecast_mask_patches=config.forecast_mask_patches, - forecast_mask_ratios=config.forecast_mask_ratios, - channel_consistent_masking=config.channel_consistent_masking, - unmasked_channel_indices=config.unmasked_channel_indices, - mask_value=config.mask_value, - seed_number=config.seed_number, - ) + self.masking = PatchTSTMasking(config) else: self.masking = nn.Identity() self.encoder = PatchTSTEncoder(config) From 63684aa5999912355a10c7d6e9effa606eca63cc Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 22:37:57 -0400 Subject: [PATCH 153/189] add channel_attn_weights --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f9fd23a93c47cf..99c92dc4610c88 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -684,7 +684,7 @@ def forward(self, outputs = (hidden_state,) if output_attentions: - outputs += (attn_weights, 
channel_attn_weights) + outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights, ) return outputs From e8faa8b3cc41c668de71e4ed0796b6befd50b35d Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 22:52:36 -0400 Subject: [PATCH 154/189] Add PatchTSTScaler class --- .../models/patchtst/modeling_patchtst.py | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 99c92dc4610c88..4d2bfe1e4367b3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1170,7 +1170,9 @@ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5) self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, weights: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator @@ -1198,7 +1200,8 @@ class PatchTSTMeanScaler(nn.Module): """ def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 + self, dim: int = -1, keepdim: bool = True, + default_scale: Optional[float] = None, minimum_scale: float = 1e-10 ): super().__init__() self.dim = dim @@ -1207,9 +1210,9 @@ def __init__( self.default_scale = default_scale @torch.no_grad() - def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # shape: (N, [C], T=1) ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -1263,14 +1266,9 @@ def forward( return data, loc, scale -@add_start_docstrings( - "The bare PatchTST Model outputting raw hidden-states without any specific head.", - PATCHTST_START_DOCSTRING, -) -class PatchTSTModel(PatchTSTPreTrainedModel): +class PatchTSTScaler(nn.Module): def __init__(self, config: PatchTSTConfig): - super().__init__(config) - + super().__init__() if config.scaling == "mean" or config.scaling is True: self.scaler = PatchTSTMeanScaler(dim=1, keepdim=True) elif config.scaling == "std": @@ -1278,6 +1276,22 @@ def __init__(self, config: PatchTSTConfig): else: self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) + def forward(self, + data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + data, loc, scale = self.scaler(data, observed_indicator) + return data, loc, scale + + +@add_start_docstrings( + "The bare PatchTST Model outputting raw hidden-states without any specific head.", + PATCHTST_START_DOCSTRING, +) +class PatchTSTModel(PatchTSTPreTrainedModel): + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.scaler = PatchTSTScaler(config) self.patchifier = PatchTSTPatchify(config) self.mask_input = config.mask_input From 6389fbfae198f9c418095acf23a7fb3998e8086f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 26 Oct 2023 00:06:45 -0400 Subject: [PATCH 155/189] add output_attentions arg to test 
function --- tests/models/patchtst/test_modeling_patchtst.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 496cfc8301aa86..68ba5030b35430 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -290,6 +290,7 @@ def test_forward_signature(self): expected_arg_names.extend( [ "output_hidden_states", + "output_attentions", "return_dict", ] ) From dd5e25d9a020f94f3366766dc24739c08c288e71 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 26 Oct 2023 09:29:27 +0200 Subject: [PATCH 156/189] format --- .../models/patchtst/modeling_patchtst.py | 105 ++++++++---------- 1 file changed, 47 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4d2bfe1e4367b3..03dcf8e309bd75 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -484,10 +484,7 @@ class PatchTSTMasking(nn.Module): """ - def __init__( - self, - config: PatchTSTConfig - ): + def __init__(self, config: PatchTSTConfig): super().__init__() self.random_mask_ratio = config.random_mask_ratio self.channel_consistent_masking = config.channel_consistent_masking @@ -544,6 +541,7 @@ class PatchTSTEncoderLayer(nn.Module): """ PatchTST encoder layer """ + def __init__(self, config: PatchTSTConfig): super().__init__() @@ -588,9 +586,7 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, - hidden_state: torch.Tensor, - output_attentions: Optional[bool] = None): + def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] = None): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -609,8 +605,7 @@ def forward(self, if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_state), - output_attentions=output_attentions + hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions ) hidden_state = hidden_state + self.dropout_path1( attn_output @@ -618,8 +613,7 @@ def forward(self, else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT attn_output, attn_weights, _ = self.self_attn( - hidden_states=hidden_state, - output_attentions=output_attentions + hidden_states=hidden_state, output_attentions=output_attentions ) hidden_state = self.norm_sublayer1( hidden_state + self.dropout_path1(attn_output) @@ -641,8 +635,7 @@ def forward(self, if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_state), - output_attentions=output_attentions + hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions ) hidden_state = hidden_state + self.dropout_path2( attn_output @@ -684,7 +677,7 @@ def forward(self, outputs = (hidden_state,) if output_attentions: - outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights, ) + outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights,) return outputs @@ -734,7 +727,7 @@ def forward(self, patch_input: torch.Tensor): # Input encoding num_input_channels = 
patch_input.shape[1] if isinstance(self.input_embedding, nn.ModuleList): - embeddings = [ self.input_embedding[i](patch_input[:, i, :, :]) for i in range(num_input_channels)] + embeddings = [self.input_embedding[i](patch_input[:, i, :, :]) for i in range(num_input_channels)] embeddings = torch.stack(embeddings, dim=1) else: embeddings = self.input_embedding(patch_input) # x: [bs x num_channels x num_patches x d_model] @@ -798,7 +791,9 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) # Input embedding patch_input = self.embedder(patch_input) @@ -836,9 +831,7 @@ def forward( all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return BaseModelOutput(last_hidden_state=hidden_state, - hidden_states=encoder_states, - attentions=all_attentions) + return BaseModelOutput(last_hidden_state=hidden_state, hidden_states=encoder_states, attentions=all_attentions) PATCHTST_START_DOCSTRING = r""" @@ -1170,9 +1163,7 @@ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5) self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, - data: torch.Tensor, weights: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator @@ -1200,8 +1191,7 @@ class PatchTSTMeanScaler(nn.Module): """ def __init__( - self, dim: int = -1, keepdim: bool = True, - default_scale: Optional[float] = None, minimum_scale: float = 1e-10 + self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 ): super().__init__() self.dim = dim @@ -1210,9 +1200,9 @@ def __init__( self.default_scale = default_scale @torch.no_grad() - def forward(self, - data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # shape: (N, [C], T=1) ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -1276,9 +1266,9 @@ def __init__(self, config: PatchTSTConfig): else: self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) - def forward(self, - data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: data, loc, scale = self.scaler(data, observed_indicator) return data, loc, scale @@ -1330,9 +1320,7 @@ def forward( masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder( - patch_input=masked_values, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions + patch_input=masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions ) if not return_dict: @@ -1430,7 +1418,7 @@ def 
forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) # model_output[0]: [bs x num_channels x num_patches x patch_length] or @@ -1446,11 +1434,9 @@ def forward( if not return_dict: outputs = (masked_loss, x_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForPretrainingOutput(loss=masked_loss, - prediction_output=x_hat, - hidden_states=encoder_states, - attentions=model_output.attentions - ) + return PatchTSTForPretrainingOutput( + loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states, attentions=model_output.attentions + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1503,7 +1489,7 @@ def forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) y_hat = self.head(model_output[0]) @@ -1515,11 +1501,12 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForClassificationOutput(loss=loss_val, - prediction_logits=y_hat, - hidden_states=model_output.hidden_states, - attentions=model_output.attentions - ) + return PatchTSTForClassificationOutput( + loss=loss_val, + prediction_logits=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, + ) class PatchTSTClassificationHead(nn.Module): @@ -1673,7 +1660,7 @@ def forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape @@ -1693,11 +1680,12 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForPredictionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states, - attentions=model_output.attentions - ) + return PatchTSTForPredictionOutput( + loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, + ) def generate( self, @@ -1888,7 +1876,7 @@ def forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) # get output head y_hat = self.head(model_output.last_hidden_state) @@ -2087,7 +2075,7 @@ def forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) # get output head. 
y_hat is of shape [bs x num_output_channels] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) @@ -2106,11 +2094,12 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForRegressionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states, - attentions=model_output.attentions - ) + return PatchTSTForRegressionOutput( + loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, + ) def generate( self, From b07c55f3f7a12f807edec76d8c07058b0284b17f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 27 Oct 2023 21:52:24 +0200 Subject: [PATCH 157/189] Update doc with image patchtst.md --- docs/source/en/model_doc/patchtst.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 88094385c1500d..ba4b5e27636056 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -28,6 +28,11 @@ Tips: The model can also be used for time series classification and time series regression. See the respective [`PatchTSTForClassification`] and [`PatchTSTForRegression`] classes. +At a high level the model vectorizes time series into patches of a given size and encodes them via a Transformer which then outputs the prediction length forecasts: + +![model](https://github.com/namctin/transformers/assets/8100/150af169-29de-419a-8d98-eb78251c21fa) + + This model was contributed by [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), and [kashif](https://huggingface.co/kashif). The original code can be found [here](https://github.com/yuqinie98/PatchTST). @@ -71,4 +76,4 @@ The original code can be found [here](https://github.com/yuqinie98/PatchTST). 
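A minimal usage sketch of the forecasting path (the configuration values and tensor shapes below are illustrative placeholders, not settings prescribed by this patch series; the exact name of the forecast field on the returned output changes across later commits in this series, so it is not pinned down here):

```python
import torch

from transformers import PatchTSTConfig, PatchTSTForPrediction

# hypothetical configuration: 7 input channels, a 512-step context window,
# non-overlapping patches of length 16, and a 96-step forecast horizon
config = PatchTSTConfig(
    num_input_channels=7,
    context_length=512,
    patch_length=16,
    stride=16,
    prediction_length=96,
)
model = PatchTSTForPrediction(config)

# dummy batch of shape (batch_size, context_length, num_input_channels)
past_values = torch.randn(4, config.context_length, config.num_input_channels)

# the context window is split into (context_length - patch_length) // stride + 1 = 32 patches,
# each patch is embedded to d_model, run through the Transformer encoder, and the head
# maps the encoded patches to a prediction_length-step forecast carried on the output object
outputs = model(past_values=past_values)
```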
## PatchTSTForRegression [[autodoc]] PatchTSTForRegression - - forward \ No newline at end of file + - forward From 546f3e2db3bcba84727bc5d9fba4aaea920e766f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 6 Nov 2023 13:20:44 +0100 Subject: [PATCH 158/189] fix-copies --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 03dcf8e309bd75..d9c1371cc2dcd3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -51,12 +51,15 @@ def __init__( dropout: float = 0.0, is_decoder: bool = False, bias: bool = True, + is_causal: bool = False, + config: Optional[PatchTSTConfig] = None, ): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads + self.config = config if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -65,6 +68,7 @@ def __init__( ) self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder + self.is_causal = is_causal self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) From e7c687e7a37d2f2f4e27a0f5ec93a8444a656c36 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 6 Nov 2023 19:25:57 +0100 Subject: [PATCH 159/189] rename Forecast <-> Prediction --- .../models/patchtst/modeling_patchtst.py | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d9c1371cc2dcd3..63352c121d556c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -947,15 +947,15 @@ class PatchTSTForPretrainingOutput(ModelOutput): @dataclass -class PatchTSTForPredictionOutput(ModelOutput): +class PatchTSTForForecastingOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForForecastingtion`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction outputs of the time series modeling heads. + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length,)`): + Forecast outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -970,7 +970,7 @@ class PatchTSTForPredictionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_output: torch.FloatTensor = None + forecast_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -983,7 +983,7 @@ class PatchTSTForRegressionOutput(ModelOutput): Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. 
- prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of @@ -999,22 +999,22 @@ class PatchTSTForRegressionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_output: torch.FloatTensor = None + forecast_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @dataclass -class PatchTSTForForecastingOutput(ModelOutput): +class PatchTSTForPredictionOutput(ModelOutput): """ - Output type of [`PatchTSTForForecasting`]. + Output type of [`PatchTSTForPrediction`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. - forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Forecasting outputs of the time series modeling heads. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, -1)`): + Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of @@ -1030,7 +1030,7 @@ class PatchTSTForForecastingOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - forecast_outputs: torch.FloatTensor = None + prediction_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None loc: torch.FloatTensor = None @@ -1546,7 +1546,7 @@ def forward(self, embedding: torch.Tensor): return y -class PatchTSTPredictionHead(nn.Module): +class PatchTSTForecastHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1598,7 +1598,7 @@ def forward(self, embedding: torch.Tensor): return y -class PatchTSTForPrediction(PatchTSTPreTrainedModel): +class PatchTSTForForecasting(PatchTSTPreTrainedModel): """ PatchTST model for prediction. The model contains PatchTST model + prediction head """ @@ -1621,7 +1621,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = PatchTSTPredictionHead(config, self.distribution_output) + self.head = PatchTSTForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1634,7 +1634,7 @@ def forward( output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = True, - ) -> Union[Tuple, PatchTSTForPredictionOutput]: + ) -> Union[Tuple, PatchTSTForForecastingOutput]: """ Parameters: past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): @@ -1652,7 +1652,7 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: - `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) """ @@ -1684,9 +1684,9 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForPredictionOutput( + return PatchTSTForForecastingOutput( loss=loss_val, - prediction_output=y_hat, + forecast_outputs=y_hat, hidden_states=model_output.hidden_states, attentions=model_output.attentions, ) @@ -1726,7 +1726,7 @@ def generate( ) # get distribution - distribution = self.distribution_output.distribution(outputs.prediction_output) + distribution = self.distribution_output.distribution(outputs.forecast_outputs) # get samples samples = [ distribution.sample() for _ in range(num_parallel_samples) @@ -1736,7 +1736,7 @@ def generate( return SamplePatchTSTPredictionOutput(sequences=samples) -class PatchTSTForecastHead(nn.Module): +class PatchTSTPredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1817,7 +1817,7 @@ def forward(self, embedding: torch.Tensor): return output -class PatchTSTForForecasting(PatchTSTPreTrainedModel): +class PatchTSTForPrediction(PatchTSTPreTrainedModel): """ PatchTST for forecasting. The model contains PatchTST model + Forecasting head """ @@ -1838,7 +1838,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = PatchTSTForecastHead(config, self.distribution_output) + self.head = PatchTSTPredictionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1851,7 +1851,7 @@ def forward( output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, PatchTSTForForecastingOutput]: + ) -> Union[Tuple, PatchTSTForPredictionOutput]: """ Parameters: past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): @@ -1869,7 +1869,7 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
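For example, supplying `future_values` makes the head output be scored with the configured loss. A sketch under stated assumptions: the config fields below are only those read elsewhere in this file, `loss="mse"` is passed explicitly rather than relying on an unseen default, and the channel count of `future_values` is assumed to match the model's input channels:

```python
import torch

from transformers import PatchTSTConfig, PatchTSTForPrediction

config = PatchTSTConfig(
    num_input_channels=7,
    context_length=512,
    patch_length=16,
    stride=16,
    prediction_length=96,
    loss="mse",  # assumption: with "mse" no distribution head is built and nn.MSELoss is used
)
model = PatchTSTForPrediction(config)

past_values = torch.randn(4, config.context_length, config.num_input_channels)
# assumed to have the same channel count as the inputs for this sketch
future_values = torch.randn(4, config.prediction_length, config.num_input_channels)

outputs = model(past_values=past_values, future_values=future_values)
outputs.loss.backward()  # mean MSE between the forecasts and future_values
```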
Returns: - `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) """ @@ -1909,9 +1909,9 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions, loc, scale) return tuple(v for v in outputs if v is not None) - return PatchTSTForForecastingOutput( + return PatchTSTForPredictionOutput( loss=loss_val, - forecast_outputs=y_hat, + prediction_outputs=y_hat, hidden_states=model_output.hidden_states, attentions=model_output.attentions, loc=loc, @@ -1955,7 +1955,7 @@ def generate( # get distribution distribution = self.distribution_output.distribution( - outputs.forecast_outputs, loc=outputs.loc, scale=outputs.scale + outputs.prediction_outputs, loc=outputs.loc, scale=outputs.scale ) # get samples samples = [ @@ -2100,7 +2100,7 @@ def forward( return tuple(v for v in outputs if v is not None) return PatchTSTForRegressionOutput( loss=loss_val, - prediction_output=y_hat, + forecast_outputs=y_hat, hidden_states=model_output.hidden_states, attentions=model_output.attentions, ) @@ -2140,7 +2140,7 @@ def generate( ) # get distribution - distribution = self.distribution_output.distribution(outputs.prediction_output) + distribution = self.distribution_output.distribution(outputs.forecast_outputs) # get samples samples = [ distribution.sample() for _ in range(num_parallel_samples) From 609a9d35d5aee21876989031872a1033d2870c6d Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 14:27:47 -0500 Subject: [PATCH 160/189] change name of a few parameters to match with PatchTSMixer. --- .../models/patchtst/modeling_patchtst.py | 224 ++---------------- 1 file changed, 17 insertions(+), 207 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 63352c121d556c..e5a0af1e86d02b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1074,7 +1074,7 @@ class SamplePatchTSTPredictionOutput(ModelOutput): distribution. Parameters: - sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): + sequences `(batch_size, num_samples, prediction_length, num_targets)`): Sampled values from the chosen distribution. """ @@ -1103,7 +1103,7 @@ class SamplePatchTSTRegressionOutput(ModelOutput): distribution. Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_output_channels)` + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_targets)` Sampled values from the chosen distribution. 
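A shape-only sketch of how such a tensor is assembled (all numbers are placeholders; this mirrors the stack over `num_parallel_samples` distribution draws done in `generate`, without needing a trained model):

```python
import torch

batch_size, num_parallel_samples, num_targets = 4, 100, 7

# each draw from the output distribution has shape (batch_size, num_targets);
# stacking the draws on dim=1 yields the `sequences` tensor documented above
samples = torch.stack(
    [torch.randn(batch_size, num_targets) for _ in range(num_parallel_samples)], dim=1
)
print(samples.shape)  # torch.Size([4, 100, 7])
```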
""" @@ -1460,7 +1460,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - labels: torch.Tensor = None, + target_values: torch.Tensor = None, past_observed_mask: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -1470,7 +1470,7 @@ def forward( Parameters: past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): Input sequence to the model - labels (`torch.Tensor`, *optional*): labels associates with the `past_values` + target_values (`torch.Tensor`, *optional*): labels associates with the `past_values` past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in `[0, 1]`: @@ -1498,9 +1498,9 @@ def forward( y_hat = self.head(model_output[0]) loss_val = None - if labels is not None: + if target_values is not None: loss = nn.CrossEntropyLoss() - loss_val = loss(y_hat, labels) + loss_val = loss(y_hat, target_values) if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) @@ -1520,7 +1520,7 @@ def __init__(self, config: PatchTSTConfig): self.pooling = config.pooling self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_labels) + self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_targets) def forward(self, embedding: torch.Tensor): """ @@ -1529,7 +1529,7 @@ def forward(self, embedding: torch.Tensor): or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): Embedding from the model Returns: - `torch.Tensor` of shape `(bs, num_labels)` + `torch.Tensor` of shape `(bs, num_targets)` """ if self.use_cls_token: @@ -1546,196 +1546,6 @@ def forward(self, embedding: torch.Tensor): return y -class PatchTSTForecastHead(nn.Module): - def __init__(self, config: PatchTSTConfig, distribution_output=None): - super().__init__() - - self.num_output_channels = config.num_output_channels - self.use_cls_token = config.use_cls_token - self.pooling = config.pooling - - head_dim = config.num_input_channels * config.d_model - - self.flatten = nn.Flatten(start_dim=1) - self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - - if distribution_output is None: - self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - else: - self.projection = distribution_output.get_parameter_projection(head_dim) - - def forward(self, embedding: torch.Tensor): - """ - Parameters: - embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` - or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): - Embedding from the model - Returns: - `torch.Tensor` of shape `(bs, pred_len, num_output_channels)` - - """ - batch_size = embedding.shape[0] - if self.use_cls_token: - x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] - elif self.pooling == "mean": - x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] - elif self.pooling == "max": - x = embedding.max(dim=2) # x: [bs x num_channels x d_model] - else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") - - # flatten the input - x = 
self.dropout(self.flatten(x)) # x: bs x (num_channels * d_model) - # projection - y = self.projection(x) - # reshape y - if isinstance(y, tuple): # for distribution head - y = ( - z.reshape(batch_size, -1, self.num_output_channels) for z in y - ) # tuple of [bs x prediction_len x num_output_channels] - else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x prediction_len x num_output_channels] - return y - - -class PatchTSTForForecasting(PatchTSTPreTrainedModel): - """ - PatchTST model for prediction. The model contains PatchTST model + prediction head - """ - - def __init__(self, config: PatchTSTConfig): - super().__init__(config) - - self.model = PatchTSTModel(config) - if config.loss == "mse": - self.distribution_output = None - else: - if config.distribution_output == "student_t": - self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) - elif config.distribution_output == "normal": - self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) - elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput( - dim=config.prediction_length * config.num_output_channels - ) - else: - raise ValueError(f"Unknown distribution output {config.distribution_output}") - - self.head = PatchTSTForecastHead(config, self.distribution_output) - - # Initialize weights and apply final processing - self.post_init() - - def forward( - self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = True, - ) -> Union[Tuple, PatchTSTForForecastingOutput]: - """ - Parameters: - past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): - Input sequence to the model - past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): - Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected - in `[0, 1]`: - - - 1 for values that are **observed**, - - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): - future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers - return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. - - Returns: - `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or - `config.return_dict`=False) - - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # get model output - model_output = self.model( - past_values=past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - ) - - # get output head. 
y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape - y_hat = self.head(model_output.last_hidden_state) - - loss_val = None - if future_values is not None: - if self.distribution_output: - distribution = self.distribution_output.distribution(y_hat) - loss_val = nll(distribution, future_values) - # take average of the loss - loss_val = weighted_average(loss_val) - else: - loss = nn.MSELoss(reduction="mean") - loss_val = loss(y_hat, future_values) - - if not return_dict: - outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) - return tuple(v for v in outputs if v is not None) - return PatchTSTForForecastingOutput( - loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states, - attentions=model_output.attentions, - ) - - def generate( - self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTPredictionOutput: - """ - Generate sequences of sample predictions from a model with a probability distribution head. - - Args: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): - Past values of the time series that serves as context in order to predict the future. - - past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): - Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected - in `[0, 1]`: - - - 1 for values that are **observed**, - - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - - Return: - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, - number of samples, prediction_length, num_output_channels)` - """ - # get number of samples - num_parallel_samples = self.config.num_parallel_samples - - # get model output - outputs = self( - past_values=past_values, - target_values=None, - past_observed_mask=past_observed_mask, - output_hidden_states=False, - ) - - # get distribution - distribution = self.distribution_output.distribution(outputs.forecast_outputs) - # get samples - samples = [ - distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x pred_len x num_output_channels] - # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] - return SamplePatchTSTPredictionOutput(sequences=samples) - - class PatchTSTPredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1973,7 +1783,7 @@ class PatchTSTRegressionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.y_range = config.prediction_range + self.y_range = config.output_range self.use_cls_token = config.use_cls_token self.pooling = config.pooling self.distribution_output = distribution_output @@ -1984,7 +1794,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() if distribution_output is None: - self.projection = nn.Linear(head_dim, config.num_output_channels) + self.projection = nn.Linear(head_dim, config.num_targets) else: self.projection = distribution_output.get_parameter_projection(head_dim) @@ -2028,12 +1838,12 @@ def __init__(self, config: PatchTSTConfig): self.distribution_output = None else: if config.distribution_output == "student_t": - self.distribution_output = 
StudentTOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_targets) elif config.distribution_output == "normal": - self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_targets) elif config.distribution_output == "negative_binomial": self.distribution_output = NegativeBinomialOutput( - dim=config.prediction_length * config.num_output_channels + dim=config.prediction_length * config.num_targets ) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -2081,7 +1891,7 @@ def forward( output_hidden_states=output_hidden_states, output_attentions=output_attentions, ) - # get output head. y_hat is of shape [bs x num_output_channels] or tuple of this shape + # get output head. y_hat is of shape [bs x num_targets] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) loss_val = None @@ -2126,7 +1936,7 @@ def generate( Return: [`SamplePatchTSTRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, - number of samples, num_output_channels)`. + number of samples, num_targets)`. """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -2144,7 +1954,7 @@ def generate( # get samples samples = [ distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x num_output_channels] + ] # samples: list of [bs x num_targets] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] + samples = torch.stack(samples, dim=1) # [bs x num_samples x num_targets] return SamplePatchTSTRegressionOutput(sequences=samples) From 7f0561086d685ea79407ddf39d26adbdac2c4e62 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 14:28:16 -0500 Subject: [PATCH 161/189] Remove *ForForecasting class to match with other time series models. --- .../models/patchtst/configuration_patchtst.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index b3fad61f911f5d..e7fb491f435052 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -123,12 +123,12 @@ class PatchTSTConfig(PretrainedConfig): The dropout probability for head. prediction_length (`int`, *optional*, defaults to 24): The prediction length for the encoder. In other words, the prediction horizon of the model. - num_output_channels (`int`, *optional*, defaults to 1): - Number of output channels. - prediction_range (`list`, *optional*): - The range of prediction values can be set to enforce the model to produce values within a range. + num_targets (`int`, *optional*, defaults to 1): + Number of targets for regression and classificastion tasks. For classification, it is the number of classes. + output_range (`list`, *optional*): + Output range for regression task. The range of output values can be set to enforce the model to produce values within a range. num_parallel_samples (`int`, *optional*, defaults to 100): - The number of samples to generate in parallel for probablistic forecast. + The number of samples to generate in parallel for probablistic prediction. 
```python @@ -196,8 +196,8 @@ def __init__( pooling: str = "mean", head_dropout: float = 0.0, prediction_length: int = 24, - num_output_channels: int = 1, - prediction_range: List = None, + num_targets: int = 1, + output_range: List = None, # distribution head num_parallel_samples: int = 100, **kwargs, @@ -231,7 +231,7 @@ def __init__( self.init_std = init_std self.scaling = scaling - # PatchTST + # PatchTST parameters self.patch_length = patch_length self.stride = stride self.num_patches = self._num_patches() @@ -251,16 +251,16 @@ def __init__( self.pooling = pooling self.head_dropout = head_dropout - # Forecast head + # For prediction head self.shared_projection = shared_projection - - # Forcasting and prediction self.prediction_length = prediction_length + + # For prediction and regression head self.num_parallel_samples = num_parallel_samples # Regression - self.num_output_channels = num_output_channels - self.prediction_range = prediction_range + self.num_targets = num_targets + self.output_range = output_range super().__init__(**kwargs) From 9807142518b02a69ddeb3bb0b7e33d8f6a3e54dc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 6 Nov 2023 20:44:44 +0100 Subject: [PATCH 162/189] make style --- .../models/patchtst/configuration_patchtst.py | 6 ++++-- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e7fb491f435052..7a91549684feb7 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -124,9 +124,11 @@ class PatchTSTConfig(PretrainedConfig): prediction_length (`int`, *optional*, defaults to 24): The prediction length for the encoder. In other words, the prediction horizon of the model. num_targets (`int`, *optional*, defaults to 1): - Number of targets for regression and classificastion tasks. For classification, it is the number of classes. + Number of targets for regression and classificastion tasks. For classification, it is the number of + classes. output_range (`list`, *optional*): - Output range for regression task. The range of output values can be set to enforce the model to produce values within a range. + Output range for regression task. The range of output values can be set to enforce the model to produce + values within a range. num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for probablistic prediction. 
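For reference, a minimal sketch of how the renamed configuration fields fit together; the numbers below are illustrative values (not taken from any checkpoint), and every field not shown is assumed to keep its default:

```python
from transformers import PatchTSTConfig

# Illustrative values only; all other fields keep their defaults.
config = PatchTSTConfig(
    prediction_length=24,      # forecasting horizon used by the prediction head
    num_targets=2,             # replaces num_output_channels for the regression/classification heads
    output_range=[0.0, 1.0],   # replaces prediction_range; bounds the regression output
    num_parallel_samples=100,  # number of samples drawn by generate() with a distribution head
)
print(config.num_targets, config.output_range)
```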
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e5a0af1e86d02b..9aeb6420c13c81 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1842,9 +1842,7 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_targets) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput( - dim=config.prediction_length * config.num_targets - ) + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_targets) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1952,9 +1950,7 @@ def generate( # get distribution distribution = self.distribution_output.distribution(outputs.forecast_outputs) # get samples - samples = [ - distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x num_targets] + samples = [distribution.sample() for _ in range(num_parallel_samples)] # samples: list of [bs x num_targets] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x num_targets] return SamplePatchTSTRegressionOutput(sequences=samples) From 3b8a3063efecadff8dd306ac942ef332041c5712 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 14:49:18 -0500 Subject: [PATCH 163/189] Remove PatchTSTForForecasting in the test --- tests/models/patchtst/test_modeling_patchtst.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 68ba5030b35430..be833bef3ba982 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -40,7 +40,6 @@ MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, PatchTSTConfig, PatchTSTForClassification, - PatchTSTForForecasting, PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTForRegression, @@ -148,7 +147,6 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase ( PatchTSTModel, PatchTSTForPrediction, - PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTForClassification, PatchTSTForRegression, @@ -157,7 +155,7 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase else () ) all_generative_model_classes = ( - (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () + (PatchTSTForPrediction, PatchTSTForRegression, PatchTSTForPretraining) if is_torch_available() else () ) pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} test_pruning = False @@ -335,7 +333,7 @@ def test_pretrain_head(self): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
def test_forecast_head(self): - model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) + model = PatchTSTForPrediction.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) batch = prepare_batch(file="test-batch.pt") From ff45a2036f2bed25a948151df42507a145146a39 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 14:58:45 -0500 Subject: [PATCH 164/189] remove PatchTSTForForecastingOutput class --- .../models/patchtst/modeling_patchtst.py | 50 ++----------------- 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9aeb6420c13c81..ca318fbfcb8d86 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -946,35 +946,6 @@ class PatchTSTForPretrainingOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -@dataclass -class PatchTSTForForecastingOutput(ModelOutput): - """ - Output type of [`PatchTSTForForecastingtion`]. - - Parameters: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length,)`): - Forecast outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - forecast_outputs: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - @dataclass class PatchTSTForRegressionOutput(ModelOutput): """ @@ -1081,21 +1052,6 @@ class SamplePatchTSTPredictionOutput(ModelOutput): sequences: torch.FloatTensor = None -@dataclass -class SamplePatchTSTForecastOutput(ModelOutput): - """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. - - Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, - num_samples, prediction_length, number_channels)`): - Sampled values from the chosen distribution. - """ - - sequences: torch.FloatTensor = None - - @dataclass class SamplePatchTSTRegressionOutput(ModelOutput): """ @@ -1732,7 +1688,7 @@ def generate( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTForecastOutput: + ) -> SamplePatchTSTPredictionOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. @@ -1748,7 +1704,7 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
Return: - [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. """ @@ -1773,7 +1729,7 @@ def generate( ] # samples: list of [bs x forecast_len x num_channels] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x num_channels] - return SamplePatchTSTForecastOutput(sequences=samples) + return SamplePatchTSTPredictionOutput(sequences=samples) class PatchTSTRegressionHead(nn.Module): From abc64c0d7b4210ef819d8dd761232ff0cf008e7f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 15:00:33 -0500 Subject: [PATCH 165/189] change test_forecast_head to test_prediction_head --- tests/models/patchtst/test_modeling_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index be833bef3ba982..07d30826acc813 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -332,7 +332,7 @@ def test_pretrain_head(self): self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. - def test_forecast_head(self): + def test_prediction_head(self): model = PatchTSTForPrediction.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) batch = prepare_batch(file="test-batch.pt") @@ -342,7 +342,7 @@ def test_forecast_head(self): output = model( past_values=batch["past_values"].to(torch_device), future_values=batch["future_values"].to(torch_device), - ).forecast_outputs + ).prediction_outputs expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) self.assertEqual(output.shape, expected_shape) From 6b3fb305bb0b1e3696137cf6769a62f451fa271c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 7 Nov 2023 09:33:22 +0100 Subject: [PATCH 166/189] style --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ca318fbfcb8d86..23c1254acd14ee 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1704,8 +1704,8 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number - of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, + number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. 
""" # get number of samples From 69897e3225fd7033cef4d6ec089af29d8e1f4773 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 7 Nov 2023 10:10:19 +0100 Subject: [PATCH 167/189] fix docs --- docs/source/en/model_doc/patchtst.md | 6 ------ src/transformers/__init__.py | 2 -- src/transformers/models/patchtst/__init__.py | 2 -- src/transformers/utils/dummy_pt_objects.py | 7 ------- utils/check_repo.py | 1 - 5 files changed, 18 deletions(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index ba4b5e27636056..c18abeb20e64ef 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -55,12 +55,6 @@ The original code can be found [here](https://github.com/yuqinie98/PatchTST). - forward -## PatchTSTForForecasting - -[[autodoc]] PatchTSTForForecasting - - forward - - ## PatchTSTForClassification [[autodoc]] PatchTSTForClassification diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 1a29b855d5d38a..8c66ea0f2333c3 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2482,7 +2482,6 @@ [ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTForClassification", - "PatchTSTForForecasting", "PatchTSTForPrediction", "PatchTSTForPretraining", "PatchTSTForRegression", @@ -6375,7 +6374,6 @@ from .models.patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, - PatchTSTForForecasting, PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTForRegression, diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index e2ac594688d90e..8c7db64c198406 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -35,7 +35,6 @@ "PatchTSTModel", "PatchTSTPreTrainedModel", "PatchTSTForPrediction", - "PatchTSTForForecasting", "PatchTSTForPretraining", "PatchTSTForRegression", "PatchTSTForClassification", @@ -54,7 +53,6 @@ from .modeling_patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, - PatchTSTForForecasting, PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTForRegression, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index d05b00c864cb5f..8d07da493c72a8 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5990,13 +5990,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PatchTSTForForecasting(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class PatchTSTForPrediction(metaclass=DummyObject): _backends = ["torch"] diff --git a/utils/check_repo.py b/utils/check_repo.py index ea3acc1311f1f9..d510fe43531a62 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -185,7 +185,6 @@ "TimeSeriesTransformerForPrediction", "InformerForPrediction", "AutoformerForPrediction", - "PatchTSTForForecasting", "PatchTSTForPretraining", "PatchTSTForPrediction", "JukeboxVQVAE", From 42ec43bd2667e95281698aff1da0cec69c05a04c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 7 Nov 2023 10:28:42 +0100 Subject: [PATCH 168/189] fix tests --- src/transformers/models/patchtst/modeling_patchtst.py | 2 -- tests/models/patchtst/test_modeling_patchtst.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py 
b/src/transformers/models/patchtst/modeling_patchtst.py index 23c1254acd14ee..693f3050f851ec 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -983,10 +983,8 @@ class PatchTSTForPredictionOutput(ModelOutput): Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, -1)`): Prediction outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 07d30826acc813..de3a42ef5fd9b5 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -193,7 +193,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): rng = random.Random(self.model_tester.seed_number) labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_labels, rng=rng) - inputs_dict["labels"] = labels + inputs_dict["target_values"] = labels inputs_dict.pop("future_values") elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): rng = random.Random(self.model_tester.seed_number) @@ -281,7 +281,7 @@ def test_forward_signature(self): ): expected_arg_names.remove("future_values") expected_arg_names.remove("past_observed_mask") - expected_arg_names.append("labels") if model_class in get_values( + expected_arg_names.append("target_values") if model_class in get_values( MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING ) else expected_arg_names.append("target_values") expected_arg_names.append("past_observed_mask") From f451c0574d2ab081adb7185c204d0241d596ac7c Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 7 Nov 2023 17:11:01 -0500 Subject: [PATCH 169/189] change num_labels to num_targets --- tests/models/patchtst/test_modeling_patchtst.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index de3a42ef5fd9b5..4313591da29609 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -70,7 +70,7 @@ def __init__( lags_sequence=[1, 2, 3, 4, 5], distil=False, seed_number=42, - num_labels=2, + num_targets=2, num_output_channels=2, ): self.parent = parent @@ -92,7 +92,7 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.seed_number = seed_number - self.num_labels = num_labels + self.num_targets = num_targets self.num_output_channels = num_output_channels self.distil = distil self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 @@ -112,7 +112,7 @@ def get_config(self): context_length=self.context_length, activation_function=self.hidden_act, seed_number=self.seed_number, - num_labels=self.num_labels, + num_targets=self.num_targets, num_output_channels=self.num_output_channels, ) @@ -192,7 +192,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): # else 
if classification model: elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): rng = random.Random(self.model_tester.seed_number) - labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_labels, rng=rng) + labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_targets, rng=rng) inputs_dict["target_values"] = labels inputs_dict.pop("future_values") elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): From 131454d0c8d72c44a10e01dc88cf8e438aa9b208 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 17:00:37 -0500 Subject: [PATCH 170/189] Remove PatchTSTTranspose --- .../models/patchtst/modeling_patchtst.py | 45 +++---------------- 1 file changed, 7 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 693f3050f851ec..194e19ae3f508e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -199,33 +199,6 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value -class PatchTSTTranspose(nn.Module): - """ - Transpose the tensor to the dimension defined in **dims** - - Parameters: - dims (`list`): list of dimensions to be transposed - contiguous (`bool`, default to False): if True, the transposed tensor is contiguous - """ - - def __init__(self, *dims, contiguous=False): - super().__init__() - self.dims = dims - self.contiguous = dims - - def forward(self, inputs: torch.Tensor): - """ - Parameters: - inputs (`torch.Tensor`): input to be transposed - Returns: - `torch.Tensor`: transposed tensor - """ - if self.contiguous: - return inputs.transpose(*self.dims).contiguous() - else: - return inputs.transpose(*self.dims) - - class PatchTSTBatchNorm(nn.Module): """ Parameters: @@ -233,11 +206,9 @@ class PatchTSTBatchNorm(nn.Module): d_model (`int`): model dimension """ - def __init__(self, d_model): + def __init__(self, config): super().__init__() - self.d_model = d_model - self.transpose = PatchTSTTranspose(1, 2) - self.batchnorm = nn.BatchNorm1d(self.d_model) + self.batchnorm = nn.BatchNorm1d(config.d_model) def forward(self, inputs: torch.Tensor): """ @@ -245,12 +216,11 @@ def forward(self, inputs: torch.Tensor): inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): input for Batch norm calculation Returns: - `torch.Tensor`: tensor + `torch.Tensor` of shape `(batch_size, sequence_length, d_model)` """ - output = self.transpose(inputs) # output: (batch_size, d_model, sequence_length) + output = inputs.transpose(1, 2) # output: (batch_size, d_model, sequence_length) output = self.batchnorm(output) - output = self.transpose(output) # output: (batch_size, sequence_length, d_model) - return output + return output.transpose(1, 2) def positional_encoding(position_embedding_type, learned, q_len, d_model): @@ -742,7 +712,6 @@ class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder """ - def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -1297,7 +1266,7 @@ def forward( ) -class MaskPretrainHead(nn.Module): +class PatchTSTMaskPretrainHead(nn.Module): """ Pretraining head for mask modelling """ @@ -1335,7 +1304,7 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) - self.head = MaskPretrainHead(config) + self.head = PatchTSTMaskPretrainHead(config) # Initialize 
weights and apply final processing self.post_init() From 58dd1ec0bdf43ae2f117c0bfa9c3fef46b1dd1c2 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 17:28:43 -0500 Subject: [PATCH 171/189] remove arguments in PatchTSTMeanScaler --- .../models/patchtst/modeling_patchtst.py | 95 +++++++++---------- 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 194e19ae3f508e..018df19f60c153 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1103,83 +1103,67 @@ def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tens # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->PatchTST class PatchTSTMeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - default_scale (`float`, *optional*, defaults to `None`): - Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. - minimum_scale (`float`, *optional*, defaults to 1e-10): - Default minimum possible scale that is used for any item. """ - def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 - ): + def __init__(self): super().__init__() - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale - self.default_scale = default_scale @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # shape: (N, [C], T=1) - ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) - num_observed = observed_indicator.sum(self.dim, keepdim=True) - + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + ts_sum = (data * observed_indicator).abs().sum(dim=1, keepdim=True) + num_observed = observed_indicator.sum(dim=1, keepdim=True) scale = ts_sum / torch.clamp(num_observed, min=1) - # If `default_scale` is provided, we use it, otherwise we use the scale - # of the batch. - if self.default_scale is None: - batch_sum = ts_sum.sum(dim=0) - batch_observations = torch.clamp(num_observed.sum(0), min=1) - default_scale = torch.squeeze(batch_sum / batch_observations) - else: - default_scale = self.default_scale * torch.ones_like(scale) + # use the scale of the batch. 
+ batch_sum = ts_sum.sum(dim=0) + batch_observations = torch.clamp(num_observed.sum(0), min=1) + default_scale = torch.squeeze(batch_sum / batch_observations) # apply default scale where there are no observations scale = torch.where(num_observed > 0, scale, default_scale) - # ensure the scale is at least `self.minimum_scale` - scale = torch.clamp(scale, min=self.minimum_scale) + # ensure the scale is at least 1e-10 + scale = torch.clamp(scale, min=1e-10) scaled_data = data / scale - - if not self.keepdim: - scale = scale.squeeze(dim=self.dim) - return scaled_data, torch.zeros_like(scale), scale # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->PatchTST class PatchTSTNOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. """ - def __init__(self, dim: int, keepdim: bool = False): + def __init__(self): super().__init__() - self.dim = dim - self.keepdim = keepdim def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) - loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + scale = torch.ones_like(data, requires_grad=False).mean(dim=1, keepdim=True) + loc = torch.zeros_like(data, requires_grad=False).mean(dim=1, keepdim=True) return data, loc, scale @@ -1187,15 +1171,24 @@ class PatchTSTScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() if config.scaling == "mean" or config.scaling is True: - self.scaler = PatchTSTMeanScaler(dim=1, keepdim=True) + self.scaler = PatchTSTMeanScaler() elif config.scaling == "std": - self.scaler = PatchTSTStdScaler(dim=1, keepdim=True) + self.scaler = PatchTSTStdScaler() else: - self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) + self.scaler = PatchTSTNOPScaler() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, um_input_channels)`) + """ data, loc, scale = self.scaler(data, observed_indicator) return data, loc, scale From 9a69973cc8168ebb30c4f1d05bd5a53f544e25dd Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 17:52:02 -0500 Subject: [PATCH 172/189] remove arguments in PatchTSTStdScaler --- .../models/patchtst/modeling_patchtst.py | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git 
a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 018df19f60c153..a35e69ded624dd 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1069,34 +1069,33 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ - Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - - Args: - dim (`int`): - Dimension along which to calculate the mean and standard deviation. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - minimum_scale (`float`, *optional*, defaults to 1e-5): - Default scale that is used for elements that are constantly zero along dimension `dim`. """ - def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + def __init__(self): super().__init__() - if not dim > 0: - raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - denominator = weights.sum(self.dim, keepdim=self.keepdim) + def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + denominator = observed_indicator.sum(dim=1, keepdim=True) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(dim=1, keepdim=True) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator - scale = torch.sqrt(variance + self.minimum_scale) + variance = (((data - loc) * observed_indicator) ** 2).sum(dim=1, keepdim=True) / denominator + scale = torch.sqrt(variance + 1e-10) return (data - loc) / scale, loc, scale @@ -1106,7 +1105,6 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. """ - def __init__(self): super().__init__() @@ -1118,6 +1116,8 @@ def forward( Parameters: data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. 
Returns: tuple of `torch.Tensor` of shapes (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, @@ -1151,7 +1151,7 @@ def __init__(self): super().__init__() def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor + self, data: torch.Tensor, observed_indicator: torch.Tensor=None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters: @@ -1184,6 +1184,8 @@ def forward( Parameters: data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. Returns: tuple of `torch.Tensor` of shapes (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, From 7ab8d59233d42da87bcd1c37c35bf53bd6513f60 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 19:55:19 -0500 Subject: [PATCH 173/189] add config as an argument to all the scaler classes --- .../models/patchtst/modeling_patchtst.py | 62 ++++++++++++------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a35e69ded624dd..9422ba9ccba459 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -206,7 +206,7 @@ class PatchTSTBatchNorm(nn.Module): d_model (`int`): model dimension """ - def __init__(self, config): + def __init__(self, config: PatchTSTConfig): super().__init__() self.batchnorm = nn.BatchNorm1d(config.d_model) @@ -1073,8 +1073,11 @@ class PatchTSTStdScaler(nn.Module): by subtracting from the mean and dividing by the standard deviation. """ - def __init__(self): + def __init__(self, config: PatchTSTConfig): super().__init__() + self.dim = 1 if config.scaling_dim is None else config.scaling_dim + self.keepdim = True if config.keepdim is None else config.keepdim + self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale @torch.no_grad() def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor @@ -1090,12 +1093,12 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, `(batch_size, 1, num_input_channels)`) """ - denominator = observed_indicator.sum(dim=1, keepdim=True) + denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * observed_indicator).sum(dim=1, keepdim=True) / denominator + loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * observed_indicator) ** 2).sum(dim=1, keepdim=True) / denominator - scale = torch.sqrt(variance + 1e-10) + variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale @@ -1105,8 +1108,12 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. 
""" - def __init__(self): + def __init__(self, config: PatchTSTConfig): super().__init__() + self.dim = 1 if config.scaling_dim is None else config.scaling_dim + self.keepdim = True if config.keepdim is None else config.keepdim + self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale + self.default_scale = config.default_scale if config.default_scale else None @torch.no_grad() def forward( @@ -1123,21 +1130,30 @@ def forward( (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, `(batch_size, 1, num_input_channels)`) """ - ts_sum = (data * observed_indicator).abs().sum(dim=1, keepdim=True) - num_observed = observed_indicator.sum(dim=1, keepdim=True) + ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) + num_observed = observed_indicator.sum(self.dim, keepdim=True) + scale = ts_sum / torch.clamp(num_observed, min=1) - # use the scale of the batch. - batch_sum = ts_sum.sum(dim=0) - batch_observations = torch.clamp(num_observed.sum(0), min=1) - default_scale = torch.squeeze(batch_sum / batch_observations) + # If `default_scale` is provided, we use it, otherwise we use the scale + # of the batch. + if self.default_scale is None: + batch_sum = ts_sum.sum(dim=0) + batch_observations = torch.clamp(num_observed.sum(0), min=1) + default_scale = torch.squeeze(batch_sum / batch_observations) + else: + default_scale = self.default_scale * torch.ones_like(scale) # apply default scale where there are no observations scale = torch.where(num_observed > 0, scale, default_scale) - # ensure the scale is at least 1e-10 - scale = torch.clamp(scale, min=1e-10) + # ensure the scale is at least `self.minimum_scale` + scale = torch.clamp(scale, min=self.minimum_scale) scaled_data = data / scale + + if not self.keepdim: + scale = scale.squeeze(dim=self.dim) + return scaled_data, torch.zeros_like(scale), scale @@ -1147,8 +1163,10 @@ class PatchTSTNOPScaler(nn.Module): Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. 
""" - def __init__(self): + def __init__(self, config: PatchTSTConfig): super().__init__() + self.dim = 1 if config.scaling_dim is None else config.scaling_dim + self.keepdim = True if config.keepdim is None else config.keepdim def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor=None @@ -1162,8 +1180,8 @@ def forward( (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, `(batch_size, 1, num_input_channels)`) """ - scale = torch.ones_like(data, requires_grad=False).mean(dim=1, keepdim=True) - loc = torch.zeros_like(data, requires_grad=False).mean(dim=1, keepdim=True) + scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale @@ -1171,11 +1189,11 @@ class PatchTSTScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() if config.scaling == "mean" or config.scaling is True: - self.scaler = PatchTSTMeanScaler() + self.scaler = PatchTSTMeanScaler(config) elif config.scaling == "std": - self.scaler = PatchTSTStdScaler() + self.scaler = PatchTSTStdScaler(config) else: - self.scaler = PatchTSTNOPScaler() + self.scaler = PatchTSTNOPScaler(config) def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor @@ -1266,7 +1284,7 @@ class PatchTSTMaskPretrainHead(nn.Module): Pretraining head for mask modelling """ - def __init__(self, config): + def __init__(self, config: PatchTSTConfig): super().__init__() self.dropout = nn.Dropout(config.dropout) self.linear = nn.Linear(config.d_model, config.patch_length) From 4caa376e9225736ddeb105da6aece13c8c54482f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 20:10:58 -0500 Subject: [PATCH 174/189] reformat --- .../models/patchtst/modeling_patchtst.py | 99 ++++++++----------- 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9422ba9ccba459..032d23326f0925 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -295,13 +295,13 @@ def random_masking( noise = torch.rand(batch_size, 1, sequence_length, device=device) # noise in [0, 1], bs x 1 x L noise = noise.repeat(1, num_channels, 1) # bs x num_channels x time else: + # noise in [0, 1], bs x num_channels x L noise = torch.rand( - batch_size, num_channels, sequence_length, device=device - ) # noise in [0, 1], bs x num_channels x L + batch_size, num_channels, sequence_length, device=device) + # mask: [bs x num_channels x num_patch] mask = torch.ones( - batch_size, num_channels, sequence_length, device=device - ) # mask: [bs x num_channels x num_patch] + batch_size, num_channels, sequence_length, device=device) mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -432,14 +432,13 @@ def forward(self, past_values: torch.Tensor): raise ValueError( f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." 
) - - output = past_values[:, self.sequence_start :, :] # output: [bs x new_sequence_length x num_channels] + # output: [bs x new_sequence_length x num_channels] + output = past_values[:, self.sequence_start :, :] + # output: [bs x num_patches x num_input_channels x patch_length] output = output.unfold( - dimension=-2, size=self.patch_length, step=self.stride - ) # output: [bs x num_patches x num_input_channels x patch_length] - output = output.transpose( - -2, -3 - ).contiguous() # output: [bs x num_input_channels x num_patches x patch_length] + dimension=-2, size=self.patch_length, step=self.stride) + # output: [bs x num_input_channels x num_patches x patch_length] + output = output.transpose(-2, -3).contiguous() return output @@ -533,7 +532,7 @@ def __init__(self, config: PatchTSTConfig): if "batch" in config.norm.lower(): self.norm_sublayer1 = PatchTSTBatchNorm(config.d_model) else: - self.norm_sublayer1 = nn.LayerNorm(config.d_model) + self.norm_sublayer1 = nn.LayerNorm(config.d_model, eps=config.norm_eps) # Add & Norm of the sublayer 2 if self.channel_attention: @@ -541,7 +540,7 @@ def __init__(self, config: PatchTSTConfig): if "batch" in config.norm.lower(): self.norm_sublayer2 = PatchTSTBatchNorm(config.d_model) else: - self.norm_sublayer2 = nn.LayerNorm(config.d_model) + self.norm_sublayer2 = nn.LayerNorm(config.d_model, eps=config.norm_eps) # Position-wise Feed-Forward self.ff = nn.Sequential( @@ -556,7 +555,7 @@ def __init__(self, config: PatchTSTConfig): if "batch" in config.norm.lower(): self.norm_sublayer3 = PatchTSTBatchNorm(config.d_model) else: - self.norm_sublayer3 = nn.LayerNorm(config.d_model) + self.norm_sublayer3 = nn.LayerNorm(config.d_model, eps=config.norm_eps) self.pre_norm = config.pre_norm @@ -572,30 +571,28 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape # First sublayer: attention across time - hidden_state = hidden_state.view( - batch_size * num_input_channels, sequence_length, d_model - ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] + # hidden_states: [(bs*num_channels) x sequence_length x d_model] + hidden_state = hidden_state.view(batch_size * num_input_channels, sequence_length, d_model) if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, attn_weights, _ = self.self_attn( hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions ) - hidden_state = hidden_state + self.dropout_path1( - attn_output - ) # Add: residual connection with residual dropout + # Add: residual connection with residual dropout + hidden_state = hidden_state + self.dropout_path1(attn_output) else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT attn_output, attn_weights, _ = self.self_attn( hidden_states=hidden_state, output_attentions=output_attentions ) + # hidden_states: [(bs*num_channels) x sequence_length x d_model] hidden_state = self.norm_sublayer1( - hidden_state + self.dropout_path1(attn_output) - ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] + hidden_state + self.dropout_path1(attn_output)) + # [bs x num_channels x sequence_length x d_model] hidden_state = hidden_state.reshape( - batch_size, num_input_channels, sequence_length, d_model - ) # [bs x num_channels x sequence_length x d_model] + batch_size, num_input_channels, sequence_length, d_model) # second sublayer: attention across variable at any given time # [bs 
x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] @@ -611,17 +608,16 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] attn_output, channel_attn_weights, _ = self.self_attn( hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions ) - hidden_state = hidden_state + self.dropout_path2( - attn_output - ) # Add: residual connection with residual dropout + # Add: residual connection with residual dropout + hidden_state = hidden_state + self.dropout_path2(attn_output) else: ## Multi-Head attention and Add residual connection and Norm attn_output, channel_attn_weights, _ = self.self_attn( hidden_states=hidden_state, output_attentions=output_attentions ) + # hidden_states: [(bs*sequence_length) x num_channels x d_model] hidden_state = self.norm_sublayer2( - hidden_state + self.dropout_path2(attn_output) - ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] + hidden_state + self.dropout_path2(attn_output)) hidden_state = ( hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model) @@ -630,26 +626,24 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden + # src: [(batch_size*num_channels) x sequence_length x d_model] hidden_state = hidden_state.view( - batch_size * num_input_channels, sequence_length, d_model - ) # src: [(batch_size*num_channels) x sequence_length x d_model] + batch_size * num_input_channels, sequence_length, d_model) if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection + # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path3( - self.ff(self.norm_sublayer3(hidden_state)) - ) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer3(hidden_state))) else: ## Position-wise Feed-Forward and Add residual connection and Norm + # Add: residual connection with residual dropout hidden_state = self.norm_sublayer3( - hidden_state + self.dropout_path3(self.ff(hidden_state)) - ) # Add: residual connection with residual dropout + hidden_state + self.dropout_path3(self.ff(hidden_state))) - hidden_state = hidden_state.reshape( - batch_size, num_input_channels, sequence_length, d_model - ) # [bs x num_channels x sequence_length x d_model] + # [bs x num_channels x sequence_length x d_model] + hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model) outputs = (hidden_state,) - if output_attentions: outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights,) @@ -777,13 +771,11 @@ def forward( # append cls token cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(patch_input.shape[0], -1, -1) # get the same copy for all the batch samples - hidden_state = torch.cat( - (cls_tokens, patch_input), dim=1 - ) # x: [bs x num_channels x (num_patches+1) x d_model] + # x: [bs x num_channels x (num_patches+1) x d_model] + hidden_state = torch.cat((cls_tokens, patch_input), dim=1) else: - hidden_state = self.positional_dropout( - patch_input + self.position_enc - ) # x: [bs x num_channels x num_patches x d_model] + # x: [bs x num_channels x num_patches x d_model] + hidden_state = self.positional_dropout(patch_input + self.position_enc) encoder_states = () if output_hidden_states else None all_attentions = () if 
output_attentions else None @@ -792,10 +784,8 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_state,) - layer_outputs = encoder_layer( - hidden_state=hidden_state, - output_attentions=output_attentions, - ) + layer_outputs = encoder_layer(hidden_state=hidden_state, + output_attentions=output_attentions) # get hidden state hidden_state = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token @@ -1542,9 +1532,8 @@ def forward(self, embedding: torch.Tensor): for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.dropouts[i](z) - z = self.projections[i]( - z - ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + z = self.projections[i](z) x_out.append(z) output = torch.stack(x_out, dim=1) # x: [bs x num_channels x forecast_len] else: @@ -1554,12 +1543,10 @@ def forward(self, embedding: torch.Tensor): # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head if isinstance(output, tuple): - output = tuple( - z.transpose(2, 1) for z in output - ) # ([bs x forecast_len x num_channels], [bs x forecast_len x num_channels]) + # output: ([bs x forecast_len x num_channels], [bs x forecast_len x num_channels]) + output = tuple(z.transpose(2, 1) for z in output) else: output = output.transpose(2, 1) # [bs x forecast_len x num_channels] - return output From 03d27f778ab422ec3e5f9a1cbd60226315a453bf Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 21:38:10 -0500 Subject: [PATCH 175/189] Add norm_eps for batchnorm and layernorm --- src/transformers/models/patchtst/configuration_patchtst.py | 4 ++++ src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 7a91549684feb7..c36241c3010740 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -68,6 +68,8 @@ class PatchTSTConfig(PretrainedConfig): Dimension of the "intermediate" (often named feed-forward) layer in encoder. norm (`str` , *optional*, defaults to `"BatchNorm"`): Normalization at each Transformer layer. Can be `"BatchNorm"` or `"LayerNorm"`. + norm_eps (`float`, *optional*, defaults to 1e-5): + A value added to the denominator for numerical stability of normalization. Default: 1e-5 attention_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for the attention probabilities. 
dropout (`float`, *optional*, defaults to 0.0): @@ -170,6 +172,7 @@ def __init__( channel_attention: bool = False, encoder_ffn_dim: int = 256, norm: str = "BatchNorm", + norm_eps: float = 1e-5, attention_dropout: float = 0.0, dropout: float = 0.0, positional_dropout: float = 0.0, @@ -221,6 +224,7 @@ def __init__( self.shared_embedding = shared_embedding self.channel_attention = channel_attention self.norm = norm + self.norm_eps = norm_eps self.positional_dropout = positional_dropout self.dropout_path = dropout_path self.ff_dropout = ff_dropout diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 032d23326f0925..0cbffcb844aca6 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -208,7 +208,7 @@ class PatchTSTBatchNorm(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.batchnorm = nn.BatchNorm1d(config.d_model) + self.batchnorm = nn.BatchNorm1d(config.d_model, eps=config.norm_eps) def forward(self, inputs: torch.Tensor): """ From 03e32203b24f53b00123eb50dd2f082d40c13201 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 21:49:19 -0500 Subject: [PATCH 176/189] reformat. --- .../models/patchtst/modeling_patchtst.py | 49 +++++++------------ 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 0cbffcb844aca6..dc3845745063f1 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -577,58 +577,47 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions - ) + hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions) # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path1(attn_output) else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT attn_output, attn_weights, _ = self.self_attn( - hidden_states=hidden_state, output_attentions=output_attentions - ) + hidden_states=hidden_state, output_attentions=output_attentions) # hidden_states: [(bs*num_channels) x sequence_length x d_model] - hidden_state = self.norm_sublayer1( - hidden_state + self.dropout_path1(attn_output)) + hidden_state = self.norm_sublayer1(hidden_state + self.dropout_path1(attn_output)) - # [bs x num_channels x sequence_length x d_model] - hidden_state = hidden_state.reshape( - batch_size, num_input_channels, sequence_length, d_model) + # hidden_state: [bs x num_channels x sequence_length x d_model] + hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model) # second sublayer: attention across variable at any given time - # [bs x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] - # -> [(bs*sequence_length) x num_channels x d_model] if self.channel_attention: - hidden_state = ( - hidden_state.transpose(2, 1) - .contiguous() - .view(batch_size * sequence_length, num_input_channels, d_model) - ) # [(bs*sequence_length) x num_channels x d_model] + # hidden_state: [bs x sequence_length x num_channels x d_model] + hidden_state 
= hidden_state.transpose(2, 1).contiguous() + # hidden_state: [(bs*sequence_length) x num_channels x d_model] + hidden_state = hidden_state.view(batch_size * sequence_length, num_input_channels, d_model) if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions - ) + hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions) # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path2(attn_output) else: ## Multi-Head attention and Add residual connection and Norm attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=hidden_state, output_attentions=output_attentions - ) + hidden_states=hidden_state, output_attentions=output_attentions) # hidden_states: [(bs*sequence_length) x num_channels x d_model] - hidden_state = self.norm_sublayer2( - hidden_state + self.dropout_path2(attn_output)) + hidden_state = self.norm_sublayer2(hidden_state + self.dropout_path2(attn_output)) - hidden_state = ( - hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model) - .transpose(1, 2) - .contiguous() - ) # src: [bs x num_channels x sequence_length x d_model] + # Reshape hidden state + # hidden_state: [bs x sequence_length x num_channels x d_model] + hidden_state = hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model) + # hidden_state: [bs x num_channels x sequence_length x d_model] + hidden_state = hidden_state.transpose(1, 2).contiguous() # Third sublayer: mixing across hidden - # src: [(batch_size*num_channels) x sequence_length x d_model] - hidden_state = hidden_state.view( - batch_size * num_input_channels, sequence_length, d_model) + # hidden_state: [(batch_size*num_channels) x sequence_length x d_model] + hidden_state = hidden_state.view(batch_size * num_input_channels, sequence_length, d_model) if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection # Add: residual connection with residual dropout From a8dc48adcdff3ac767cceb63d6919fe59d689d20 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 22:07:56 -0500 Subject: [PATCH 177/189] reformat --- .../models/patchtst/modeling_patchtst.py | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index dc3845745063f1..877b4089720da7 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1072,11 +1072,11 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, `(batch_size, 1, num_input_channels)`) """ - denominator = weights.sum(self.dim, keepdim=self.keepdim) + denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale @@ 
-1217,11 +1217,31 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers + output_attentions (`bool`, *optional*): + Whether or not to return the output attention of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. + + Returns: + `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) + + """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1585,7 +1605,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): - future target values associates with the `past_values` + future target values associated with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
@@ -1679,10 +1699,8 @@ def generate( distribution = self.distribution_output.distribution( outputs.prediction_outputs, loc=outputs.loc, scale=outputs.scale ) - # get samples - samples = [ - distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x forecast_len x num_channels] + # get samples: list of [bs x forecast_len x num_channels] + samples = [distribution.sample() for _ in range(num_parallel_samples)] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x num_channels] return SamplePatchTSTPredictionOutput(sequences=samples) @@ -1861,8 +1879,8 @@ def generate( # get distribution distribution = self.distribution_output.distribution(outputs.forecast_outputs) - # get samples - samples = [distribution.sample() for _ in range(num_parallel_samples)] # samples: list of [bs x num_targets] + # get samples: list of [bs x num_targets] + samples = [distribution.sample() for _ in range(num_parallel_samples)] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x num_targets] return SamplePatchTSTRegressionOutput(sequences=samples) From d002bac92bc93abadb04cf426f5f6f6c6c4afad3 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 22:21:35 -0500 Subject: [PATCH 178/189] edit docstring --- .../models/patchtst/configuration_patchtst.py | 10 ++++----- .../models/patchtst/modeling_patchtst.py | 22 ++++++++----------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index c36241c3010740..b8a9d0f512025f 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -51,9 +51,9 @@ class PatchTSTConfig(PretrainedConfig): distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". patch_length (`int`, *optional*, defaults to 1): - Define the patch length of the patchification process. Default to 1. + Define the patch length of the patchification process. stride (`int`, *optional*, defaults to 1): - define the stride of the patchification process. Default to 1. + define the stride of the patchification process. encoder_layers (`int`, *optional*, defaults to 3): Number of encoder layers. d_model (`int`, *optional*, defaults to 64): @@ -69,7 +69,7 @@ class PatchTSTConfig(PretrainedConfig): norm (`str` , *optional*, defaults to `"BatchNorm"`): Normalization at each Transformer layer. Can be `"BatchNorm"` or `"LayerNorm"`. norm_eps (`float`, *optional*, defaults to 1e-5): - A value added to the denominator for numerical stability of normalization. Default: 1e-5 + A value added to the denominator for numerical stability of normalization. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for the attention probabilities. dropout (`float`, *optional*, defaults to 0.0): @@ -85,7 +85,7 @@ class PatchTSTConfig(PretrainedConfig): activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. pre_norm (`bool`, *optional*, defaults to `True`): - Normalization is applied before self-attention if pre_norm is set to True. Otherwise, normalization is + Normalization is applied before self-attention if pre_norm is set to `True`. Otherwise, normalization is applied after residual block. 
positional_encoding (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. @@ -98,7 +98,7 @@ class PatchTSTConfig(PretrainedConfig): shared_projection (`bool`, *optional*, defaults to `True`): Sharing the projection layer across different channels in the forecast head. seed_number (`Optional`, *optional*): - Use seed number for random masking. + Seed number used for random masking. If unset, no seed is set. scaling (`Union`, *optional*, defaults to `"mean"`): Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the scaler is set to "mean". diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 877b4089720da7..39bc1b2fb3d211 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -270,11 +270,11 @@ def random_masking( mask_ratio (`float`): Mask ratio. unmasked_channel_indices (list, *optional*): - indices of unmasked channels. These channels will not be masked. Defaults to None. + indices of unmasked channels. These channels will not be masked. channel_consistent_masking (bool, *optional* defaults to False): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to False. - mask_value (int, *optional* defaults to 0): + across channels. + mask_value (int, *optional*, defaults to 0): Value to use for masking. seed_number (int, *optional*): Value to set for the random seed. @@ -337,11 +337,11 @@ def forecast_masking( List of patch lengths to mask in the end of the data. forecast_mask_ratios (`list`, *optional*): [0.7, 0.3] List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and - forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. + forecast_mask_ratios is [1,1], then equal weights to both patch lengths. unmasked_channel_indices (`list`, *optional*): - Control Variable channel indices. These channels will not be masked. Defaults to None. - mask_value (`int`, *optional* defaults to 0): - Value to use for masking. Defaults to 0. + Control Variable channel indices. These channels will not be masked. + mask_value (`int`, *optional*, defaults to 0): + Value to use for masking. seed_number (`int`, *optional*): Value to set for the random seed. 
@@ -716,13 +716,11 @@ def __init__(self, config: PatchTSTConfig): ) else: self.position_enc = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches, config.d_model - ) + config.positional_encoding, config.learn_pe, config.num_patches, config.d_model) # Positional dropout self.positional_dropout = ( - nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() - ) + nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()) # Encoder self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) @@ -1058,7 +1056,6 @@ def __init__(self, config: PatchTSTConfig): self.keepdim = True if config.keepdim is None else config.keepdim self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale - @torch.no_grad() def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ @@ -1094,7 +1091,6 @@ def __init__(self, config: PatchTSTConfig): self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale self.default_scale = config.default_scale if config.default_scale else None - @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: From 97d75e66544b71ca327a1b09c642f5d1b7ecda96 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 22:30:56 -0500 Subject: [PATCH 179/189] update docstring --- .../models/patchtst/configuration_patchtst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index b8a9d0f512025f..a75c63a4065fc2 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -102,10 +102,10 @@ class PatchTSTConfig(PretrainedConfig): scaling (`Union`, *optional*, defaults to `"mean"`): Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the scaler is set to "mean". - mask_input (`bool`, *optional*, defaults to False): + mask_input (`bool`, *optional*, defaults to `False`): Apply masking during the pretraining. mask_type (`str`, *optional*, defaults to `"random"`): - Masking type. Only `"random"` is currently supported. + Masking type. Only `"random"` and `"forecast"` are currently supported. random_mask_ratio (`float`, *optional*, defaults to 0.5): Masking ratio is applied to mask the input data during random pretraining. forecast_mask_patches (`List`, *optional*, defaults to `[2, 3]`): @@ -116,9 +116,9 @@ class PatchTSTConfig(PretrainedConfig): channel_consistent_masking (`bool`, *optional*, defaults to `False`): If channel consistent masking is True, all the channels will have the same masking. unmasked_channel_indices (`list`, *optional*): - Channels are not masked during pretraining. + Channels that are not masked during pretraining. mask_value (`int`, *optional*, defaults to 0): - Mask value to set. + Define the value of entries to be masked when pretraining. pooling (`str`, *optional*, defaults to `"mean"`): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. head_dropout (`float`, *optional*, defaults to 0.0): @@ -132,7 +132,7 @@ class PatchTSTConfig(PretrainedConfig): Output range for regression task. 
The range of output values can be set to enforce the model to produce values within a range. num_parallel_samples (`int`, *optional*, defaults to 100): - The number of samples to generate in parallel for probablistic prediction. + The number of samples is generated in parallel for probablistic prediction. ```python From 49232db420939771c6644e350cc18398b2332401 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 22:56:43 -0500 Subject: [PATCH 180/189] change variable name pooling to pooling_type --- .../models/patchtst/configuration_patchtst.py | 8 +- .../models/patchtst/modeling_patchtst.py | 104 ++++++++++-------- 2 files changed, 64 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index a75c63a4065fc2..d0b9963fb49b7e 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -119,8 +119,8 @@ class PatchTSTConfig(PretrainedConfig): Channels that are not masked during pretraining. mask_value (`int`, *optional*, defaults to 0): Define the value of entries to be masked when pretraining. - pooling (`str`, *optional*, defaults to `"mean"`): - Pooling in the latent representation. `"mean"`, `"max"` and None are supported. + pooling_type (`str`, *optional*, defaults to `"mean"`): + Pooling of the embedding. `"mean"`, `"max"` and `None` are supported. head_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for head. prediction_length (`int`, *optional*, defaults to 24): @@ -198,7 +198,7 @@ def __init__( unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, # head - pooling: str = "mean", + pooling_type: str = "mean", head_dropout: float = 0.0, prediction_length: int = 24, num_targets: int = 1, @@ -254,7 +254,7 @@ def __init__( self.mask_value = mask_value # general head params - self.pooling = pooling + self.pooling_type = pooling_type self.head_dropout = head_dropout # For prediction head diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 39bc1b2fb3d211..645db3e90d21c7 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1448,7 +1448,7 @@ class PatchTSTClassificationHead(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token - self.pooling = config.pooling + self.pooling_type = config.pooling_type self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_targets) @@ -1464,17 +1464,21 @@ def forward(self, embedding: torch.Tensor): """ if self.use_cls_token: - x = embedding[:, :, 0, :] # use the first output token, x: bs x num_channels x d_model - elif self.pooling == "mean": - x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] - elif self.pooling == "max": - x = embedding.max(dim=2) # x: [bs x num_channels x d_model] + # use the first output token, pooled_embedding: bs x num_channels x d_model + pooled_embedding = embedding[:, :, 0, :] + elif self.pooling_type == "mean": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.mean(dim=2) + elif self.pooling_type == "max": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = 
embedding.max(dim=2) else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") - - x = self.flatten(x) # x: bs x num_channels * d_model - y = self.linear(self.dropout(x)) # y: bs x n_classes - return y + raise Exception(f"pooling operator {self.pooling_type} is not implemented yet") + # pooled_embedding: bs x num_channels * d_model + pooled_embedding = self.flatten(pooled_embedding) + # output: bs x n_classes + output = self.linear(self.dropout(pooled_embedding)) + return output class PatchTSTPredictionHead(nn.Module): @@ -1484,8 +1488,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.shared_projection = config.shared_projection self.num_input_channels = config.num_input_channels self.use_cls_token = config.use_cls_token - self.pooling = config.pooling - head_dim = config.d_model if self.pooling else config.d_model * config.num_patches + self.pooling_type = config.pooling_type + head_dim = config.d_model if self.pooling_type else config.d_model * config.num_patches if not self.shared_projection: # if each channel has its own head @@ -1523,29 +1527,38 @@ def forward(self, embedding: torch.Tensor): """ if self.use_cls_token: - y = embedding[:, :, 0, :] # y: [bs x num_channels x d_model] + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding[:, :, 0, :] else: - if self.pooling == "mean": - y = embedding.mean(dim=2) # y: [bs x num_channels x d_model] - elif self.pooling == "max": - y = embedding.max(dim=2) # y: [bs x num_channels x d_model] + if self.pooling_type == "mean": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.mean(dim=2) + elif self.pooling_type == "max": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.max(dim=2) else: - y = embedding # y: [bs x num_channels x num_patches x d_model] + # pooled_embedding: [bs x num_channels x num_patches x d_model] + pooled_embedding = embedding if not self.shared_projection: - x_out = [] + output = [] for i in range(self.num_input_channels): - z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] - z = self.dropouts[i](z) - # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head - z = self.projections[i](z) - x_out.append(z) - output = torch.stack(x_out, dim=1) # x: [bs x num_channels x forecast_len] + # pooled_embedding: [bs x (d_model * num_patches)] or [bs x d_model)] + pooled_embedding = self.flattens[i](pooled_embedding[:, i, :]) + pooled_embedding = self.dropouts[i](pooled_embedding) + # pooled_embedding: [bs x forecast_len] + # or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + pooled_embedding = self.projections[i](pooled_embedding) + output.append(pooled_embedding) + # output: [bs x num_channels x forecast_len] + output = torch.stack(output, dim=1) else: - z = self.flatten(y) # z: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model)] - z = self.dropout(z) - output = self.projection(z) # output: [bs x num_channels x forecast_len] - # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head + # pooled_embedding: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model)] + pooled_embedding = self.flatten(pooled_embedding) + pooled_embedding = self.dropout(pooled_embedding) + # output: [bs x num_channels x forecast_len] or + # tuple ([bs x num_channels x forecast_len], [bs x 
num_channels x forecast_len]) if using distribution head + output = self.projection(pooled_embedding) if isinstance(output, tuple): # output: ([bs x forecast_len x num_channels], [bs x forecast_len x num_channels]) @@ -1628,8 +1641,7 @@ def forward( if future_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale - ) + y_hat, loc=model_output.loc, scale=model_output.scale) loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) @@ -1711,7 +1723,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.y_range = config.output_range self.use_cls_token = config.use_cls_token - self.pooling = config.pooling + self.pooling_type = config.pooling_type self.distribution_output = distribution_output head_dim = config.num_input_channels * config.d_model @@ -1735,22 +1747,26 @@ def forward(self, embedding: torch.Tensor): """ if self.use_cls_token: - x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] - elif self.pooling == "mean": - x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] - elif self.pooling == "max": - x = embedding.max(dim=2) # x: [bs x num_channels x d_model] + # use the first output token, pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding[:, :, 0, :] + elif self.pooling_type == "mean": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.mean(dim=2) + elif self.pooling_type == "max": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.max(dim=2) else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") + raise Exception(f"pooling operator {self.pooling_type} is not implemented yet") # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (num_channels * d_model) + # pooled_embedding: bs x (num_channels * d_model) + pooled_embedding = self.dropout(self.flatten(pooled_embedding)) # projection - y = self.projection(x) # y: bs x output_dim or a tuple of this shape for distribution head + # output: bs x output_dim or a tuple of this shape for distribution head + output = self.projection(pooled_embedding) # if (self.distribution_output is None) & (self.y_range is not None): # linear head - y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] - - return y + output = torch.sigmoid(output) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] + return output class PatchTSTForRegression(PatchTSTPreTrainedModel): From 3684320e1e960191f907559189211a6ff69689b7 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 9 Nov 2023 08:31:06 -0500 Subject: [PATCH 181/189] fix output_hidden_states as tuple --- src/transformers/models/patchtst/modeling_patchtst.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 645db3e90d21c7..179a8ad0777e55 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -745,9 +745,7 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + output_hidden_states = 
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states # Input embedding patch_input = self.embedder(patch_input) From b818036973d396368369945ae45445fd12817b77 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 10 Nov 2023 10:53:29 -0500 Subject: [PATCH 182/189] fix bug when calling PatchTSTBatchNorm --- .../models/patchtst/modeling_patchtst.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 179a8ad0777e55..4bf068adb10776 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -530,7 +530,7 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of the sublayer 1 self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer1 = PatchTSTBatchNorm(config.d_model) + self.norm_sublayer1 = PatchTSTBatchNorm(config) else: self.norm_sublayer1 = nn.LayerNorm(config.d_model, eps=config.norm_eps) @@ -538,7 +538,7 @@ def __init__(self, config: PatchTSTConfig): if self.channel_attention: self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer2 = PatchTSTBatchNorm(config.d_model) + self.norm_sublayer2 = PatchTSTBatchNorm(config) else: self.norm_sublayer2 = nn.LayerNorm(config.d_model, eps=config.norm_eps) @@ -553,7 +553,7 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of sublayer 3 self.dropout_path3 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer3 = PatchTSTBatchNorm(config.d_model) + self.norm_sublayer3 = PatchTSTBatchNorm(config) else: self.norm_sublayer3 = nn.LayerNorm(config.d_model, eps=config.norm_eps) @@ -1050,9 +1050,9 @@ class PatchTSTStdScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = 1 if config.scaling_dim is None else config.scaling_dim - self.keepdim = True if config.keepdim is None else config.keepdim - self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale + self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 + self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True + self.minimum_scale = config.minimum_scale if hasattr(config, 'minimum_scale') else 1e-10 def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -1084,10 +1084,11 @@ class PatchTSTMeanScaler(nn.Module): """ def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = 1 if config.scaling_dim is None else config.scaling_dim - self.keepdim = True if config.keepdim is None else config.keepdim - self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale - self.default_scale = config.default_scale if config.default_scale else None + self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 + self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True + self.minimum_scale = config.minimum_scale if hasattr(config, 'minimum_scale') else 1e-10 + self.default_scale = config.default_scale if hasattr(config, 'default_scale') else None + def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor @@ -1138,8 +1139,8 @@ class 
PatchTSTNOPScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = 1 if config.scaling_dim is None else config.scaling_dim - self.keepdim = True if config.keepdim is None else config.keepdim + self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 + self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor=None @@ -1211,6 +1212,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, From fb5f49020e536678a83f88c51d5b5509322451ec Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 10 Nov 2023 12:06:04 -0500 Subject: [PATCH 183/189] change stride to patch_stride --- .../models/patchtst/configuration_patchtst.py | 8 ++++---- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++++---- tests/models/patchtst/test_modeling_patchtst.py | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index d0b9963fb49b7e..8a7372fd68c581 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -52,7 +52,7 @@ class PatchTSTConfig(PretrainedConfig): error "mse". patch_length (`int`, *optional*, defaults to 1): Define the patch length of the patchification process. - stride (`int`, *optional*, defaults to 1): + patch_stride (`int`, *optional*, defaults to 1): define the stride of the patchification process. encoder_layers (`int`, *optional*, defaults to 3): Number of encoder layers. 
@@ -163,7 +163,7 @@ def __init__( loss: str = "mse", # PatchTST arguments patch_length: int = 1, - stride: int = 1, + patch_stride: int = 1, # Transformer architecture configuration encoder_layers: int = 3, d_model: int = 64, @@ -239,7 +239,7 @@ def __init__( # PatchTST parameters self.patch_length = patch_length - self.stride = stride + self.patch_stride = patch_stride self.num_patches = self._num_patches() # Mask pretraining @@ -271,4 +271,4 @@ def __init__( super().__init__(**kwargs) def _num_patches(self): - return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + return (max(self.context_length, self.patch_length) - self.patch_length) // self.patch_stride + 1 diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4bf068adb10776..cf4ae6a7d857dd 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -406,7 +406,7 @@ def __init__(self, config: PatchTSTConfig): self.sequence_length = config.context_length self.patch_length = config.patch_length - self.stride = config.stride + self.patch_stride = config.patch_stride if self.sequence_length <= self.patch_length: raise ValueError( @@ -414,8 +414,8 @@ def __init__(self, config: PatchTSTConfig): ) # get the number of patches - num_patches = (max(self.sequence_length, self.patch_length) - self.patch_length) // self.stride + 1 - new_sequence_length = self.patch_length + self.stride * (num_patches - 1) + num_patches = (max(self.sequence_length, self.patch_length) - self.patch_length) // self.patch_stride + 1 + new_sequence_length = self.patch_length + self.patch_stride * (num_patches - 1) self.sequence_start = self.sequence_length - new_sequence_length def forward(self, past_values: torch.Tensor): @@ -436,7 +436,7 @@ def forward(self, past_values: torch.Tensor): output = past_values[:, self.sequence_start :, :] # output: [bs x num_patches x num_input_channels x patch_length] output = output.unfold( - dimension=-2, size=self.patch_length, step=self.stride) + dimension=-2, size=self.patch_length, step=self.patch_stride) # output: [bs x num_input_channels x num_patches x patch_length] output = output.transpose(-2, -3).contiguous() return output diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 4313591da29609..8d6f2202ee81ce 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -56,7 +56,7 @@ def __init__( prediction_length=7, context_length=14, patch_length=5, - stride=5, + patch_stride=5, num_input_channels=1, num_time_features=1, is_training=True, @@ -78,7 +78,7 @@ def __init__( self.prediction_length = prediction_length self.context_length = context_length self.patch_length = patch_length - self.stride = stride + self.patch_stride = patch_stride self.num_input_channels = num_input_channels self.num_time_features = num_time_features self.lags_sequence = lags_sequence @@ -95,13 +95,13 @@ def __init__( self.num_targets = num_targets self.num_output_channels = num_output_channels self.distil = distil - self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.patch_stride + 1 def get_config(self): return PatchTSTConfig( prediction_length=self.prediction_length, patch_length=self.patch_length, - 
stride=self.stride, + patch_stride=self.patch_stride, num_input_channels=self.num_input_channels, d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, @@ -321,7 +321,7 @@ def test_pretrain_head(self): output = model(past_values=batch["past_values"].to(torch_device)).prediction_output num_patch = ( max(model.config.context_length, model.config.patch_length) - model.config.patch_length - ) // model.config.stride + 1 + ) // model.config.patch_stride + 1 expected_shape = torch.Size([64, model.config.num_input_channels, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) From f45baef4e2cda00e3c3ed0d5ccd13e0e83438f60 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 10 Nov 2023 13:24:11 -0500 Subject: [PATCH 184/189] create PatchTSTPositionalEncoding class and restructure the PatchTSTEncoder --- .../models/patchtst/configuration_patchtst.py | 6 +- .../models/patchtst/modeling_patchtst.py | 91 ++++++++++--------- 2 files changed, 53 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 8a7372fd68c581..65711f2c599437 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -87,7 +87,7 @@ class PatchTSTConfig(PretrainedConfig): pre_norm (`bool`, *optional*, defaults to `True`): Normalization is applied before self-attention if pre_norm is set to `True`. Otherwise, normalization is applied after residual block. - positional_encoding (`str`, *optional*, defaults to `"sincos"`): + positional_encoding_type (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. learn_pe (`bool`, *optional*, defaults to `False`): Whether the positional encoding is updated during training. 
@@ -181,7 +181,7 @@ def __init__( bias: bool = True, activation_function: str = "gelu", pre_norm: bool = True, - positional_encoding: str = "sincos", + positional_encoding_type: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, init_std: float = 0.02, @@ -231,7 +231,7 @@ def __init__( self.bias = bias self.activation_function = activation_function self.pre_norm = pre_norm - self.positional_encoding = positional_encoding + self.positional_encoding_type = positional_encoding_type self.learn_pe = learn_pe self.use_cls_token = use_cls_token self.init_std = init_std diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index cf4ae6a7d857dd..78d59ca20602f4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -223,23 +223,23 @@ def forward(self, inputs: torch.Tensor): return output.transpose(1, 2) -def positional_encoding(position_embedding_type, learned, q_len, d_model): +def positional_encoding(positional_encoding_type, learned, q_len, d_model): # Positional encoding - if position_embedding_type is None: - # position_embedding_type = None and learned = False can be used to measure impact of positional encoding + if positional_encoding_type is None: + # positional_encoding_type = None and learned = False can be used to measure impact of positional encoding position_enc = torch.empty((q_len, d_model)) nn.init.uniform_(position_enc, -0.02, 0.02) learned = False - elif position_embedding_type == "zeros": + elif positional_encoding_type == "zeros": position_enc = torch.empty((q_len, d_model)) nn.init.uniform_(position_enc, -0.02, 0.02) - elif position_embedding_type == "normal": + elif positional_encoding_type == "normal": position_enc = torch.zeros((q_len, 1)) nn.init.normal_(position_enc, mean=0.0, std=0.1) - elif position_embedding_type == "uniform": + elif positional_encoding_type == "uniform": position_enc = torch.zeros((q_len, 1)) nn.init.uniform_(position_enc, a=0.0, b=0.1) - elif position_embedding_type == "sincos": + elif positional_encoding_type == "sincos": position_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) @@ -249,7 +249,7 @@ def positional_encoding(position_embedding_type, learned, q_len, d_model): position_enc = position_enc / (position_enc.std() * 10) else: raise ValueError( - f"{position_embedding_type} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." + f"{positional_encoding_type} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." 
) return nn.Parameter(position_enc, requires_grad=learned) @@ -691,6 +691,41 @@ def forward(self, patch_input: torch.Tensor): return embeddings +class PatchTSTPositionalEncoding(nn.Module): + """ + Class for positional encoding + """ + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.use_cls_token = config.use_cls_token + if config.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) + num_patches = config.num_patches + 1 + else: + num_patches = config.num_patches + # postional encoding + self.position_enc = positional_encoding( + config.positional_encoding_type, config.learn_pe, num_patches, config.d_model) + # Positional dropout + self.positional_dropout = ( + nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()) + + def forward(self, patch_input: torch.Tensor): + if self.use_cls_token: + # patch_input: [bs x num_channels x num_patches x d_model] + patch_input = self.positional_dropout(patch_input + self.position_enc[1:, :]) + # append cls token where cls_token: [1 x 1 x 1 x d_model] + cls_token = self.cls_token + self.position_enc[:1, :] + # get the same copy of cls_token for all the samples in batch + cls_tokens = cls_token.expand(patch_input.shape[0], -1, -1) + # hidden_state: [bs x num_channels x (num_patches+1) x d_model] + hidden_state = torch.cat((cls_tokens, patch_input), dim=1) + else: + # hidden_state: [bs x num_channels x num_patches x d_model] + hidden_state = self.positional_dropout(patch_input + self.position_enc) + return hidden_state + + class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder @@ -707,21 +742,8 @@ def __init__(self, config: PatchTSTConfig): # Input embedding: projection of feature vectors onto a d-dim vector space self.embedder = PatchTSTEmbedding(config) - # Positional encoding - if config.use_cls_token: - self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.position_enc = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model - ) - else: - self.position_enc = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches, config.d_model) - - # Positional dropout - self.positional_dropout = ( - nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()) - + self.positional_encoder = PatchTSTPositionalEncoding(config) # Encoder self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) @@ -749,35 +771,22 @@ def forward( # Input embedding patch_input = self.embedder(patch_input) - - if self.use_cls_token: - # x: [bs x num_channels x num_patches x d_model] - patch_input = self.positional_dropout(patch_input + self.position_enc[1:, :]) - # append cls token - cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] - cls_tokens = cls_token.expand(patch_input.shape[0], -1, -1) # get the same copy for all the batch samples - # x: [bs x num_channels x (num_patches+1) x d_model] - hidden_state = torch.cat((cls_tokens, patch_input), dim=1) - else: - # x: [bs x num_channels x num_patches x d_model] - hidden_state = self.positional_dropout(patch_input + self.position_enc) + # Positional encoding + hidden_state = self.positional_encoder(patch_input) encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None - for encoder_layer in self.layers: if output_hidden_states: encoder_states = encoder_states + (hidden_state,) - 
layer_outputs = encoder_layer(hidden_state=hidden_state, - output_attentions=output_attentions) - # get hidden state - hidden_state = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] + layer_outputs = encoder_layer(hidden_state=hidden_state, output_attentions=output_attentions) + # get hidden state. hidden_state shape is [bs x num_channels x num_patches x d_model] # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token - # append layer attention + hidden_state = layer_outputs[0] + # append attention matrix at each layer if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) - # return past_values, hidden_states return BaseModelOutput(last_hidden_state=hidden_state, hidden_states=encoder_states, attentions=all_attentions) From 32f11dc63375bf22a280d630d3c1726fef20a2fc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 10 Nov 2023 19:29:36 +0100 Subject: [PATCH 185/189] formatting --- .../models/patchtst/modeling_patchtst.py | 76 ++++++++++--------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 78d59ca20602f4..2a09e5a4dfadc4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -296,12 +296,10 @@ def random_masking( noise = noise.repeat(1, num_channels, 1) # bs x num_channels x time else: # noise in [0, 1], bs x num_channels x L - noise = torch.rand( - batch_size, num_channels, sequence_length, device=device) + noise = torch.rand(batch_size, num_channels, sequence_length, device=device) # mask: [bs x num_channels x num_patch] - mask = torch.ones( - batch_size, num_channels, sequence_length, device=device) + mask = torch.ones(batch_size, num_channels, sequence_length, device=device) mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -435,8 +433,7 @@ def forward(self, past_values: torch.Tensor): # output: [bs x new_sequence_length x num_channels] output = past_values[:, self.sequence_start :, :] # output: [bs x num_patches x num_input_channels x patch_length] - output = output.unfold( - dimension=-2, size=self.patch_length, step=self.patch_stride) + output = output.unfold(dimension=-2, size=self.patch_length, step=self.patch_stride) # output: [bs x num_input_channels x num_patches x patch_length] output = output.transpose(-2, -3).contiguous() return output @@ -577,13 +574,15 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions) + hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions + ) # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path1(attn_output) else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT attn_output, attn_weights, _ = self.self_attn( - hidden_states=hidden_state, output_attentions=output_attentions) + hidden_states=hidden_state, output_attentions=output_attentions + ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] hidden_state = self.norm_sublayer1(hidden_state + self.dropout_path1(attn_output)) @@ -599,13 +598,15 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] if self.pre_norm: ## Norm 
and Multi-Head attention and Add residual connection attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions) + hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions + ) # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path2(attn_output) else: ## Multi-Head attention and Add residual connection and Norm attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=hidden_state, output_attentions=output_attentions) + hidden_states=hidden_state, output_attentions=output_attentions + ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] hidden_state = self.norm_sublayer2(hidden_state + self.dropout_path2(attn_output)) @@ -621,13 +622,11 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection # Add: residual connection with residual dropout - hidden_state = hidden_state + self.dropout_path3( - self.ff(self.norm_sublayer3(hidden_state))) + hidden_state = hidden_state + self.dropout_path3(self.ff(self.norm_sublayer3(hidden_state))) else: ## Position-wise Feed-Forward and Add residual connection and Norm # Add: residual connection with residual dropout - hidden_state = self.norm_sublayer3( - hidden_state + self.dropout_path3(self.ff(hidden_state))) + hidden_state = self.norm_sublayer3(hidden_state + self.dropout_path3(self.ff(hidden_state))) # [bs x num_channels x sequence_length x d_model] hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model) @@ -695,6 +694,7 @@ class PatchTSTPositionalEncoding(nn.Module): """ Class for positional encoding """ + def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token @@ -705,10 +705,12 @@ def __init__(self, config: PatchTSTConfig): num_patches = config.num_patches # postional encoding self.position_enc = positional_encoding( - config.positional_encoding_type, config.learn_pe, num_patches, config.d_model) + config.positional_encoding_type, config.learn_pe, num_patches, config.d_model + ) # Positional dropout self.positional_dropout = ( - nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()) + nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() + ) def forward(self, patch_input: torch.Tensor): if self.use_cls_token: @@ -730,6 +732,7 @@ class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -767,7 +770,9 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) # Input embedding patch_input = self.embedder(patch_input) @@ -1053,18 +1058,19 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ - Standardize features by calculating the mean and 
scaling along the first dimension, and then normalizes it - by subtracting from the mean and dividing by the standard deviation. + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by + subtracting from the mean and dividing by the standard deviation. """ def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 - self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True - self.minimum_scale = config.minimum_scale if hasattr(config, 'minimum_scale') else 1e-10 + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 - def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters: data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): @@ -1091,13 +1097,13 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. """ + def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 - self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True - self.minimum_scale = config.minimum_scale if hasattr(config, 'minimum_scale') else 1e-10 - self.default_scale = config.default_scale if hasattr(config, 'default_scale') else None - + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 + self.default_scale = config.default_scale if hasattr(config, "default_scale") else None def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor @@ -1148,11 +1154,11 @@ class PatchTSTNOPScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 - self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor=None + self, data: torch.Tensor, observed_indicator: torch.Tensor = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters: @@ -1243,8 +1249,7 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: - `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or - `config.return_dict`=False) + `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1650,7 +1655,8 @@ def forward( if future_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale) + y_hat, loc=model_output.loc, scale=model_output.scale + ) loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) From dcfd2015b56af0a1b76eeebfecdeddb9791e8138 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 10 Nov 2023 19:48:29 +0100 Subject: [PATCH 186/189] initialize scalers with configs --- .../models/autoformer/modeling_autoformer.py | 118 +++++++++--------- .../models/informer/modeling_informer.py | 118 +++++++++--------- .../models/patchtst/configuration_patchtst.py | 2 +- .../models/patchtst/modeling_patchtst.py | 6 +- .../modeling_time_series_transformer.py | 112 +++++++++-------- 5 files changed, 181 insertions(+), 175 deletions(-) diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index 92e9df2c7e5b1b..8f26274b44bcdb 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -208,71 +208,70 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: ) -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->Autoformer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer class AutoformerStdScaler(nn.Module): """ - Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it - by subtracting from the mean and dividing by the standard deviation. - - Args: - dim (`int`): - Dimension along which to calculate the mean and standard deviation. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - minimum_scale (`float`, *optional*, defaults to 1e-5): - Default scale that is used for elements that are constantly zero along dimension `dim`. + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by + subtracting from the mean and dividing by the standard deviation. 
""" - def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + def __init__(self, config: AutoformerConfig): super().__init__() - if not dim > 0: - raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 - @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - denominator = weights.sum(self.dim, keepdim=self.keepdim) + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->Autoformer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer class AutoformerMeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - default_scale (`float`, *optional*, defaults to `None`): - Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. - minimum_scale (`float`, *optional*, defaults to 1e-10): - Default minimum possible scale that is used for any item. 
""" - def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 - ): + def __init__(self, config: AutoformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale - self.default_scale = default_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 + self.default_scale = config.default_scale if hasattr(config, "default_scale") else None - @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # shape: (N, [C], T=1) + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -300,26 +299,29 @@ def forward( return scaled_data, torch.zeros_like(scale), scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->Autoformer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer class AutoformerNOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. 
""" - def __init__(self, dim: int, keepdim: bool = False): + def __init__(self, config: AutoformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor + self, data: torch.Tensor, observed_indicator: torch.Tensor = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale @@ -1433,11 +1435,11 @@ def __init__(self, config: AutoformerConfig): super().__init__(config) if config.scaling == "mean" or config.scaling is True: - self.scaler = AutoformerMeanScaler(dim=1, keepdim=True) + self.scaler = AutoformerMeanScaler(config) elif config.scaling == "std": - self.scaler = AutoformerStdScaler(dim=1, keepdim=True) + self.scaler = AutoformerStdScaler(config) else: - self.scaler = AutoformerNOPScaler(dim=1, keepdim=True) + self.scaler = AutoformerNOPScaler(config) if config.num_static_categorical_features > 0: self.embedder = AutoformerFeatureEmbedder( diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index c0a5a205950285..205c8ba22f743e 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -81,71 +81,70 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: ) -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->Informer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer class InformerStdScaler(nn.Module): """ - Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it - by subtracting from the mean and dividing by the standard deviation. - - Args: - dim (`int`): - Dimension along which to calculate the mean and standard deviation. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - minimum_scale (`float`, *optional*, defaults to 1e-5): - Default scale that is used for elements that are constantly zero along dimension `dim`. + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by + subtracting from the mean and dividing by the standard deviation. 
""" - def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + def __init__(self, config: InformerConfig): super().__init__() - if not dim > 0: - raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 - @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - denominator = weights.sum(self.dim, keepdim=self.keepdim) + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->Informer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer class InformerMeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - default_scale (`float`, *optional*, defaults to `None`): - Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. - minimum_scale (`float`, *optional*, defaults to 1e-10): - Default minimum possible scale that is used for any item. 
""" - def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 - ): + def __init__(self, config: InformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale - self.default_scale = default_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 + self.default_scale = config.default_scale if hasattr(config, "default_scale") else None - @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # shape: (N, [C], T=1) + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -173,26 +172,29 @@ def forward( return scaled_data, torch.zeros_like(scale), scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->Informer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer class InformerNOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. 
""" - def __init__(self, dim: int, keepdim: bool = False): + def __init__(self, config: InformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor + self, data: torch.Tensor, observed_indicator: torch.Tensor = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale @@ -1446,11 +1448,11 @@ def __init__(self, config: InformerConfig): super().__init__(config) if config.scaling == "mean" or config.scaling is True: - self.scaler = InformerMeanScaler(dim=1, keepdim=True) + self.scaler = InformerMeanScaler(config) elif config.scaling == "std": - self.scaler = InformerStdScaler(dim=1, keepdim=True) + self.scaler = InformerStdScaler(config) else: - self.scaler = InformerNOPScaler(dim=1, keepdim=True) + self.scaler = InformerNOPScaler(config) if config.num_static_categorical_features > 0: self.embedder = InformerFeatureEmbedder( diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 65711f2c599437..4ced00c3604600 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -68,7 +68,7 @@ class PatchTSTConfig(PretrainedConfig): Dimension of the "intermediate" (often named feed-forward) layer in encoder. norm (`str` , *optional*, defaults to `"BatchNorm"`): Normalization at each Transformer layer. Can be `"BatchNorm"` or `"LayerNorm"`. - norm_eps (`float`, *optional*, defaults to 1e-5): + norm_eps (`float`, *optional*, defaults to 1e-05): A value added to the denominator for numerical stability of normalization. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for the attention probabilities. 
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 2a09e5a4dfadc4..658140fc1c087a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1055,7 +1055,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] return input_tensor.mean(dim=dim) -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->PatchTST,TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by @@ -1091,7 +1091,7 @@ def forward( return (data - loc) / scale, loc, scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->PatchTST +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->PatchTST,TimeSeries->PatchTST class PatchTSTMeanScaler(nn.Module): """ Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data @@ -1146,7 +1146,7 @@ def forward( return scaled_data, torch.zeros_like(scale), scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->PatchTST +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->PatchTST,TimeSeries->PatchTST class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 904c02b4f04308..2c875dd56e1b08 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -83,67 +83,66 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: class TimeSeriesStdScaler(nn.Module): """ - Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it - by subtracting from the mean and dividing by the standard deviation. - - Args: - dim (`int`): - Dimension along which to calculate the mean and standard deviation. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - minimum_scale (`float`, *optional*, defaults to 1e-5): - Default scale that is used for elements that are constantly zero along dimension `dim`. + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by + subtracting from the mean and dividing by the standard deviation. 
""" - def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + def __init__(self, config: TimeSeriesTransformerConfig): super().__init__() - if not dim > 0: - raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 - @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - denominator = weights.sum(self.dim, keepdim=self.keepdim) + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale class TimeSeriesMeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - default_scale (`float`, *optional*, defaults to `None`): - Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. - minimum_scale (`float`, *optional*, defaults to 1e-10): - Default minimum possible scale that is used for any item. 
""" - def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 - ): + def __init__(self, config: TimeSeriesTransformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale - self.default_scale = default_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 + self.default_scale = config.default_scale if hasattr(config, "default_scale") else None - @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # shape: (N, [C], T=1) + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -173,23 +172,26 @@ def forward( class TimeSeriesNOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. 
""" - def __init__(self, dim: int, keepdim: bool = False): + def __init__(self, config: TimeSeriesTransformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor + self, data: torch.Tensor, observed_indicator: torch.Tensor = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale @@ -1180,11 +1182,11 @@ def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) if config.scaling == "mean" or config.scaling is True: - self.scaler = TimeSeriesMeanScaler(dim=1, keepdim=True) + self.scaler = TimeSeriesMeanScaler(config) elif config.scaling == "std": - self.scaler = TimeSeriesStdScaler(dim=1, keepdim=True) + self.scaler = TimeSeriesStdScaler(config) else: - self.scaler = TimeSeriesNOPScaler(dim=1, keepdim=True) + self.scaler = TimeSeriesNOPScaler(config) if config.num_static_categorical_features > 0: self.embedder = TimeSeriesFeatureEmbedder( From 5ed7a9fb86a677a32472f14c37b3718b30d7425d Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 12 Nov 2023 21:43:54 -0500 Subject: [PATCH 187/189] edit output_hidden_states --- src/transformers/models/patchtst/modeling_patchtst.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 2a09e5a4dfadc4..6c48c686e5c163 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -770,9 +770,7 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states # Input embedding patch_input = self.embedder(patch_input) From 01294fd0e9302e467e8e928fb5ec3aae9d89806b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 13 Nov 2023 11:21:13 +0100 Subject: [PATCH 188/189] style --- src/transformers/models/patchtst/modeling_patchtst.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 640a7786c9411f..658140fc1c087a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -770,7 +770,9 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states 
= ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) # Input embedding patch_input = self.embedder(patch_input) From 9bf4074b7d93e544b8daff316e04ad5004cbbb4e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 13 Nov 2023 12:23:55 +0100 Subject: [PATCH 189/189] fix forecast_mask_patches doc string --- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 658140fc1c087a..30522a048f024d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -331,10 +331,10 @@ def forecast_masking( inputs (`torch.Tensor`): Input of shape `(bs, num_channels, num_patch, patch_len)` or `(bs, tsg1, tag2, num_channels, num_patch, patch_len)` - forecast_mask_patches (`list`): [2, 4] - List of patch lengths to mask in the end of the data. - forecast_mask_ratios (`list`, *optional*): [0.7, 0.3] - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and + forecast_mask_patches (`list`): + List of patch lengths to mask at the end of the data e.g. [2, 4]. + forecast_mask_ratios (`list`, *optional*): + List of weights to use for each patch length. For example if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], then equal weights to both patch lengths. unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked.
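To make the corrected docstring concrete, here is a toy sketch of the behaviour it describes: each sample draws one of the listed trailing patch lengths with probability proportional to `forecast_mask_ratios`, and that many patches at the end of the series are masked. This illustrates the documented semantics only and is not the library's `forecast_masking` implementation, which may partition the batch differently.

import torch

def toy_forecast_mask(inputs, forecast_mask_patches=(2, 4), forecast_mask_ratios=(1, 1)):
    # inputs: (bs, num_channels, num_patch, patch_length)
    bs, num_channels, num_patch, patch_length = inputs.shape
    weights = torch.tensor(forecast_mask_ratios, dtype=torch.float)
    # pick one of the patch lengths per sample, weighted by the ratios
    choice = torch.multinomial(weights, bs, replacement=True)
    mask = torch.zeros(bs, num_channels, num_patch, dtype=torch.bool)
    for i, idx in enumerate(choice):
        num_masked = forecast_mask_patches[int(idx)]
        mask[i, :, num_patch - num_masked:] = True  # mask the last `num_masked` patches
    return inputs.masked_fill(mask.unsqueeze(-1), 0.0), mask

inputs = torch.randn(3, 2, 8, 16)  # (bs, num_channels, num_patch, patch_length)
masked, mask = toy_forecast_mask(inputs)
print(mask[:, 0].sum(dim=-1))      # each sample has its last 2 or last 4 patches masked

With `forecast_mask_patches=[5, 4]` and `forecast_mask_ratios=[1, 1]`, as in the docstring example, each sample is equally likely to have its last five or its last four patches masked.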