From c7d3fc3490841bee84609121e8f58f7946da763e Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 16 Aug 2023 16:49:13 -0400 Subject: [PATCH 001/189] Initial commit of PatchTST model classes Co-authored-by: Phanwadee Sinthong Co-authored-by: Nam Nguyen Co-authored-by: Vijay Ekambaram Co-authored-by: Ngoc Diep Do <55230119+diepi@users.noreply.github.com> Co-authored-by: Wesley Gifford <79663411+wgifford@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/patchtst.md | 50 ++ src/transformers/__init__.py | 14 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 11 + src/transformers/models/auto/modeling_auto.py | 4 + src/transformers/models/patchtst/__init__.py | 60 ++ .../models/patchtst/configuration_patchtst.py | 241 +++++++ .../models/patchtst/modeling_patchtst.py | 596 ++++++++++++++++++ tests/models/patchtst/__init__.py | 0 .../models/patchtst/test_modeling_patchtst.py | 512 +++++++++++++++ 11 files changed, 1491 insertions(+) create mode 100644 docs/source/en/model_doc/patchtst.md create mode 100644 src/transformers/models/patchtst/__init__.py create mode 100644 src/transformers/models/patchtst/configuration_patchtst.py create mode 100755 src/transformers/models/patchtst/modeling_patchtst.py create mode 100644 tests/models/patchtst/__init__.py create mode 100644 tests/models/patchtst/test_modeling_patchtst.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index fd55a47cd80543..823f49e6c92609 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -712,6 +712,8 @@ title: Autoformer - local: model_doc/informer title: Informer + - local: model_doc/patchtst + title: PatchTST - local: model_doc/time_series_transformer title: Time Series Transformer title: Time series models diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md new file mode 100644 index 00000000000000..14523d65c70f3d --- /dev/null +++ b/docs/source/en/model_doc/patchtst.md @@ -0,0 +1,50 @@ + + +# PatchTST + +## Overview + +The PatchTST model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
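+
+A minimal, illustrative sketch of the intended usage, based on the classes introduced in this commit. At this stage `PatchTSTModel.forward` consumes an already-patched tensor of shape `[batch_size, num_channels, num_patches, patch_length]` and returns the encoder hidden states; the shapes below assume the default configuration, and the snippet should be read as a sketch of the eventual API rather than a guaranteed-working example.
+
+```python
+import torch
+
+from transformers import PatchTSTConfig, PatchTSTModel
+
+# Default configuration: context_length=32, patch_length=8, stride=8, d_model=128
+configuration = PatchTSTConfig()
+model = PatchTSTModel(configuration)
+
+# Number of patches implied by the configuration
+num_patches = (
+    max(configuration.context_length, configuration.patch_length) - configuration.patch_length
+) // configuration.stride + 1
+
+# Dummy pre-patched input: [batch_size, num_channels, num_patches, patch_length]
+patched_inputs = torch.randn(2, 1, num_patches, configuration.patch_length)
+outputs = model(patched_inputs)
+
+# Encoder output: [batch_size, num_channels, num_patches, d_model]
+last_hidden_state = outputs.last_hidden_state
+```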
+ + +## PatchTSTConfig + +[[autodoc]] PatchTSTConfig + + +## PatchTSTModel + +[[autodoc]] PatchTSTModel + - forward + + +## PatchTSTForPrediction + +[[autodoc]] PatchTSTForPrediction + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b978757d1fe12f..051d4ef647f59c 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -372,6 +372,7 @@ ], "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"], "models.informer": ["INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "InformerConfig"], + "models.patchtst": ["PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP", "PatchTSTConfig"], "models.instructblip": [ "INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "InstructBlipConfig", @@ -1989,6 +1990,13 @@ "InformerPreTrainedModel", ] ) + _import_structure["models.patchtst"].extend( + [ + "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", + "PatchTSTModel", + "PatchTSTPreTrainedModel", + ] + ) _import_structure["models.instructblip"].extend( [ "INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4462,6 +4470,7 @@ ) from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig from .models.informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig + from .models.patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, InstructBlipConfig, @@ -5847,6 +5856,11 @@ InformerModel, InformerPreTrainedModel, ) + from .models.patchtst import ( + PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, + PatchTSTModel, + PatchTSTPreTrainedModel, + ) from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST, InstructBlipForConditionalGeneration, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 3241a412572deb..3b958ac5c1df40 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -105,6 +105,7 @@ idefics, imagegpt, informer, + patchtst, instructblip, jukebox, layoutlm, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 0a3effd7955e1b..ac524d6882ad82 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -113,6 +113,8 @@ ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), + ("patchtst", "PatchTSTConfig"), + ("patchtst", "PatchTSTConfig"), ("instructblip", "InstructBlipConfig"), ("jukebox", "JukeboxConfig"), ("layoutlm", "LayoutLMConfig"), @@ -198,6 +200,8 @@ ("table-transformer", "TableTransformerConfig"), ("tapas", "TapasConfig"), ("time_series_transformer", "TimeSeriesTransformerConfig"), + ("patchtst", "PatchTSTConfig"), + ("patchtst", "PatchTSTConfig"), ("timesformer", "TimesformerConfig"), ("timm_backbone", "TimmBackboneConfig"), ("trajectory_transformer", "TrajectoryTransformerConfig"), @@ -319,6 +323,8 @@ ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("instructblip", "INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("jukebox", "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -397,6 +403,8 @@ ("table-transformer", "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), 
("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("tvlt", "TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -527,6 +535,7 @@ ("idefics", "IDEFICS"), ("imagegpt", "ImageGPT"), ("informer", "Informer"), + ("patchtst", "PatchTST"), ("instructblip", "InstructBLIP"), ("jukebox", "Jukebox"), ("layoutlm", "LayoutLM"), @@ -623,6 +632,8 @@ ("tapas", "TAPAS"), ("tapex", "TAPEX"), ("time_series_transformer", "Time Series Transformer"), + ("patchtst", "patchtst"), + ("patchtst", "PatchTST"), ("timesformer", "TimeSformer"), ("timm_backbone", "TimmBackbone"), ("trajectory_transformer", "Trajectory Transformer"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2ab504e2f23fdc..75cf77cc73a809 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -110,6 +110,8 @@ ("idefics", "IdeficsModel"), ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), + ("patchtst", "PatchTSTModel"), + ("patchtst", "PatchTSTModel"), ("jukebox", "JukeboxModel"), ("layoutlm", "LayoutLMModel"), ("layoutlmv2", "LayoutLMv2Model"), @@ -187,6 +189,8 @@ ("table-transformer", "TableTransformerModel"), ("tapas", "TapasModel"), ("time_series_transformer", "TimeSeriesTransformerModel"), + ("patchtst", "PatchTSTModel"), + ("patchtst", "PatchTSTModel"), ("timesformer", "TimesformerModel"), ("timm_backbone", "TimmBackbone"), ("trajectory_transformer", "TrajectoryTransformerModel"), diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py new file mode 100644 index 00000000000000..e633177a381952 --- /dev/null +++ b/src/transformers/models/patchtst/__init__.py @@ -0,0 +1,60 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_patchtst": [ + "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP", + "PatchTSTConfig", + ], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_patchtst"] = [ + "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", + "PatchTSTForPrediction", + "PatchTSTModel", + "PatchTSTPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_patchtst import ( + PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, + PatchTSTForPrediction, + PatchTSTModel, + PatchTSTPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py new file mode 100644 index 00000000000000..29759fb4bfcc6a --- /dev/null +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -0,0 +1,241 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PatchTST model configuration""" + +from typing import List, Optional, Union + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "ibm/patchtst-base": "https://huggingface.co/ibm/patchtst-base/resolve/main/config.json", +} + + +class PatchTSTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`PatchTSTModel`]. It is used to instantiate an + PatchTST model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + prediction_length (`int`): + The prediction length for the decoder. In other words, the prediction horizon of the model. This value is + typically dictated by the dataset and we recommend to set it appropriately. + context_length (`int`, *optional*, defaults to `prediction_length`): + The context length for the encoder. If `None`, the context length will be the same as the + `prediction_length`. + input_size (`int`, *optional*, defaults to 1): + The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of + multivariate targets. 
+ scaling (`string` or `bool`, *optional* defaults to `"mean"`): + Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the + scaler is set to "mean". + num_time_features (`int`, *optional*, defaults to 0): + The number of time features in the input time series. + num_dynamic_real_features (`int`, *optional*, defaults to 0): + The number of dynamic real valued features. + num_static_categorical_features (`int`, *optional*, defaults to 0): + The number of static categorical features. + num_static_real_features (`int`, *optional*, defaults to 0): + The number of static real valued features. + cardinality (`list[int]`, *optional*): + The cardinality (number of different values) for each of the static categorical features. Should be a list + of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if + `num_static_categorical_features` is > 0. + embedding_dimension (`list[int]`, *optional*): + The dimension of the embedding for each of the static categorical features. Should be a list of integers, + having the same length as `num_static_categorical_features`. Cannot be `None` if + `num_static_categorical_features` is > 0. + d_model (`int`, *optional*, defaults to 64): + Dimensionality of the transformer layers. + encoder_layers (`int`, *optional*, defaults to 2): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 2): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 32): + Dimension of the "intermediate" (often named feed-forward) layer in encoder. + decoder_ffn_dim (`int`, *optional*, defaults to 32): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and + `"relu"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the encoder, and decoder. + encoder_layerdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention and fully connected layers for each encoder layer. + decoder_layerdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention and fully connected layers for each decoder layer. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability used between the two layers of the feed-forward networks. + num_parallel_samples (`int`, *optional*, defaults to 100): + The number of samples to generate in parallel for each time step of inference. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal weight initialization distribution. + use_cache (`bool`, *optional*, defaults to `True`): + Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. + attention_type (`str`, *optional*, defaults to "prob"): + Attention used in encoder. 
This can be set to "prob" (PatchTST's ProbAttention) or "full" (vanilla + transformer's canonical self-attention). + sampling_factor (`int`, *optional*, defaults to 5): + ProbSparse sampling factor (only makes affect when `attention_type`="prob"). It is used to control the + reduced query matrix (Q_reduce) input length. + distil (`bool`, *optional*, defaults to `True`): + Whether to use distilling in encoder. + + Example: + + ```python + >>> from transformers import PatchTSTConfig, PatchTSTModel + + >>> # Initializing an PatchTST configuration with 12 time steps for prediction + >>> configuration = PatchTSTConfig(prediction_length=12) + + >>> # Randomly initializing a model (with random weights) from the configuration + >>> model = PatchTSTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "patchtst" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "num_hidden_layers": "encoder_layers", + } + + def __init__( + self, + input_size: int = 1, + context_length: int = 32, + patch_length: int = 8, + stride: int = 8, + encoder_layers: int = 3, + d_model: int = 128, + encoder_attention_heads: int = 16, + shared_embedding: bool = True, + channel_attention: bool = False, + encoder_ffn_dim: int = 256, + norm: str = "BatchNorm", + attention_dropout: float = 0.0, + dropout: float = 0.0, + positional_dropout: float = 0.0, + dropout_path: float = 0.0, + ff_dropout: float = 0.0, + bias: bool = True, + activation_function: str = "gelu", + pre_norm: bool = False, + store_attn: bool = False, + positional_encoding: str = "sincos", + learn_pe: bool = False, + use_cls_token: bool = False, + patch_last: bool = True, + individual: bool = False, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = True, + d_size: str = "4D", + cv_channel_indices: list = None, + mask_value=0, + pooling: str = 'mean', + num_classes: int = 1, + head_dropout: float = 0.0, + proj_dropout: float = 0.0, + qkv_bias: bool = True, + num_dynamic_real_features: int = 0, + num_static_real_features: int = 0, + num_static_categorical_features: int = 0, + num_time_features: int = 0, + is_encoder_decoder: bool = False, + encoder_layerdrop: float = 0.1, + + # PatchTST arguments + attention_type: str = "prob", + sampling_factor: int = 5, + distil: bool = True, + **kwargs, + ): + + # time series specific configuration + self.context_length = context_length + self.input_size = input_size + self.num_time_features = num_time_features + self.num_dynamic_real_features = num_dynamic_real_features + self.num_static_real_features = num_static_real_features + self.num_static_categorical_features = num_static_categorical_features + + # Transformer architecture configuration + self.d_model = d_model + self.encoder_attention_heads = encoder_attention_heads + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.dropout = dropout + self.attention_dropout = attention_dropout + self.encoder_layerdrop = encoder_layerdrop + self.shared_embedding = shared_embedding + self.channel_attention = channel_attention + self.norm = norm + self.positional_dropout = positional_dropout + self.dropout_path = dropout_path + self.ff_dropout = ff_dropout + self.bias = bias + self.activation_function = activation_function + self.pre_norm = pre_norm + self.store_attention = store_attn + self.positional_encoding = positional_encoding + 
self.learn_pe = learn_pe + self.use_cls_token = use_cls_token + self.patch_last = patch_last + self.individual = individual + + # PatchTST + self.patch_length = patch_length + self.stride = stride + self.attention_type = attention_type + self.sampling_factor = sampling_factor + self.distil = distil + + # Masking + self.mask_type = mask_type + self.mask_ratio = mask_ratio + self.mask_patches = mask_patches + self.mask_patch_ratios = mask_patch_ratios + self.channel_consistent_masking = channel_consistent_masking + self.d_size = d_size + self.cv_channel_indices = cv_channel_indices + self.mask_value = mask_value + + # Classification + self.pooling = pooling + self.num_classes = num_classes + self.head_dropout = head_dropout + self.proj_dropout = proj_dropout + self.qkv_bias = qkv_bias + + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py new file mode 100755 index 00000000000000..3e861d8b9ac553 --- /dev/null +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -0,0 +1,596 @@ +# coding=utf-8 +# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch PatchTST model.""" + +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +import math +from ...modeling_utils import PreTrainedModel +from ...utils import add_start_docstrings, logging +from ...modeling_outputs import BaseModelOutputWithNoAttention +from .configuration_patchtst import PatchTSTConfig +from torch.nn.modules.activation import MultiheadAttention + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "PatchTSTConfig" + + +PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "ibm/patchtst-base", + # See all PatchTST models at https://huggingface.co/models?filter=patchtst +] + + +class PatchTSTAttention(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + + self.self_attn = MultiheadAttention(embed_dim=config.d_model, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + bias=config.bias, + add_bias_kv=True, + add_zero_attn=False, + batch_first=True + ) + + def forward(self, src: torch.Tensor) -> torch.Tensor: + """ + src: Tensor [bs x q_len x d_model] + """ + src, _ = self.self_attn(src, src, src, need_weights=False) + return src + + +def get_activation_fn(activation): + if callable(activation): return activation() + elif activation.lower() == "relu": return nn.ReLU() + elif activation.lower() == "gelu": return nn.GELU() + raise ValueError(f'{activation} is not available. 
You can use "relu", "gelu", or a callable') + + +class Transpose(nn.Module): + def __init__(self, *dims, contiguous=False): + super().__init__() + self.dims, self.contiguous = dims, contiguous + + def forward(self, x): + if self.contiguous: return x.transpose(*self.dims).contiguous() + else: return x.transpose(*self.dims) + + +def positional_encoding(pe, learn_pe, q_len, d_model): + # Positional encoding + if pe == None: + w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe + nn.init.uniform_(w_pos, -0.02, 0.02) + learn_pe = False + elif pe == 'zero': + w_pos = torch.empty((q_len, 1)) + nn.init.uniform_(w_pos, -0.02, 0.02) + elif pe == 'zeros': + w_pos = torch.empty((q_len, d_model)) + nn.init.uniform_(w_pos, -0.02, 0.02) + elif pe == 'normal' or pe == 'gauss': + w_pos = torch.zeros((q_len, 1)) + torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) + elif pe == 'uniform': + w_pos = torch.zeros((q_len, 1)) + nn.init.uniform_(w_pos, a=0.0, b=0.1) + elif pe == 'lin1d': w_pos = coord1d_pos_encoding(q_len, exponential=False, normalize=True) + elif pe == 'exp1d': w_pos = coord1d_pos_encoding(q_len, exponential=True, normalize=True) + elif pe == 'lin2d': w_pos = coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True) + elif pe == 'exp2d': w_pos = coord2d_pos_encoding(q_len, d_model, exponential=True, normalize=True) + elif pe == 'sincos': + pos_enc = torch.zeros(q_len, d_model) + position = torch.arange(0, q_len).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) + pos_enc[:, 0::2] = torch.sin(position * div_term) + pos_enc[:, 1::2] = torch.cos(position * div_term) + pos_enc = pos_enc - pos_enc.mean() + pos_enc = pos_enc / (pos_enc.std() * 10) + w_pos = pos_enc + else: raise ValueError(f"{pe} is not a valid pe (positional encoder. 
Available types: 'gauss'=='normal', \ + 'zeros', 'zero', uniform', 'lin1d', 'exp1d', 'lin2d', 'exp2d', 'sincos', None.)") + return nn.Parameter(w_pos, requires_grad=learn_pe) + + +def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps=1e-3, verbose=False): + x = .5 if exponential else 1 + i = 0 + for i in range(100): + cpe = 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - 1 + # pv(f'{i:4.0f} {x:5.3f} {cpe.mean():+6.3f}', verbose) + if abs(cpe.mean()) <= eps: break + elif cpe.mean() > eps: x += .001 + else: x -= .001 + i += 1 + if normalize: + cpe = cpe - cpe.mean() + cpe = cpe / (cpe.std() * 10) + return cpe + + +def coord1d_pos_encoding(q_len, exponential=False, normalize=True): + cpe = (2 * (torch.linspace(0, 1, q_len).reshape(-1, 1)**(.5 if exponential else 1)) - 1) + if normalize: + cpe = cpe - cpe.mean() + cpe = cpe / (cpe.std() * 10) + return cpe + + +class TSTEncoderLayer(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.pre_norm = config.pre_norm + + assert not config.d_model % config.encoder_attention_heads, f"d_model ({config.d_model}) must be divisible by n_heads ({config.encoder_attention_heads})" + + # Multi-Head attention + self.self_attn = PatchTSTAttention(config) + + # Add & Norm of the sublayer 1 + self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + if "batch" in config.norm.lower(): + self.norm_sublayer1 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + else: + self.norm_sublayer1 = nn.LayerNorm(config.d_model) + + # Position-wise Feed-Forward + self.ff = nn.Sequential( + nn.Linear(config.d_model, config.encoder_ffn_dim, bias=config.bias), + get_activation_fn(config.activation_function), + nn.Dropout(config.ff_dropout) if config.ff_dropout > 0 else nn.Identity(), + nn.Linear(config.encoder_ffn_dim, config.d_model, bias=config.bias), + ) + + # Add & Norm of sublayer 2 + self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + if "batch" in config.norm.lower(): + self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + else: + self.norm_sublayer2 = nn.LayerNorm(config.d_model) + + def forward(self, src: torch.Tensor): + """ + src: tensor [bs x seq_len x d_model] + Return: + Tensor [bs x seq_len x d_model] + """ + # First sublayer: mixing across time + if self.pre_norm: + ## Norm and Multi-Head attention and Add residual connection + src = src + self.dropout_path1( + self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout + else: + ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT + src = self.norm_sublayer1(src + self.dropout_path1(self.self_attn(src))) + + # Second sublayer: mixing across hidden dimension + if self.pre_norm: + ## Norm and Position-wise Feed-Forward and Add residual connection + src = src + self.dropout_path2( + self.ff(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + else: + ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT + src = self.norm_sublayer2( + src + self.dropout_path2(self.ff(src))) # Add: residual connection with residual dropout + + return src + + +class TSTEncoder(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + + self.layers = nn.ModuleList( + [ + 
TSTEncoderLayer(config) + for i in range(config.encoder_layers) + ] + ) + + def forward(self, src: torch.Tensor, + output_hidden_states: Optional[bool] = False, + output_attention: Optional[bool] = False + ) -> torch.Tensor: + """ + src: tensor [bs x seq_len x d_model] + Return: + Tensor [bs x seq_len x d_model] + """ + all_hidden_states = [] + for mod in self.layers: + if output_hidden_states: + src = mod(src) + all_hidden_states.append(src) + if output_hidden_states: return src, all_hidden_states + return src + + +class PatchTSTPreTrainedModel(PreTrainedModel): + config_class = PatchTSTConfig + base_model_prefix = "model" + main_input_name = "past_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize weights""" + if self.config.use_cls_token: + torch.nn.init.normal_(self.config.cls_token, std=.02) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (PatchTSTEncoder)): + module.gradient_checkpointing = value + + +class PatchTSTEncoder(PatchTSTPreTrainedModel): + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + # self.n_vars = c_in + self.num_patch = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1 + self.d_model = config.d_model + self.shared_embedding = config.shared_embedding + self.use_cls_token = config.use_cls_token + + # Added params for patching + self.patch_last = config.patch_last + self.mask_ratio = config.mask_ratio + + # Input encoding: projection of feature vectors onto a d-dim vector space + if not self.shared_embedding: + self.W_P = nn.ModuleList() + for _ in range(config.input_size): + self.W_P.append(nn.Linear(config.patch_length, self.d_model)) + else: + self.W_P = nn.Linear(config.patch_length, config.d_model) + + # Positional encoding + if self.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) + self.W_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch + 1, config.d_model) + else: + self.W_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch, config.d_model) + + # Positional dropout + self.dropout = nn.Dropout(config.pos_dropout) if config.pos_dropout > 0 else nn.Identity() + + # Encoder + self.encoder = TSTEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + x: tensor [bs x nvars x num_patch x patch_len] #[bs x num_patch x nvars x patch_len] + return: + tensor [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + """ + + # bs, num_patch, n_vars, patch_len = x.shape + bs, n_vars, num_patch, patch_len = x.shape + # Input encoding + if not self.shared_embedding: + x_out = [] + for i in range(n_vars): + z = self.W_P[i](x[:, i, :, :]) + x_out.append(z) + x = torch.stack(x_out, dim=1) + else: + x = self.W_P(x) # x: [bs x nvars x num_patch x d_model] + + # x: [bs x nvars x num_patch x d_model] -> [bs * nvars x num_patch x d_model] + x = x.view(bs * n_vars, num_patch, self.d_model) # x: [bs * nvars x num_patch x d_model] + + if self.use_cls_token: + # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') + x = self.dropout(x + self.W_pos[1:, :]) # x: [bs * nvars x num_patch x d_model] + # append cls token + cls_token = self.cls_token + self.W_pos[:1, :] # cls_token: [1 x 1 x d_model] + cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples + x = torch.cat((cls_tokens, x), 
dim=1) # x: [bs * nvars x (num_patch+1) x d_model] + else: + # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') + x = self.dropout(x + self.W_pos) # x: [bs * nvars x num_patch x d_model] + + # Encoder + x = self.encoder(x) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token + x = torch.reshape(x, (bs, n_vars, -1, self.d_model)) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + return x + + +PATCHTST_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`PatchTSTConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +PATCHTST_INPUTS_DOCSTRING = r""" + Args: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`): + Past values of the time series, that serve as context in order to predict the future. The sequence size of + this tensor must be larger than the `context_length` of the model, since the model will use the larger size + to construct lag features, i.e. additional values from the past which are added in order to serve as "extra + context". + + The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no + `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest + look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of + the past. + + The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as + `static_categorical_features`, `static_real_features`, `past_time_features` and lags). + + Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`. + + For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of + variates in the time series per time step. + past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`): + Required time features, which the model internally will add to `past_values`. These could be things like + "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These + could also be so-called "age" features, which basically help the model know "at which point in life" a + time-series is. Age features have small values for distant past time steps and increase monotonically the + more we approach the current time step. Holiday features are also a good example of time features. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where + the position encodings are learned from scratch internally as parameters of the model, the Time Series + Transformer requires to provide additional time features. 
The Time Series Transformer only learns + additional embeddings for `static_categorical_features`. + + Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features + must but known at prediction time. + + The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in + `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): + Optional static categorical features for which the model will learn an embedding, which it will add to the + values of the time series. + + Static categorical features are features which have the same value for all time steps (static over time). + + A typical example of a static categorical feature is a time series ID. + static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*): + Optional static real features which the model will add to the values of the time series. + + Static real features are features which have the same value for all time steps (static over time). + + A typical example of a static real feature is promotion information. + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*): + Future values of the time series, that serve as labels for the model. The `future_values` is what the + Transformer needs during training to learn to output, given the `past_values`. + + The sequence length here is equal to `prediction_length`. + + See the demo notebook and code snippets for details. + + Optionally, during training any missing values need to be replaced with zeros and indicated via the + `future_observed_mask`. + + For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of + variates in the time series per time step. + future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`): + Required time features for the prediction window, which the model internally will add to `future_values`. + These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as + Fourier features). These could also be so-called "age" features, which basically help the model know "at + which point in life" a time-series is. Age features have small values for distant past time steps and + increase monotonically the more we approach the current time step. Holiday features are also a good example + of time features. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where + the position encodings are learned from scratch internally as parameters of the model, the Time Series + Transformer requires to provide additional time features. The Time Series Transformer only learns + additional embeddings for `static_categorical_features`. + + Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features + must but known at prediction time. 
+ + The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. + future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*): + Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + This mask is used to filter out missing values for the final loss calculation. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to + make sure the model can only look at previous inputs in order to predict the future. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class PatchTSTEncoder(PatchTSTPreTrainedModel): + """ + PatchTST encoder consisting of *config.encoder_layers* self attention layers with distillation layers. Each + attention layer is an [`PatchTSTEncoderLayer`]. + + Args: + config: PatchTSTConfig + """ + + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + # self.n_vars = c_in + self.num_patch = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1 + self.d_model = config.d_model + self.shared_embedding = config.shared_embedding + self.use_cls_token = config.use_cls_token + + # Added params for patching + self.patch_last = config.patch_last + self.mask_ratio = config.mask_ratio + + # Input encoding: projection of feature vectors onto a d-dim vector space + if not self.shared_embedding: + self.w_p = nn.ModuleList() + for _ in range(config.input_size): + self.w_p.append(nn.Linear(config.patch_length, self.d_model)) + else: + self.w_p = nn.Linear(config.patch_length, config.d_model) + + # Positional encoding + if self.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) + self.w_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch + 1, config.d_model) + else: + self.w_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch, config.d_model) + + # Positional dropout + self.dropout = nn.Dropout(config.pos_dropout) if config.pos_dropout > 0 else nn.Identity() + + # Encoder + self.encoder = TSTEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + x: tensor [bs x nvars x num_patch x patch_len] #[bs x num_patch x nvars x patch_len] + return: + tensor [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + """ + + # bs, num_patch, n_vars, patch_len = x.shape + bs, n_vars, num_patch, patch_len = x.shape + # Input encoding + if not self.shared_embedding: + x_out = [] + for i in range(n_vars): + z = self.w_p[i](x[:, i, :, :]) + x_out.append(z) + x = torch.stack(x_out, dim=1) + else: + x = self.w_p(x) # x: [bs x nvars x num_patch x d_model] + + # x: [bs x nvars x num_patch x d_model] -> [bs * nvars x num_patch x d_model] + x = x.view(bs * n_vars, num_patch, self.d_model) # x: [bs * nvars x num_patch x d_model] + + if self.use_cls_token: + # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') + x = self.dropout(x + self.w_pos[1:, :]) # x: [bs * nvars x num_patch x d_model] + # append cls token + cls_token = 
self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x d_model] + cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples + x = torch.cat((cls_tokens, x), dim=1) # x: [bs * nvars x (num_patch+1) x d_model] + else: + # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') + x = self.dropout(x + self.w_pos) # x: [bs * nvars x num_patch x d_model] + + # Encoder + x = self.encoder( + x) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token + x = torch.reshape(x, (bs, n_vars, -1, + self.d_model)) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + return x + + +@add_start_docstrings( + "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", + PATCHTST_START_DOCSTRING, +) +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST +class PatchTSTModel(PatchTSTPreTrainedModel): + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + self.encoder = PatchTSTEncoder(config) + + def forward(self, x: torch.Tensor): + encoder_output = self.encoder(x) + return BaseModelOutputWithNoAttention( + last_hidden_state=encoder_output, + hidden_states=None + ) + + diff --git a/tests/models/patchtst/__init__.py b/tests/models/patchtst/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py new file mode 100644 index 00000000000000..cf8060a284f232 --- /dev/null +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -0,0 +1,512 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch PatchTST model. 
""" + +import inspect +import tempfile +import unittest + +import numpy as np +from huggingface_hub import hf_hub_download + +from transformers import is_torch_available +from transformers.testing_utils import is_flaky, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +TOLERANCE = 1e-4 + +if is_torch_available(): + import torch + + from transformers import PatchTSTConfig, PatchTSTForPrediction, PatchTSTModel + from transformers.models.patchtst.modeling_patchtst import PatchTSTDecoder, PatchTSTEncoder + + +@require_torch +class PatchTSTModelTester: + def __init__( + self, + parent, + batch_size=13, + prediction_length=7, + context_length=14, + cardinality=19, + embedding_dimension=5, + num_time_features=4, + is_training=True, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + lags_sequence=[1, 2, 3, 4, 5], + sampling_factor=10, + distil=False, + ): + self.parent = parent + self.batch_size = batch_size + self.prediction_length = prediction_length + self.context_length = context_length + self.cardinality = cardinality + self.num_time_features = num_time_features + self.lags_sequence = lags_sequence + self.embedding_dimension = embedding_dimension + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + + self.encoder_seq_length = min( + sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length + ) + self.decoder_seq_length = min( + sampling_factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length + ) + self.sampling_factor = sampling_factor + self.distil = distil + + def get_config(self): + return PatchTSTConfig( + prediction_length=self.prediction_length, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + context_length=self.context_length, + lags_sequence=self.lags_sequence, + num_time_features=self.num_time_features, + num_static_categorical_features=1, + num_static_real_features=1, + cardinality=[self.cardinality], + embedding_dimension=[self.embedding_dimension], + sampling_factor=self.sampling_factor, + distil=self.distil, + ) + + def prepare_patchtst_inputs_dict(self, config): + _past_length = config.context_length + max(config.lags_sequence) + + static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0]) + static_real_features = floats_tensor([self.batch_size, 1]) + + past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features]) + past_values = floats_tensor([self.batch_size, _past_length]) + past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5 + + # decoder inputs + future_time_features = floats_tensor([self.batch_size, 
config.prediction_length, config.num_time_features]) + future_values = floats_tensor([self.batch_size, config.prediction_length]) + + inputs_dict = { + "past_values": past_values, + "static_categorical_features": static_categorical_features, + "static_real_features": static_real_features, + "past_time_features": past_time_features, + "past_observed_mask": past_observed_mask, + "future_time_features": future_time_features, + "future_values": future_values, + } + return inputs_dict + + def prepare_config_and_inputs(self): + config = self.get_config() + inputs_dict = self.prepare_patchtst_inputs_dict(config) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = PatchTSTModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = PatchTSTEncoder.from_pretrained(tmpdirname).to(torch_device) + + transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) + enc_input = transformer_inputs[:, : config.context_length, ...] + dec_input = transformer_inputs[:, config.context_length :, ...] + + encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = PatchTSTDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + inputs_embeds=dec_input, + encoder_hidden_states=encoder_last_hidden_state, + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (PatchTSTModel, PatchTSTForPrediction) if is_torch_available() else () + all_generative_model_classes = (PatchTSTForPrediction,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_missing_keys = False + test_torchscript = False + test_inputs_embeds = False + test_model_common_attributes = False + + def setUp(self): + self.model_tester = PatchTSTModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=PatchTSTConfig, + has_text_modality=False, + prediction_length=self.model_tester.prediction_length, + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, _ = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, 
model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.context_length + if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: + seq_length = seq_length * self.model_tester.chunk_length + else: + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "prediction_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # Ignore since we have no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + def test_model_outputs_equivalence(self): + pass + + def test_determinism(self): + pass + + # # Input is 'static_categorical_features' not 'input_ids' + def test_model_main_input_name(self): + model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) + # The main input is the name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = [ + "past_values", + "past_time_features", + "past_observed_mask", + "static_categorical_features", + "static_real_features", + "future_values", + "future_time_features", + ] + + expected_arg_names.extend( + [ + "future_observed_mask", + "decoder_attention_mask", + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + "encoder_outputs", + "past_key_values", + "output_hidden_states", + "output_attentions", + "use_cache", + "return_dict", + ] + if "future_observed_mask" in arg_names + else [ + "decoder_attention_mask", + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + "encoder_outputs", + "past_key_values", + "output_hidden_states", + "output_attentions", + "use_cache", + "return_dict", + ] + ) + + self.assertListEqual(arg_names[: len(expected_arg_names)], 
expected_arg_names) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + context_length = getattr(self.model_tester, "context_length", seq_len) + prediction_length = getattr(self.model_tester, "prediction_length", seq_len) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, context_length], + ) + out_len = len(outputs) + + correct_outlen = 7 + + if "last_hidden_state" in outputs: + correct_outlen += 1 + + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + if "loss" in outputs: + correct_outlen += 1 + + if "params" in outputs: + correct_outlen += 1 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_seq_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 2, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, context_length], + ) + + @is_flaky() + def test_retain_grad_hidden_states_attentions(self): + super().test_retain_grad_hidden_states_attentions() + + +def prepare_batch(filename="train-batch.pt"): + file = 
hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") + batch = torch.load(file, map_location=torch_device) + return batch + + +@require_torch +@slow +class PatchTSTModelIntegrationTests(unittest.TestCase): + def test_inference_no_head(self): + model = PatchTSTModel.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) + batch = prepare_batch() + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"], + past_time_features=batch["past_time_features"], + past_observed_mask=batch["past_observed_mask"], + static_categorical_features=batch["static_categorical_features"], + future_values=batch["future_values"], + future_time_features=batch["future_time_features"], + ).last_hidden_state + expected_shape = torch.Size((64, model.config.context_length, model.config.d_model)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]], + device=torch_device, + ) + self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_inference_head(self): + model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) + batch = prepare_batch("val-batch.pt") + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"], + past_time_features=batch["past_time_features"], + past_observed_mask=batch["past_observed_mask"], + static_categorical_features=batch["static_categorical_features"], + future_time_features=batch["future_time_features"], + ).encoder_last_hidden_state + + # encoder distils the context length to 1/8th of the original length + expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], device=torch_device + ) + self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_seq_to_seq_generation(self): + model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) + batch = prepare_batch("val-batch.pt") + + torch.manual_seed(0) + with torch.no_grad(): + outputs = model.generate( + static_categorical_features=batch["static_categorical_features"], + past_time_features=batch["past_time_features"], + past_values=batch["past_values"], + future_time_features=batch["future_time_features"], + past_observed_mask=batch["past_observed_mask"], + ) + expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) + self.assertEqual(outputs.sequences.shape, expected_shape) + + expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) + mean_prediction = outputs.sequences.mean(dim=1) + self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) From 97628bab3853e8182ec2ff4a602cb5202d869c59 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 16 Aug 2023 19:06:51 -0400 Subject: [PATCH 002/189] Add PatchTSTForPretraining --- src/transformers/models/patchtst/__init__.py | 4 +- .../models/patchtst/modeling_patchtst.py | 268 +++++++++++++++++- 2 files changed, 263 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 
e633177a381952..73333c3fee067a 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -32,7 +32,7 @@ else: _import_structure["modeling_patchtst"] = [ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", - "PatchTSTForPrediction", + "PatchTSTForPretraining", "PatchTSTModel", "PatchTSTPreTrainedModel", ] @@ -49,7 +49,7 @@ else: from .modeling_patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, - PatchTSTForPrediction, + PatchTSTForPretraining, PatchTSTModel, PatchTSTPreTrainedModel, ) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3e861d8b9ac553..6e72fb057bd845 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -25,7 +25,7 @@ from ...modeling_outputs import BaseModelOutputWithNoAttention from .configuration_patchtst import PatchTSTConfig from torch.nn.modules.activation import MultiheadAttention - +from ...utils import ModelOutput logger = logging.get_logger(__name__) @@ -265,12 +265,12 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if self.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.W_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch + 1, config.d_model) + self.W_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model) else: - self.W_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch, config.d_model) + self.W_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch, config.d_model) # Positional dropout - self.dropout = nn.Dropout(config.pos_dropout) if config.pos_dropout > 0 else nn.Identity() + self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() # Encoder self.encoder = TSTEncoder(config) @@ -521,12 +521,12 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if self.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.w_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch + 1, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model) else: - self.w_pos = positional_encoding(config.pe, config.learn_pe, self.num_patch, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch, config.d_model) # Positional dropout - self.dropout = nn.Dropout(config.pos_dropout) if config.pos_dropout > 0 else nn.Identity() + self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() # Encoder self.encoder = TSTEncoder(config) @@ -594,3 +594,257 @@ def forward(self, x: torch.Tensor): ) +class PretrainHead(nn.Module): + def __init__(self, config): + super().__init__() + self.dropout = nn.Dropout(config.dropout) + self.linear = nn.Linear(config.d_model, config.patch_length) + self.use_cls_token = config.use_cls_token + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + x: tensor [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: tensor [bs x nvars x num_patch x patch_len] + """ + x = self.linear(self.dropout(x)) # [bs x nvars x num_patch x patch_len] + if self.use_cls_token: x = x[:, :, 1:, :] # remove the first cls token + return x + + +def cv_random_masking(xb: 
torch.Tensor, + mask_ratio: float, + cv_channel_indices: list = None, + channel_consistent_masking: bool = True, + d_size="4D", + mask_value=0): + """cv_random_masking: Mask the input considering the control variables. + + Args: + xb (Tensor): Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] + mask_ratio (float): Mask ratio. + cv_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". + mask_value (int, optional): Value to use for masking. Defaults to 0. + + Returns: + Tensor: xb_mask, masked input, same shape as input + Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x tsg2 x c x n] + """ + if d_size == "4D": + bs, nvars, L, D = xb.shape + + len_keep = int(L * (1 - mask_ratio)) + + if d_size == "4D": + if channel_consistent_masking: + noise = torch.rand(bs, 1, L, device=xb.device) # noise in [0, 1], bs x 1 x L + noise = noise.repeat(1, nvars, 1) # bs x nvars x L + else: + noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L + + mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] + mask[:, :, :len_keep] = 0 + + # sort noise for each sample + ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + mask = torch.gather(mask, dim=-1, index=ids_restore) + + if d_size == "4D": + mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + if cv_channel_indices is not None: + mask[:, cv_channel_indices, :, :] = 0 + + xb_mask = xb.masked_fill(mask.bool(), mask_value) + return xb_mask, mask[..., 0] + + +class PatchMasking(nn.Module): + def __init__(self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = True, + d_size: str = "4D", + cv_channel_indices: list = None, + mask_value=0, ): + """PatchMasking: Class to random or forcast masking. + + Args: + mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. + mask_ratio (float, optional): Mask ratio. + mask_patches (list, optional): List of patch lengths to mask in the end of the data. + mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. + if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. + cv_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". + mask_value (int, optional): Value to use for masking. Defaults to 0. 
+ """ + + self.mask_ratio = mask_ratio + self.channel_consistent_masking = channel_consistent_masking + self.d_size = d_size + self.mask_type = mask_type + self.mask_patches = mask_patches + self.mask_patch_ratios = mask_patch_ratios + self.cv_channel_indices = cv_channel_indices + self.mask_value = mask_value + if self.cv_channel_indices is not None: + self.cv_channel_indices.sort() + + super().__init__() + + def forward(self, x: torch.Tensor): + + """ + Input: + x: patched input + 4D: [bs x n_vars x num_patch x patch_len] + + Output: + x_mask: Masked patched input + 4D: [bs x n_vars x num_patch x patch_len] + mask: bool tensor indicating True on masked points + 4D: [bs x n_vars x num_patch] + """ + + if self.mask_type == "random": + x_mask, mask = cv_random_masking(xb=x, + mask_ratio=self.mask_ratio, + cv_channel_indices=self.cv_channel_indices, + channel_consistent_masking=self.channel_consistent_masking, + d_size=self.d_size, + mask_value=self.mask_value) + + else: + raise Exception("Invalid mask type") + + mask = mask.bool() # mask: [bs x n_vars x num_patch] + + return x_mask, mask + + +class Patch(nn.Module): + """ + A class to patchify the time series sequence into different patches + """ + def __init__(self, + seq_len: int, + patch_len: int, + stride: int, + padding: bool = False # TODO: use this to set whether we want to pad zeros to the sequence + ): + super().__init__() + + assert (seq_len > patch_len), f'Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})' + + self.seq_len = seq_len + self.patch_len = patch_len + self.stride = stride + + # get the number of patches + self.num_patch = (max(seq_len, patch_len) - patch_len) // stride + 1 + tgt_len = patch_len + stride * (self.num_patch - 1) + self.s_begin = seq_len - tgt_len + + def forward(self, x: torch.Tensor): + """ + + Args: + x (torch.Tensor, required): Input of shape [bs x ... x seq_len x n_vars] + Returns: + z: output tensor data [bs x ... x n_vars x num_patch x patch_len] + """ + seq_len = x.shape[-2] + assert (seq_len == self.seq_len), f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." + + # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] + z = x.transpose(0, -2)[self.s_begin:] # z: [tgt_len x ... x bs x n_vars] + z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution + z = z.unfold(dimension=-2, size=self.patch_len, step=self.stride) # xb: [bs x ... x num_patch x n_vars x patch_len] + z = z.transpose(-2, -3).contiguous() # xb: [bs x ... x n_vars x num_patch x patch_len] + return z + + +class PatchTSTForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class PatchTSTForPretraining(PatchTSTPreTrainedModel): + # PatchTSTModel + Pretraining Head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.patching = Patch(config.context_length, + patch_len=config.patch_length, + stride=config.stride) + self.masking = PatchMasking(mask_type=config.mask_type, + mask_ratio=config.mask_ratio, + mask_patches=config.mask_patches, + mask_patch_ratios=config.mask_patch_ratios, + channel_consistent_masking=config.channel_consistent_masking, + d_size=config.d_size, + cv_channel_indices=config.cv_channel_indices, + mask_value=config.mask_value) + self.model = PatchTSTModel(config) + self.head = PretrainHead(config) + self.loss = torch.nn.MSELoss(reduction='mean') + + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None + ) -> PatchTSTForPreTrainingOutput: + """ + past_values (x): tensor [bs x n_vars x num_patch x patch_len] + future_values (y): labels + """ + + # x: [bs x n_vars x num_patch x patch_len] for pretrain + + patched_x = self.patching(past_values) + masked_x, masked = self.masking(patched_x) + model_output = self.model(masked_x) # x: [bs x nvars x num_patch x d_model] + # or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] + + loss_val = self.loss(x_hat, patched_x) + return PatchTSTForPreTrainingOutput( + loss=loss_val, + prediction_logits=x_hat, + ) + + + From 10b75170f56041f42403b5ce93886cbd4e8a1034 Mon Sep 17 00:00:00 2001 From: "Wesley M. 
Gifford" Date: Mon, 21 Aug 2023 08:15:15 -0400 Subject: [PATCH 003/189] update to include classification Co-authored-by: Phanwadee Sinthong Co-authored-by: Nam Nguyen Co-authored-by: Vijay Ekambaram Co-authored-by: Ngoc Diep Do <55230119+diepi@users.noreply.github.com> Co-authored-by: Wesley Gifford <79663411+wgifford@users.noreply.github.com> --- .../models/auto/configuration_auto.py | 14 +- src/transformers/models/auto/modeling_auto.py | 10 + .../models/patchtst/modeling_patchtst.py | 350 ++++++++++++------ 3 files changed, 250 insertions(+), 124 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ac524d6882ad82..f3d32bc388b9f5 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -113,8 +113,6 @@ ("idefics", "IdeficsConfig"), ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), - ("patchtst", "PatchTSTConfig"), - ("patchtst", "PatchTSTConfig"), ("instructblip", "InstructBlipConfig"), ("jukebox", "JukeboxConfig"), ("layoutlm", "LayoutLMConfig"), @@ -159,6 +157,7 @@ ("openai-gpt", "OpenAIGPTConfig"), ("opt", "OPTConfig"), ("owlvit", "OwlViTConfig"), + ("patchtst", "PatchTSTConfig"), ("pegasus", "PegasusConfig"), ("pegasus_x", "PegasusXConfig"), ("perceiver", "PerceiverConfig"), @@ -200,8 +199,6 @@ ("table-transformer", "TableTransformerConfig"), ("tapas", "TapasConfig"), ("time_series_transformer", "TimeSeriesTransformerConfig"), - ("patchtst", "PatchTSTConfig"), - ("patchtst", "PatchTSTConfig"), ("timesformer", "TimesformerConfig"), ("timm_backbone", "TimmBackboneConfig"), ("trajectory_transformer", "TrajectoryTransformerConfig"), @@ -323,8 +320,6 @@ ("idefics", "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("instructblip", "INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("jukebox", "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -365,6 +360,7 @@ ("openai-gpt", "OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("opt", "OPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("owlvit", "OWLVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("pegasus", "PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("pegasus_x", "PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("perceiver", "PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -403,8 +399,6 @@ ("table-transformer", "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("tvlt", "TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -535,7 +529,6 @@ ("idefics", "IDEFICS"), ("imagegpt", "ImageGPT"), ("informer", "Informer"), - ("patchtst", "PatchTST"), ("instructblip", "InstructBLIP"), ("jukebox", "Jukebox"), ("layoutlm", "LayoutLM"), @@ -588,6 +581,7 @@ ("openai-gpt", "OpenAI GPT"), ("opt", "OPT"), ("owlvit", "OWL-ViT"), + ("patchtst", "PatchTST"), ("pegasus", "Pegasus"), ("pegasus_x", "PEGASUS-X"), ("perceiver", "Perceiver"), @@ -632,8 +626,6 @@ 
("tapas", "TAPAS"), ("tapex", "TAPEX"), ("time_series_transformer", "Time Series Transformer"), - ("patchtst", "patchtst"), - ("patchtst", "PatchTST"), ("timesformer", "TimeSformer"), ("timm_backbone", "TimmBackbone"), ("trajectory_transformer", "Trajectory Transformer"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 75cf77cc73a809..64ccf4061b28aa 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1111,6 +1111,12 @@ ] ) +MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("PatchTST", "PatchTSTForClassification"), + ] +) + MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES) MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES) @@ -1196,6 +1202,10 @@ MODEL_FOR_TEXT_ENCODING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES) +MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES +) + class AutoModelForMaskGeneration(_BaseAutoModelClass): _model_mapping = MODEL_FOR_MASK_GENERATION_MAPPING diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6e72fb057bd845..f661008f022c02 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -42,14 +42,15 @@ class PatchTSTAttention(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.self_attn = MultiheadAttention(embed_dim=config.d_model, - num_heads=config.encoder_attention_heads, - dropout=config.attention_dropout, - bias=config.bias, - add_bias_kv=True, - add_zero_attn=False, - batch_first=True - ) + self.self_attn = MultiheadAttention( + embed_dim=config.d_model, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + bias=config.bias, + add_bias_kv=True, + add_zero_attn=False, + batch_first=True, + ) def forward(self, src: torch.Tensor) -> torch.Tensor: """ @@ -60,9 +61,12 @@ def forward(self, src: torch.Tensor) -> torch.Tensor: def get_activation_fn(activation): - if callable(activation): return activation() - elif activation.lower() == "relu": return nn.ReLU() - elif activation.lower() == "gelu": return nn.GELU() + if callable(activation): + return activation() + elif activation.lower() == "relu": + return nn.ReLU() + elif activation.lower() == "gelu": + return nn.GELU() raise ValueError(f'{activation} is not available. 
You can use "relu", "gelu", or a callable') @@ -72,33 +76,39 @@ def __init__(self, *dims, contiguous=False): self.dims, self.contiguous = dims, contiguous def forward(self, x): - if self.contiguous: return x.transpose(*self.dims).contiguous() - else: return x.transpose(*self.dims) + if self.contiguous: + return x.transpose(*self.dims).contiguous() + else: + return x.transpose(*self.dims) def positional_encoding(pe, learn_pe, q_len, d_model): # Positional encoding if pe == None: - w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe + w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False - elif pe == 'zero': + elif pe == "zero": w_pos = torch.empty((q_len, 1)) nn.init.uniform_(w_pos, -0.02, 0.02) - elif pe == 'zeros': + elif pe == "zeros": w_pos = torch.empty((q_len, d_model)) nn.init.uniform_(w_pos, -0.02, 0.02) - elif pe == 'normal' or pe == 'gauss': + elif pe == "normal" or pe == "gauss": w_pos = torch.zeros((q_len, 1)) torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) - elif pe == 'uniform': + elif pe == "uniform": w_pos = torch.zeros((q_len, 1)) nn.init.uniform_(w_pos, a=0.0, b=0.1) - elif pe == 'lin1d': w_pos = coord1d_pos_encoding(q_len, exponential=False, normalize=True) - elif pe == 'exp1d': w_pos = coord1d_pos_encoding(q_len, exponential=True, normalize=True) - elif pe == 'lin2d': w_pos = coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True) - elif pe == 'exp2d': w_pos = coord2d_pos_encoding(q_len, d_model, exponential=True, normalize=True) - elif pe == 'sincos': + elif pe == "lin1d": + w_pos = coord1d_pos_encoding(q_len, exponential=False, normalize=True) + elif pe == "exp1d": + w_pos = coord1d_pos_encoding(q_len, exponential=True, normalize=True) + elif pe == "lin2d": + w_pos = coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True) + elif pe == "exp2d": + w_pos = coord2d_pos_encoding(q_len, d_model, exponential=True, normalize=True) + elif pe == "sincos": pos_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) @@ -107,20 +117,29 @@ def positional_encoding(pe, learn_pe, q_len, d_model): pos_enc = pos_enc - pos_enc.mean() pos_enc = pos_enc / (pos_enc.std() * 10) w_pos = pos_enc - else: raise ValueError(f"{pe} is not a valid pe (positional encoder. Available types: 'gauss'=='normal', \ - 'zeros', 'zero', uniform', 'lin1d', 'exp1d', 'lin2d', 'exp2d', 'sincos', None.)") + else: + raise ValueError( + f"{pe} is not a valid pe (positional encoder. 
Available types: 'gauss'=='normal', \ + 'zeros', 'zero', uniform', 'lin1d', 'exp1d', 'lin2d', 'exp2d', 'sincos', None.)" + ) return nn.Parameter(w_pos, requires_grad=learn_pe) def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps=1e-3, verbose=False): - x = .5 if exponential else 1 + x = 0.5 if exponential else 1 i = 0 for i in range(100): - cpe = 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - 1 + cpe = ( + 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) + - 1 + ) # pv(f'{i:4.0f} {x:5.3f} {cpe.mean():+6.3f}', verbose) - if abs(cpe.mean()) <= eps: break - elif cpe.mean() > eps: x += .001 - else: x -= .001 + if abs(cpe.mean()) <= eps: + break + elif cpe.mean() > eps: + x += 0.001 + else: + x -= 0.001 i += 1 if normalize: cpe = cpe - cpe.mean() @@ -129,7 +148,7 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= def coord1d_pos_encoding(q_len, exponential=False, normalize=True): - cpe = (2 * (torch.linspace(0, 1, q_len).reshape(-1, 1)**(.5 if exponential else 1)) - 1) + cpe = 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** (0.5 if exponential else 1)) - 1 if normalize: cpe = cpe - cpe.mean() cpe = cpe / (cpe.std() * 10) @@ -141,7 +160,9 @@ def __init__(self, config: PatchTSTConfig): super().__init__() self.pre_norm = config.pre_norm - assert not config.d_model % config.encoder_attention_heads, f"d_model ({config.d_model}) must be divisible by n_heads ({config.encoder_attention_heads})" + assert ( + not config.d_model % config.encoder_attention_heads + ), f"d_model ({config.d_model}) must be divisible by n_heads ({config.encoder_attention_heads})" # Multi-Head attention self.self_attn = PatchTSTAttention(config) @@ -178,7 +199,8 @@ def forward(self, src: torch.Tensor): if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer1(src)) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1(src + self.dropout_path1(self.self_attn(src))) @@ -187,11 +209,13 @@ def forward(self, src: torch.Tensor): if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path2( - self.ff(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer2(src)) + ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer2( - src + self.dropout_path2(self.ff(src))) # Add: residual connection with residual dropout + src + self.dropout_path2(self.ff(src)) + ) # Add: residual connection with residual dropout return src @@ -200,17 +224,11 @@ class TSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList( - [ - TSTEncoderLayer(config) - for i in range(config.encoder_layers) - ] - ) + self.layers = nn.ModuleList([TSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, src: torch.Tensor, - output_hidden_states: Optional[bool] = False, - output_attention: Optional[bool] = False - ) -> torch.Tensor: + def forward( + self, src: torch.Tensor, 
output_hidden_states: Optional[bool] = False, output_attention: Optional[bool] = False + ) -> torch.Tensor: """ src: tensor [bs x seq_len x d_model] Return: @@ -221,9 +239,10 @@ def forward(self, src: torch.Tensor, if output_hidden_states: src = mod(src) all_hidden_states.append(src) - if output_hidden_states: return src, all_hidden_states + if output_hidden_states: + return src, all_hidden_states return src - + class PatchTSTPreTrainedModel(PreTrainedModel): config_class = PatchTSTConfig @@ -234,12 +253,12 @@ class PatchTSTPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize weights""" if self.config.use_cls_token: - torch.nn.init.normal_(self.config.cls_token, std=.02) + torch.nn.init.normal_(self.config.cls_token, std=0.02) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (PatchTSTEncoder)): module.gradient_checkpointing = value - + class PatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): @@ -265,9 +284,13 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if self.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.W_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model) + self.W_pos = positional_encoding( + config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model + ) else: - self.W_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch, config.d_model) + self.W_pos = positional_encoding( + config.positional_encoding, config.learn_pe, self.num_patch, config.d_model + ) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -313,8 +336,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.dropout(x + self.W_pos) # x: [bs * nvars x num_patch x d_model] # Encoder - x = self.encoder(x) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token - x = torch.reshape(x, (bs, n_vars, -1, self.d_model)) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x = self.encoder( + x + ) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token + x = torch.reshape( + x, (bs, n_vars, -1, self.d_model) + ) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token return x @@ -521,9 +548,13 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if self.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model + ) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, self.num_patch, config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, self.num_patch, config.d_model + ) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -570,9 +601,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Encoder x = self.encoder( - x) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token - x = torch.reshape(x, (bs, n_vars, -1, - self.d_model)) # x: [bs x 
nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x + ) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token + x = torch.reshape( + x, (bs, n_vars, -1, self.d_model) + ) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token return x @@ -588,10 +621,7 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x: torch.Tensor): encoder_output = self.encoder(x) - return BaseModelOutputWithNoAttention( - last_hidden_state=encoder_output, - hidden_states=None - ) + return BaseModelOutputWithNoAttention(last_hidden_state=encoder_output, hidden_states=None) class PretrainHead(nn.Module): @@ -608,16 +638,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: output: tensor [bs x nvars x num_patch x patch_len] """ x = self.linear(self.dropout(x)) # [bs x nvars x num_patch x patch_len] - if self.use_cls_token: x = x[:, :, 1:, :] # remove the first cls token + if self.use_cls_token: + x = x[:, :, 1:, :] # remove the first cls token return x -def cv_random_masking(xb: torch.Tensor, - mask_ratio: float, - cv_channel_indices: list = None, - channel_consistent_masking: bool = True, - d_size="4D", - mask_value=0): +def cv_random_masking( + xb: torch.Tensor, + mask_ratio: float, + cv_channel_indices: list = None, + channel_consistent_masking: bool = True, + d_size="4D", + mask_value=0, +): """cv_random_masking: Mask the input considering the control variables. Args: @@ -662,15 +695,17 @@ def cv_random_masking(xb: torch.Tensor, class PatchMasking(nn.Module): - def __init__(self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = True, - d_size: str = "4D", - cv_channel_indices: list = None, - mask_value=0, ): + def __init__( + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = True, + d_size: str = "4D", + cv_channel_indices: list = None, + mask_value=0, + ): """PatchMasking: Class to random or forcast masking. 
Args: @@ -699,7 +734,6 @@ def __init__(self, super().__init__() def forward(self, x: torch.Tensor): - """ Input: x: patched input @@ -713,12 +747,14 @@ def forward(self, x: torch.Tensor): """ if self.mask_type == "random": - x_mask, mask = cv_random_masking(xb=x, - mask_ratio=self.mask_ratio, - cv_channel_indices=self.cv_channel_indices, - channel_consistent_masking=self.channel_consistent_masking, - d_size=self.d_size, - mask_value=self.mask_value) + x_mask, mask = cv_random_masking( + xb=x, + mask_ratio=self.mask_ratio, + cv_channel_indices=self.cv_channel_indices, + channel_consistent_masking=self.channel_consistent_masking, + d_size=self.d_size, + mask_value=self.mask_value, + ) else: raise Exception("Invalid mask type") @@ -732,15 +768,19 @@ class Patch(nn.Module): """ A class to patchify the time series sequence into different patches """ - def __init__(self, - seq_len: int, - patch_len: int, - stride: int, - padding: bool = False # TODO: use this to set whether we want to pad zeros to the sequence - ): + + def __init__( + self, + seq_len: int, + patch_len: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + ): super().__init__() - assert (seq_len > patch_len), f'Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})' + assert ( + seq_len > patch_len + ), f"Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})" self.seq_len = seq_len self.patch_len = patch_len @@ -760,12 +800,14 @@ def forward(self, x: torch.Tensor): z: output tensor data [bs x ... x n_vars x num_patch x patch_len] """ seq_len = x.shape[-2] - assert (seq_len == self.seq_len), f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." + assert seq_len == self.seq_len, f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] - z = x.transpose(0, -2)[self.s_begin:] # z: [tgt_len x ... x bs x n_vars] - z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution - z = z.unfold(dimension=-2, size=self.patch_len, step=self.stride) # xb: [bs x ... x num_patch x n_vars x patch_len] + z = x.transpose(0, -2)[self.s_begin :] # z: [tgt_len x ... x bs x n_vars] + z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution + z = z.unfold( + dimension=-2, size=self.patch_len, step=self.stride + ) # xb: [bs x ... x num_patch x n_vars x patch_len] z = z.transpose(-2, -3).contiguous() # xb: [bs x ... 
x n_vars x num_patch x patch_len] return z @@ -808,25 +850,24 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, - patch_len=config.patch_length, - stride=config.stride) - self.masking = PatchMasking(mask_type=config.mask_type, - mask_ratio=config.mask_ratio, - mask_patches=config.mask_patches, - mask_patch_ratios=config.mask_patch_ratios, - channel_consistent_masking=config.channel_consistent_masking, - d_size=config.d_size, - cv_channel_indices=config.cv_channel_indices, - mask_value=config.mask_value) + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + self.masking = PatchMasking( + mask_type=config.mask_type, + mask_ratio=config.mask_ratio, + mask_patches=config.mask_patches, + mask_patch_ratios=config.mask_patch_ratios, + channel_consistent_masking=config.channel_consistent_masking, + d_size=config.d_size, + cv_channel_indices=config.cv_channel_indices, + mask_value=config.mask_value, + ) self.model = PatchTSTModel(config) self.head = PretrainHead(config) - self.loss = torch.nn.MSELoss(reduction='mean') + self.loss = torch.nn.MSELoss(reduction="mean") - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None - ) -> PatchTSTForPreTrainingOutput: + def forward( + self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None + ) -> PatchTSTForPreTrainingOutput: """ past_values (x): tensor [bs x n_vars x num_patch x patch_len] future_values (y): labels @@ -838,7 +879,7 @@ def forward(self, masked_x, masked = self.masking(patched_x) model_output = self.model(masked_x) # x: [bs x nvars x num_patch x d_model] # or [bs x nvars x (num_patch+1) x d_model] if use cls_token - x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] + x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] loss_val = self.loss(x_hat, patched_x) return PatchTSTForPreTrainingOutput( @@ -847,4 +888,87 @@ def forward(self, ) +class PatchTSTForClassification(PatchTSTPretrainedModel): + # PatchTST model + classification head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.patching = Patch(config.seq_len, patch_len=config.patch_len, stride=config.stride) + + self.model = PatchTSTModel(config) + self.head = ClassificationHead(config) + self.loss = nn.CrossEntropyLoss() + + def forward(self, x, y=None): + patched_x = self.patching(x) + model_output = self.model(patched_x) + y_hat = self.head(model_output[0]) + + loss_val = None + if y is not None: + loss_val = self.loss(y_hat, y) + return PatchTSTForClassificationOutput( + loss=loss_val, + prediction_logits=y_hat, + ) + + +class ClassificationHead(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.use_cls_token = config.use_cls_token + self.pooling = config.pooling + self.flatten = nn.Flatten(start_dim=1) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + self.linear = nn.Linear(config.n_vars * config.d_model, config.n_classes) + + def forward(self, x): + """ + x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x n_classes] + """ + if self.use_cls_token: + x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model + elif self.pooling == "mean": + x = x.mean(dim=2) # x: [bs x nvars x d_model] + elif self.pooling == 
"max": + x = x.max(dim=2) # x: [bs x nvars x d_model] + else: + raise Exception(f"pooling operator {self.pooling} is not implemented yet") + x = self.flatten(x) # x: bs x nvars * d_model + y = self.linear(self.dropout(x)) # y: bs x n_classes + return y + + +class PatchTSTForClassificationOutput(ModelOutput): + """ + Output type of [`PatchTSTForClassification`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None From 1935eef97d52eb98caadd90828124e41126d8001 Mon Sep 17 00:00:00 2001 From: "Wesley M. 
Gifford" Date: Mon, 21 Aug 2023 09:23:39 -0400 Subject: [PATCH 004/189] clean up auto files --- src/transformers/models/auto/modeling_auto.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 64ccf4061b28aa..c3f1929190e3e5 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -111,7 +111,6 @@ ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), ("patchtst", "PatchTSTModel"), - ("patchtst", "PatchTSTModel"), ("jukebox", "JukeboxModel"), ("layoutlm", "LayoutLMModel"), ("layoutlmv2", "LayoutLMv2Model"), @@ -189,8 +188,6 @@ ("table-transformer", "TableTransformerModel"), ("tapas", "TapasModel"), ("time_series_transformer", "TimeSeriesTransformerModel"), - ("patchtst", "PatchTSTModel"), - ("patchtst", "PatchTSTModel"), ("timesformer", "TimesformerModel"), ("timm_backbone", "TimmBackbone"), ("trajectory_transformer", "TrajectoryTransformerModel"), From c6195cb3d839a11e2854a110dc5d0530ae215833 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 22 Aug 2023 10:04:07 -0400 Subject: [PATCH 005/189] Add PatchTSTForPrediction --- .../models/patchtst/configuration_patchtst.py | 10 +- .../models/patchtst/modeling_patchtst.py | 148 ++++++++++++++++-- 2 files changed, 140 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 29759fb4bfcc6a..7f6fc611d0d282 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -173,6 +173,7 @@ def __init__( num_time_features: int = 0, is_encoder_decoder: bool = False, encoder_layerdrop: float = 0.1, + prediction_length: int = 24, # PatchTST arguments attention_type: str = "prob", @@ -183,7 +184,7 @@ def __init__( # time series specific configuration self.context_length = context_length - self.input_size = input_size + self.input_size = input_size # n_vars self.num_time_features = num_time_features self.num_dynamic_real_features = num_dynamic_real_features self.num_static_real_features = num_static_real_features @@ -216,6 +217,7 @@ def __init__( # PatchTST self.patch_length = patch_length self.stride = stride + self.num_patch = self._num_patches() self.attention_type = attention_type self.sampling_factor = sampling_factor self.distil = distil @@ -237,5 +239,11 @@ def __init__( self.proj_dropout = proj_dropout self.qkv_bias = qkv_bias + # Forcasting + self.prediction_length = prediction_length + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + def _num_patches(self): + return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f661008f022c02..26899814cd6e49 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -20,12 +20,12 @@ import torch from torch import nn import math -from ...modeling_utils import PreTrainedModel -from ...utils import add_start_docstrings, logging -from ...modeling_outputs import BaseModelOutputWithNoAttention -from .configuration_patchtst import PatchTSTConfig +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, logging +from transformers.modeling_outputs import 
BaseModelOutputWithNoAttention +from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig from torch.nn.modules.activation import MultiheadAttention -from ...utils import ModelOutput +from transformers.utils import ModelOutput logger = logging.get_logger(__name__) @@ -869,13 +869,10 @@ def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None ) -> PatchTSTForPreTrainingOutput: """ - past_values (x): tensor [bs x n_vars x num_patch x patch_len] + past_values (x): tensor [bs x seq_len x n_vars ] future_values (y): labels """ - - # x: [bs x n_vars x num_patch x patch_len] for pretrain - - patched_x = self.patching(past_values) + patched_x = self.patching(past_values) # patched_x: [bs x n_vars x num_patch x patch_len] for pretrain masked_x, masked = self.masking(patched_x) model_output = self.model(masked_x) # x: [bs x nvars x num_patch x d_model] # or [bs x nvars x (num_patch+1) x d_model] if use cls_token @@ -888,25 +885,25 @@ def forward( ) -class PatchTSTForClassification(PatchTSTPretrainedModel): +class PatchTSTForClassification(PatchTSTPreTrainedModel): # PatchTST model + classification head def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.seq_len, patch_len=config.patch_len, stride=config.stride) + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) self.model = PatchTSTModel(config) self.head = ClassificationHead(config) self.loss = nn.CrossEntropyLoss() - def forward(self, x, y=None): - patched_x = self.patching(x) + def forward(self, past_values, future_values=None): + patched_x = self.patching(past_values) model_output = self.model(patched_x) y_hat = self.head(model_output[0]) loss_val = None - if y is not None: - loss_val = self.loss(y_hat, y) + if future_values is not None: + loss_val = self.loss(y_hat, future_values) return PatchTSTForClassificationOutput( loss=loss_val, prediction_logits=y_hat, @@ -920,7 +917,7 @@ def __init__(self, config: PatchTSTConfig): self.pooling = config.pooling self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - self.linear = nn.Linear(config.n_vars * config.d_model, config.n_classes) + self.linear = nn.Linear(config.input_size * config.d_model, config.num_classes) def forward(self, x): """ @@ -972,3 +969,120 @@ class PatchTSTForClassificationOutput(ModelOutput): seq_relationship_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class PredictionHead(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.individual = config.individual + self.n_vars = config.input_size + self.use_cls_token = config.use_cls_token + self.pooling = config.pooling + head_dimension = config.d_model if config.pooling else config.d_model * config.num_patch + + if self.individual: + self.linears = nn.ModuleList() + self.dropouts = nn.ModuleList() + self.flattens = nn.ModuleList() + for i in range(self.n_vars): + self.flattens.append(nn.Flatten(start_dim=2)) + self.linears.append(nn.Linear(head_dimension, config.prediction_length)) + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + ) + else: + self.flatten = nn.Flatten(start_dim=2) + self.linear = nn.Linear(head_dimension, config.prediction_length) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 
else nn.Identity() + + def forward(self, x: torch.Tensor): + """ + x: [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x forecast_len x nvars] + """ + + if self.use_cls_token: + y = x[:, :, 0, :] # y: [bs x nvars x d_model] + else: + if self.pooling == 'mean': + y = x.mean(dim=2) # y: [bs x nvars x d_model] + elif self.pooling == 'max': + y = x.max(dim=2) # y: [bs x nvars x d_model] + else: + y = x # y: [bs x nvars x num_patch x d_model] + + if self.individual: + x_out = [] + for i in range(self.n_vars): + z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patch)] or [bs x d_model)] + z = self.linears[i](z) # z: [bs x forecast_len] + z = self.dropouts[i](z) + x_out.append(z) + x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] + else: + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patch)] or [bs x nvars x d_model)] + z = self.dropout(z) + x = self.linear(z) # x: [bs x nvars x forecast_len] + + x = x.transpose(2, 1) # [bs x forecast_len x nvars] + + return x + + +class PatchTSTForPredictionOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class PatchTSTForPrediction(PatchTSTPreTrainedModel): + # PatchTST model + classification head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + + self.model = PatchTSTModel(config) + self.head = PredictionHead(config) + self.loss = nn.MSELoss(reduction='mean') + + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor]): + patched_x = self.patching(past_values) + model_output = self.model(patched_x) + y_hat = self.head(model_output[0]) + + loss_val = None + if future_values is not None: + loss_val = self.loss(y_hat, future_values) + return PatchTSTForPredictionOutput( + loss=loss_val, + prediction_logits=y_hat, + ) + From 2d4b02cfe5fcaf5b5f70d5ccb1833d454851ac33 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 22 Aug 2023 10:26:15 -0400 Subject: [PATCH 006/189] Fix relative import --- src/transformers/models/patchtst/modeling_patchtst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 26899814cd6e49..2ada2d8aaaf39c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -20,12 +20,12 @@ import torch from torch import nn import math -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, logging -from transformers.modeling_outputs import BaseModelOutputWithNoAttention -from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +from ...modeling_utils import PreTrainedModel +from ...utils import add_start_docstrings, logging +from ...modeling_outputs import BaseModelOutputWithNoAttention +from .configuration_patchtst import PatchTSTConfig from torch.nn.modules.activation import MultiheadAttention -from transformers.utils import ModelOutput +from ...utils import ModelOutput logger = logging.get_logger(__name__) From ee8c8726a3121ba3dfe8fc21cc68f50b9ce48277 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 22 Aug 2023 17:42:45 -0400 Subject: [PATCH 007/189] Replace original PatchTSTEncoder with ChannelAttentionPatchTSTEncoder --- src/transformers/models/auto/modeling_auto.py | 1 - .../models/patchtst/modeling_patchtst.py | 270 ++++++------------ 2 files changed, 86 insertions(+), 185 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index c3f1929190e3e5..b7cf99b0e0e4ae 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -110,7 +110,6 @@ ("idefics", "IdeficsModel"), ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), - ("patchtst", "PatchTSTModel"), ("jukebox", "JukeboxModel"), ("layoutlm", "LayoutLMModel"), ("layoutlmv2", "LayoutLMv2Model"), diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 2ada2d8aaaf39c..b735c2cb57540c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -14,12 
+14,11 @@ # limitations under the License. """ PyTorch PatchTST model.""" -from typing import List, Optional, Tuple, Union - -import numpy as np +from typing import Optional, Tuple import torch from torch import nn import math + from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, logging from ...modeling_outputs import BaseModelOutputWithNoAttention @@ -155,14 +154,30 @@ def coord1d_pos_encoding(q_len, exponential=False, normalize=True): return cpe -class TSTEncoderLayer(nn.Module): +class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.pre_norm = config.pre_norm - assert ( - not config.d_model % config.encoder_attention_heads - ), f"d_model ({config.d_model}) must be divisible by n_heads ({config.encoder_attention_heads})" + self.layers = nn.ModuleList( + [ + ChannelAttentionTSTEncoderLayer(config) + for i in range(config.encoder_layers) + ] + ) + + def forward(self, src: torch.Tensor): + """ + src: tensor [bs x nvars x seq_len x d_model] + Return: + Tensor [bs x nvars x seq_len x d_model] + """ + for mod in self.layers: src = mod(src) + return src + + +class ChannelAttentionTSTEncoderLayer(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() # Multi-Head attention self.self_attn = PatchTSTAttention(config) @@ -174,6 +189,13 @@ def __init__(self, config: PatchTSTConfig): else: self.norm_sublayer1 = nn.LayerNorm(config.d_model) + # Add & Norm of the sublayer 2 + self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + if "batch" in config.norm.lower(): + self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + else: + self.norm_sublayer2 = nn.LayerNorm(config.d_model) + # Position-wise Feed-Forward self.ff = nn.Sequential( nn.Linear(config.d_model, config.encoder_ffn_dim, bias=config.bias), @@ -182,68 +204,58 @@ def __init__(self, config: PatchTSTConfig): nn.Linear(config.encoder_ffn_dim, config.d_model, bias=config.bias), ) - # Add & Norm of sublayer 2 - self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + # Add & Norm of sublayer 3 + self.dropout_path3 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + self.norm_sublayer3 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) else: - self.norm_sublayer2 = nn.LayerNorm(config.d_model) + self.norm_sublayer3 = nn.LayerNorm(config.d_model) + + self.pre_norm = config.pre_norm + self.store_attn = config.store_attention def forward(self, src: torch.Tensor): """ - src: tensor [bs x seq_len x d_model] + src: tensor [bs x nvars x seq_len x d_model] Return: - Tensor [bs x seq_len x d_model] + Tensor [bs x nvars x seq_len x d_model] """ - # First sublayer: mixing across time + bs, n_vars, seq_len, d_model = src.shape + + # First sublayer: attention across time + src = src.view(bs*n_vars, seq_len, d_model) # src: [(bs*nvars) x seq_len x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src)) - ) # Add: residual connection with residual dropout + src = src + self.dropout_path1(self.self_attn(self.norm_sublayer1(src)) ) # Add: residual connection with residual dropout else: ## Multi-Head 
attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer1(src + self.dropout_path1(self.self_attn(src))) + src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src) ) ) # src: [(bs*nvars) x seq_len x d_model] + src = src.reshape(bs, n_vars, seq_len, d_model) # [bs x nvars x seq_len x d_model] - # Second sublayer: mixing across hidden dimension + # second sublayer: attention across variable at any given time + # [bs x nvars x seq_len x d_model] -> [bs x seq_len x nvars x d_model] -> [(bs*seq_len) x nvars x d_model] + src = src.transpose(2, 1).contiguous().view(bs*seq_len, n_vars, d_model) # [(bs*seq_len) x nvars x d_model] + if self.pre_norm: + ## Norm and Multi-Head attention and Add residual connection + src = src + self.dropout_path2(self.self_attn(self.norm_sublayer2(src)) ) # Add: residual connection with residual dropout + else: + ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT + src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src) ) ) # src: [(bs*seq_len) x nvars x d_model] + src = src.reshape(bs, seq_len, n_vars, d_model).transpose(1,2).contiguous() # src: [bs x nvars x seq_len x d_model] + + # Third sublayer: mixing across hidden + src = src.view(bs*n_vars, seq_len, d_model) # src: [(bs*nvars) x seq_len x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection - src = src + self.dropout_path2( - self.ff(self.norm_sublayer2(src)) - ) # Add: residual connection with residual dropout + src = src + self.dropout_path3(self.ff( self.norm_sublayer3(src) )) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer2( - src + self.dropout_path2(self.ff(src)) - ) # Add: residual connection with residual dropout + src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout + src = src.reshape(bs, n_vars, seq_len, d_model) # [bs x nvars x seq_len x d_model] return src -class TSTEncoder(nn.Module): - def __init__(self, config: PatchTSTConfig): - super().__init__() - - self.layers = nn.ModuleList([TSTEncoderLayer(config) for i in range(config.encoder_layers)]) - - def forward( - self, src: torch.Tensor, output_hidden_states: Optional[bool] = False, output_attention: Optional[bool] = False - ) -> torch.Tensor: - """ - src: tensor [bs x seq_len x d_model] - Return: - Tensor [bs x seq_len x d_model] - """ - all_hidden_states = [] - for mod in self.layers: - if output_hidden_states: - src = mod(src) - all_hidden_states.append(src) - if output_hidden_states: - return src, all_hidden_states - return src - - class PatchTSTPreTrainedModel(PreTrainedModel): config_class = PatchTSTConfig base_model_prefix = "model" @@ -256,92 +268,75 @@ def _init_weights(self, module): torch.nn.init.normal_(self.config.cls_token, std=0.02) def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (PatchTSTEncoder)): + if isinstance(module, (ChannelAttentionPatchTSTEncoder)): module.gradient_checkpointing = value -class PatchTSTEncoder(PatchTSTPreTrainedModel): +class ChannelAttentionPatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - # self.n_vars = c_in - self.num_patch = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1 + self.n_vars = 
config.input_size + self.num_patch = config.num_patch + self.patch_length = config.patch_length self.d_model = config.d_model self.shared_embedding = config.shared_embedding self.use_cls_token = config.use_cls_token - # Added params for patching - self.patch_last = config.patch_last - self.mask_ratio = config.mask_ratio - # Input encoding: projection of feature vectors onto a d-dim vector space - if not self.shared_embedding: - self.W_P = nn.ModuleList() - for _ in range(config.input_size): - self.W_P.append(nn.Linear(config.patch_length, self.d_model)) + if not config.shared_embedding: + self.w_p = nn.ModuleList() + for _ in range(self.n_vars): + self.w_p.append(nn.Linear(config.patch_length, config.d_model)) else: - self.W_P = nn.Linear(config.patch_length, config.d_model) + self.w_p = nn.Linear(config.patch_length, config.d_model) # Positional encoding - if self.use_cls_token: - self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.W_pos = positional_encoding( - config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model - ) + if config.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patch + 1, config.d_model) else: - self.W_pos = positional_encoding( - config.positional_encoding, config.learn_pe, self.num_patch, config.d_model - ) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patch, config.d_model) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() # Encoder - self.encoder = TSTEncoder(config) + self.encoder = ChannelAttentionTSTEncoder(config) # Initialize weights and apply final processing self.post_init() def forward(self, x: torch.Tensor) -> torch.Tensor: """ - x: tensor [bs x nvars x num_patch x patch_len] #[bs x num_patch x nvars x patch_len] + x: tensor [bs x nvars x num_patch x patch_len] return: tensor [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token """ - # bs, num_patch, n_vars, patch_len = x.shape bs, n_vars, num_patch, patch_len = x.shape # Input encoding if not self.shared_embedding: x_out = [] for i in range(n_vars): - z = self.W_P[i](x[:, i, :, :]) + z = self.w_p[i](x[:, i, :, :]) x_out.append(z) x = torch.stack(x_out, dim=1) else: - x = self.W_P(x) # x: [bs x nvars x num_patch x d_model] - - # x: [bs x nvars x num_patch x d_model] -> [bs * nvars x num_patch x d_model] - x = x.view(bs * n_vars, num_patch, self.d_model) # x: [bs * nvars x num_patch x d_model] + x = self.w_p(x) # x: [bs x nvars x num_patch x d_model] if self.use_cls_token: - # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') - x = self.dropout(x + self.W_pos[1:, :]) # x: [bs * nvars x num_patch x d_model] + x = self.dropout(x + self.w_pos[1:, :]) # x: [bs x nvars x num_patch x d_model] # append cls token - cls_token = self.cls_token + self.W_pos[:1, :] # cls_token: [1 x 1 x d_model] + cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples - x = torch.cat((cls_tokens, x), dim=1) # x: [bs * nvars x (num_patch+1) x d_model] + x = torch.cat((cls_tokens, x), dim=1) # x: [bs x nvars x (num_patch+1) x d_model] else: - # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') - x = self.dropout(x + self.W_pos) # x: [bs * nvars x num_patch x d_model] + x 
= self.dropout(x + self.w_pos) # x: [bs x nvars x num_patch x d_model] # Encoder x = self.encoder( - x - ) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token - x = torch.reshape( - x, (bs, n_vars, -1, self.d_model) - ) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token return x @@ -516,99 +511,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """ -class PatchTSTEncoder(PatchTSTPreTrainedModel): - """ - PatchTST encoder consisting of *config.encoder_layers* self attention layers with distillation layers. Each - attention layer is an [`PatchTSTEncoderLayer`]. - - Args: - config: PatchTSTConfig - """ - - def __init__(self, config: PatchTSTConfig): - super().__init__(config) - # self.n_vars = c_in - self.num_patch = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1 - self.d_model = config.d_model - self.shared_embedding = config.shared_embedding - self.use_cls_token = config.use_cls_token - - # Added params for patching - self.patch_last = config.patch_last - self.mask_ratio = config.mask_ratio - - # Input encoding: projection of feature vectors onto a d-dim vector space - if not self.shared_embedding: - self.w_p = nn.ModuleList() - for _ in range(config.input_size): - self.w_p.append(nn.Linear(config.patch_length, self.d_model)) - else: - self.w_p = nn.Linear(config.patch_length, config.d_model) - - # Positional encoding - if self.use_cls_token: - self.cls_token = nn.Parameter(torch.zeros(1, 1, config.d_model)) - self.w_pos = positional_encoding( - config.positional_encoding, config.learn_pe, self.num_patch + 1, config.d_model - ) - else: - self.w_pos = positional_encoding( - config.positional_encoding, config.learn_pe, self.num_patch, config.d_model - ) - - # Positional dropout - self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() - - # Encoder - self.encoder = TSTEncoder(config) - - # Initialize weights and apply final processing - self.post_init() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - x: tensor [bs x nvars x num_patch x patch_len] #[bs x num_patch x nvars x patch_len] - return: - tensor [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token - """ - - # bs, num_patch, n_vars, patch_len = x.shape - bs, n_vars, num_patch, patch_len = x.shape - # Input encoding - if not self.shared_embedding: - x_out = [] - for i in range(n_vars): - z = self.w_p[i](x[:, i, :, :]) - x_out.append(z) - x = torch.stack(x_out, dim=1) - else: - x = self.w_p(x) # x: [bs x nvars x num_patch x d_model] - - # x: [bs x nvars x num_patch x d_model] -> [bs * nvars x num_patch x d_model] - x = x.view(bs * n_vars, num_patch, self.d_model) # x: [bs * nvars x num_patch x d_model] - - if self.use_cls_token: - # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') - x = self.dropout(x + self.w_pos[1:, :]) # x: [bs * nvars x num_patch x d_model] - # append cls token - cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x d_model] - cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples - x = torch.cat((cls_tokens, x), dim=1) # x: [bs * nvars x (num_patch+1) x d_model] - else: - # print(f'x and W_pos shapes: {x.shape}, {self.W_pos.shape}') - x = self.dropout(x + self.w_pos) # x: [bs * nvars x num_patch x 
d_model] - - # Encoder - x = self.encoder( - x - ) # x: [bs * nvars x num_patch x d_model] or [bs * nvars x (num_patch+1) x d_model] if use cls_token - x = torch.reshape( - x, (bs, n_vars, -1, self.d_model) - ) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - return x - - @add_start_docstrings( "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", PATCHTST_START_DOCSTRING, @@ -617,7 +519,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.encoder = PatchTSTEncoder(config) + self.encoder = ChannelAttentionPatchTSTEncoder(config) def forward(self, x: torch.Tensor): encoder_output = self.encoder(x) From c657ae8524b794a1613b8ac3beb3e259107cddfe Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 25 Aug 2023 18:42:40 +0700 Subject: [PATCH 008/189] temporary adding absolute path + add PatchTSTForForecasting class --- src/transformers/models/patchtst/__init__.py | 2 +- .../models/patchtst/configuration_patchtst.py | 4 +- .../models/patchtst/modeling_patchtst.py | 155 ++++++++++++++++-- 3 files changed, 141 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 73333c3fee067a..88ed72154b826c 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 7f6fc611d0d282..782a45fede9e8b 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -16,8 +16,8 @@ from typing import List, Optional, Union -from ...configuration_utils import PretrainedConfig -from ...utils import logging +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging logger = logging.get_logger(__name__) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b735c2cb57540c..bafd9c3e85b39b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -19,12 +19,14 @@ from torch import nn import math -from ...modeling_utils import PreTrainedModel -from ...utils import add_start_docstrings, logging -from ...modeling_outputs import BaseModelOutputWithNoAttention -from .configuration_patchtst import PatchTSTConfig +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, logging +from transformers.modeling_outputs import BaseModelOutputWithNoAttention +from transformers.utils import ModelOutput + from torch.nn.modules.activation import MultiheadAttention -from ...utils import ModelOutput + +from .configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) @@ -768,7 +770,8 @@ def __init__(self, config: PatchTSTConfig): self.loss = torch.nn.MSELoss(reduction="mean") def forward( - self, past_values: torch.Tensor, future_values: 
Optional[torch.Tensor] = None + self, past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None ) -> PatchTSTForPreTrainingOutput: """ past_values (x): tensor [bs x seq_len x n_vars ] @@ -902,7 +905,6 @@ def forward(self, x: torch.Tensor): or [bs x nvars x (num_patch+1) x d_model] if use cls_token output: [bs x forecast_len x nvars] """ - if self.use_cls_token: y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: @@ -937,13 +939,11 @@ class PatchTSTForPredictionOutput(ModelOutput): Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). + MSE loss. + + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -958,8 +958,7 @@ class PatchTSTForPredictionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None + prediction_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -985,6 +984,128 @@ def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tenso loss_val = self.loss(y_hat, future_values) return PatchTSTForPredictionOutput( loss=loss_val, - prediction_logits=y_hat, + prediction_outputs=y_hat, ) + +class PatchTSTForForecastingOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Forecasting outputs of the time series modeling heads. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[torch.FloatTensor] = None + forecast_outputs: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class ForecastHead(nn.Module): + def __init__(self, + individual: bool, + n_vars: int, + d_model: int, + num_patch: int, + forecast_len: int, + head_dropout: float = 0., + use_cls_token: bool = False, + pooling: str = None, + ): + super().__init__() + + self.individual = individual + self.n_vars = n_vars + self.use_cls_token = use_cls_token + self.pooling = pooling + head_dim = d_model if pooling else d_model * num_patch + + if self.individual: + self.linears = nn.ModuleList() + self.dropouts = nn.ModuleList() + self.flattens = nn.ModuleList() + for i in range(self.n_vars): + self.flattens.append(nn.Flatten(start_dim=2)) + self.linears.append(nn.Linear(head_dim, forecast_len)) + self.dropouts.append(nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() + ) + else: + self.flatten = nn.Flatten(start_dim=2) + self.linear = nn.Linear(head_dim, forecast_len) + self.dropout = nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() + + def forward(self, x): + """ + x: [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x forecast_len x nvars] + """ + + if self.use_cls_token: + y = x[:, :, 0, :] # y: [bs x nvars x d_model] + else: + if self.pooling == 'mean': + y = x.mean(dim=2) # y: [bs x nvars x d_model] + elif self.pooling == 'max': + y = x.max(dim=2) # y: [bs x nvars x d_model] + else: + y = x # y: [bs x nvars x num_patch x d_model] + + if self.individual: + x_out = [] + for i in range(self.n_vars): + z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patch)] or [bs x d_model)] + z = self.linears[i](z) # z: [bs x forecast_len] + z = self.dropouts[i](z) + x_out.append(z) + x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] + else: + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patch)] or [bs x nvars x d_model)] + z = self.dropout(z) + x = self.linear(z) # x: [bs x nvars x forecast_len] + + x = x.transpose(2, 1) # [bs x forecast_len x nvars] + + return x + + +class PatchTSTForForecasting(PatchTSTPreTrainedModel): + # PatchTST model + classification head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + + self.model = PatchTSTModel(config) + self.head = ForecastHead(config) + self.loss = nn.MSELoss(reduction='mean') + + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor]): + patched_x = self.patching(past_values) + model_output = self.model(patched_x) + y_hat = self.head(model_output[0]) + + loss_val = None + if future_values is not None: + loss_val = self.loss(y_hat, future_values) + return PatchTSTForForecastingOutput( + loss=loss_val, + forecast_outputs=y_hat, + ) From fa72e8a310e216fabec87b9734df80eed32b5b7b Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Fri, 25 Aug 2023 12:39:46 -0400 Subject: [PATCH 009/189] Update base PatchTSTModel + Unittest --- .../models/patchtst/configuration_patchtst.py | 8 +- .../models/patchtst/modeling_patchtst.py | 265 ++++++-- .../models/patchtst/test_modeling_patchtst.py | 594 +++++++++--------- 3 files changed, 497 insertions(+), 370 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 
782a45fede9e8b..dae40eee1ee12c 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -56,10 +56,6 @@ class PatchTSTConfig(PretrainedConfig): The number of static categorical features. num_static_real_features (`int`, *optional*, defaults to 0): The number of static real valued features. - cardinality (`list[int]`, *optional*): - The cardinality (number of different values) for each of the static categorical features. Should be a list - of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if - `num_static_categorical_features` is > 0. embedding_dimension (`list[int]`, *optional*): The dimension of the embedding for each of the static categorical features. Should be a list of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if @@ -154,6 +150,8 @@ def __init__( use_cls_token: bool = False, patch_last: bool = True, individual: bool = False, + seed_number= None, + mask_input: Optional[bool] = None, mask_type: str = "random", mask_ratio=0.5, mask_patches: list = [2, 3], @@ -223,6 +221,8 @@ def __init__( self.distil = distil # Masking + self.seed_number = seed_number + self.mask_input = mask_input self.mask_type = mask_type self.mask_ratio = mask_ratio self.mask_patches = mask_patches diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index bafd9c3e85b39b..1fbd5f204fa7fa 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -18,15 +18,15 @@ import torch from torch import nn import math +import random +import numpy as np from transformers.modeling_utils import PreTrainedModel from transformers.utils import add_start_docstrings, logging from transformers.modeling_outputs import BaseModelOutputWithNoAttention from transformers.utils import ModelOutput - from torch.nn.modules.activation import MultiheadAttention - -from .configuration_patchtst import PatchTSTConfig +from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) @@ -167,14 +167,20 @@ def __init__(self, config: PatchTSTConfig): ] ) - def forward(self, src: torch.Tensor): + def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ src: tensor [bs x nvars x seq_len x d_model] Return: Tensor [bs x nvars x seq_len x d_model] """ - for mod in self.layers: src = mod(src) - return src + all_hidden_states = [] + for mod in self.layers: + if output_hidden_states: + src = mod(src) + all_hidden_states.append(src) + if output_hidden_states: + return src, all_hidden_states + return src, None class ChannelAttentionTSTEncoderLayer(nn.Module): @@ -283,6 +289,7 @@ def __init__(self, config: PatchTSTConfig): self.d_model = config.d_model self.shared_embedding = config.shared_embedding self.use_cls_token = config.use_cls_token + self.gradient_checkpointing = False # Input encoding: projection of feature vectors onto a d-dim vector space if not config.shared_embedding: @@ -308,7 +315,7 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> torch.Tensor: """ x: tensor [bs x nvars x num_patch x patch_len] return: @@ -316,30 +323,38 @@ def forward(self, x: torch.Tensor) 
-> torch.Tensor: or [bs x nvars x (num_patch+1) x d_model] if use cls_token """ # bs, num_patch, n_vars, patch_len = x.shape - bs, n_vars, num_patch, patch_len = x.shape + bs, n_vars, num_patch, patch_len = past_values.shape + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) # Input encoding if not self.shared_embedding: x_out = [] for i in range(n_vars): - z = self.w_p[i](x[:, i, :, :]) + z = self.w_p[i](past_values[:, i, :, :]) x_out.append(z) - x = torch.stack(x_out, dim=1) + past_values = torch.stack(x_out, dim=1) else: - x = self.w_p(x) # x: [bs x nvars x num_patch x d_model] + past_values = self.w_p(past_values) # x: [bs x nvars x num_patch x d_model] if self.use_cls_token: - x = self.dropout(x + self.w_pos[1:, :]) # x: [bs x nvars x num_patch x d_model] + past_values = self.dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patch x d_model] # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] - cls_tokens = cls_token.expand(x.shape[0], -1, -1) # get the same copy for all the batch samples - x = torch.cat((cls_tokens, x), dim=1) # x: [bs x nvars x (num_patch+1) x d_model] + cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples + past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patch+1) x d_model] else: - x = self.dropout(x + self.w_pos) # x: [bs x nvars x num_patch x d_model] + past_values = self.dropout(past_values + self.w_pos) # x: [bs x nvars x num_patch x d_model] # Encoder - x = self.encoder( - x) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - return x + past_values, hidden_states = self.encoder( + past_values, output_hidden_states) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + # return past_values + # return past_values, hidden_states + return BaseModelOutputWithNoAttention( + last_hidden_state=past_values, hidden_states=hidden_states + ) PATCHTST_START_DOCSTRING = r""" @@ -521,11 +536,56 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) + self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + if config.mask_input: + self.masking = PatchMasking( + mask_type=config.mask_type, + mask_ratio=config.mask_ratio, + mask_patches=config.mask_patches, + mask_patch_ratios=config.mask_patch_ratios, + channel_consistent_masking=config.channel_consistent_masking, + d_size=config.d_size, + cv_channel_indices=config.cv_channel_indices, + mask_value=config.mask_value, + seed_number=config.seed_number + ) + else: + self.masking = nn.Identity() self.encoder = ChannelAttentionPatchTSTEncoder(config) - def forward(self, x: torch.Tensor): - encoder_output = self.encoder(x) - return BaseModelOutputWithNoAttention(last_hidden_state=encoder_output, hidden_states=None) + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor]=None, + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patch x patch_len] for pretrain + masked_values = self.masking(patched_values) + encoder_output = 
self.encoder(masked_values, output_hidden_states=output_hidden_states) + return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, + hidden_states=encoder_output.hidden_states, + patched_input=patched_values) + + +class PatchTSTModelOutputWithNoAttention(ModelOutput): + """ + Base class for model's outputs, with potential hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + patched_input + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + patched_input: torch.FloatTensor = None class PretrainHead(nn.Module): @@ -598,6 +658,13 @@ def cv_random_masking( return xb_mask, mask[..., 0] +def set_seed(x=42): + random.seed(x) + np.random.seed(x) + torch.manual_seed(x) + if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + + class PatchMasking(nn.Module): def __init__( self, @@ -609,6 +676,7 @@ def __init__( d_size: str = "4D", cv_channel_indices: list = None, mask_value=0, + seed_number: Optional[int] = None ): """PatchMasking: Class to random or forcast masking. @@ -623,7 +691,8 @@ def __init__( d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". mask_value (int, optional): Value to use for masking. Defaults to 0. """ - + if seed_number: + set_seed(seed_number) self.mask_ratio = mask_ratio self.channel_consistent_masking = channel_consistent_masking self.d_size = d_size @@ -665,7 +734,7 @@ def forward(self, x: torch.Tensor): mask = mask.bool() # mask: [bs x n_vars x num_patch] - return x_mask, mask + return x_mask #, mask class Patch(nn.Module): @@ -699,7 +768,7 @@ def forward(self, x: torch.Tensor): """ Args: - x (torch.Tensor, required): Input of shape [bs x ... x seq_len x n_vars] + x (torch.Tensor, required): Input of shape [bs x seq_len x n_vars] Returns: z: output tensor data [bs x ... 
x n_vars x num_patch x patch_len] """ @@ -754,39 +823,28 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) - self.masking = PatchMasking( - mask_type=config.mask_type, - mask_ratio=config.mask_ratio, - mask_patches=config.mask_patches, - mask_patch_ratios=config.mask_patch_ratios, - channel_consistent_masking=config.channel_consistent_masking, - d_size=config.d_size, - cv_channel_indices=config.cv_channel_indices, - mask_value=config.mask_value, - ) + config.mask_input = True self.model = PatchTSTModel(config) self.head = PretrainHead(config) self.loss = torch.nn.MSELoss(reduction="mean") def forward( - self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None + self, past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None ) -> PatchTSTForPreTrainingOutput: """ past_values (x): tensor [bs x seq_len x n_vars ] future_values (y): labels """ - patched_x = self.patching(past_values) # patched_x: [bs x n_vars x num_patch x patch_len] for pretrain - masked_x, masked = self.masking(patched_x) - model_output = self.model(masked_x) # x: [bs x nvars x num_patch x d_model] - # or [bs x nvars x (num_patch+1) x d_model] if use cls_token + model_output = self.model(past_values) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] - loss_val = self.loss(x_hat, patched_x) + loss_val = self.loss(x_hat, model_output.patched_input) return PatchTSTForPreTrainingOutput( loss=loss_val, prediction_logits=x_hat, + hidden_states=model_output.hidden_states ) @@ -795,15 +853,12 @@ class PatchTSTForClassification(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) - self.model = PatchTSTModel(config) self.head = ClassificationHead(config) self.loss = nn.CrossEntropyLoss() - def forward(self, past_values, future_values=None): - patched_x = self.patching(past_values) - model_output = self.model(patched_x) + def forward(self, past_values, future_values=None, output_hidden_states: Optional[bool] = None): + model_output = self.model(past_values) y_hat = self.head(model_output[0]) loss_val = None @@ -812,6 +867,7 @@ def forward(self, past_values, future_values=None): return PatchTSTForClassificationOutput( loss=loss_val, prediction_logits=y_hat, + hidden_states=model_output.hidden_states ) @@ -968,16 +1024,19 @@ class PatchTSTForPrediction(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) - self.model = PatchTSTModel(config) self.head = PredictionHead(config) self.loss = nn.MSELoss(reduction='mean') - def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor]): - patched_x = self.patching(past_values) - model_output = self.model(patched_x) - y_hat = self.head(model_output[0]) + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 
+ ) + model_output = self.model(past_values, output_hidden_states=output_hidden_states) + y_hat = self.head(model_output.last_hidden_state) loss_val = None if future_values is not None: @@ -985,6 +1044,7 @@ def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tenso return PatchTSTForPredictionOutput( loss=loss_val, prediction_outputs=y_hat, + hidden_states=model_output.hidden_states ) @@ -1097,9 +1157,14 @@ def __init__(self, config: PatchTSTConfig): self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction='mean') - def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor]): - patched_x = self.patching(past_values) - model_output = self.model(patched_x) + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output[0]) loss_val = None @@ -1108,4 +1173,92 @@ def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tenso return PatchTSTForForecastingOutput( loss=loss_val, forecast_outputs=y_hat, + hidden_states=model_output.hidden_states ) + + +if __name__ == "__main__": + + from transformers import Trainer, TrainingArguments + from torch.utils.data import Dataset + from transformers import AutoModel, AutoConfig + import numpy as np + + class AssetDataset(Dataset): + def __init__(self, x, y, seq_len=10, pred_len=10, is_pred=False): + self.seq_len = seq_len + self.x = x + self.y = y + self.is_pred = is_pred + self.pred_len = pred_len + + def __getitem__(self, index): + s_begin = index + s_end = s_begin + self.seq_len + r_begin = s_end - 1 + r_end = s_end + self.pred_len + + seq_x = self.x[s_begin:s_end] + seq_y = np.array(self.y[r_begin]) + if self.is_pred: + seq_y = self.x[s_end:r_end] + + return {'past_values': seq_x, 'future_values': seq_y} + + def __len__(self): + if self.is_pred: + return len(self.x) - self.seq_len - self.pred_len + 1 + return len(self.x) - self.seq_len + 1 + + n_classes = 3 + bs = 200 + n_features = 20 + pred_len = 7 + x = torch.randn(bs, n_features) + y = torch.randint(low=0, high=n_classes, size=(bs, 1))[:, 0] + valid_asset_ds = train_asset_ds = AssetDataset(x, y, seq_len=10, pred_len=pred_len, is_pred=False) + config = PatchTSTConfig( + input_size=n_features, + num_classes=n_classes, + context_length=10, + patch_length=5, + stride=5, + batch_size=50, + standardscale=None, # 'bysample' + context_points=10, + encoder_layers=12, + encoder_attention_heads=8, + d_model=256, + encoder_ffn_dim=1024, + dropout=0.2, + fc_dropout=0, + r=0.4, + prediction_length=pred_len, + ) + # model = PatchTSTForPretraining(config) + # model = PatchTSTForPrediction(config) + model = PatchTSTForClassification(config) + training_args = TrainingArguments( + output_dir='./save_model/', + num_train_epochs=1, + per_device_train_batch_size=5, + per_device_eval_batch_size=5, + report_to=[], + save_strategy='no', + remove_unused_columns=False, + no_cuda=True + ) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_asset_ds, + eval_dataset=valid_asset_ds + ) + trainer.train() + trainer.save_model('./save_model') + # AutoConfig.register("patchtst", PatchTSTConfig) + AutoModel.register(PatchTSTConfig, PatchTSTForClassification) + config = AutoConfig.from_pretrained('./save_model') + model = 
AutoModel.from_pretrained('./save_model', config=config) + print(model) + diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index cf8060a284f232..34caf2d0253442 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -25,7 +25,7 @@ from transformers.testing_utils import is_flaky, require_torch, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, _config_zero_init from ...test_pipeline_mixin import PipelineTesterMixin @@ -34,8 +34,8 @@ if is_torch_available(): import torch - from transformers import PatchTSTConfig, PatchTSTForPrediction, PatchTSTModel - from transformers.models.patchtst.modeling_patchtst import PatchTSTDecoder, PatchTSTEncoder + from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig + from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTModel, ChannelAttentionPatchTSTEncoder @require_torch @@ -46,9 +46,10 @@ def __init__( batch_size=13, prediction_length=7, context_length=14, - cardinality=19, - embedding_dimension=5, - num_time_features=4, + patch_length=5, + stride=5, + input_size=1, + num_time_features=1, is_training=True, hidden_size=16, num_hidden_layers=2, @@ -60,15 +61,17 @@ def __init__( lags_sequence=[1, 2, 3, 4, 5], sampling_factor=10, distil=False, + seed_number=42 ): self.parent = parent self.batch_size = batch_size self.prediction_length = prediction_length self.context_length = context_length - self.cardinality = cardinality + self.patch_length = patch_length + self.stride = stride + self.input_size = input_size self.num_time_features = num_time_features self.lags_sequence = lags_sequence - self.embedding_dimension = embedding_dimension self.is_training = is_training self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -81,56 +84,42 @@ def __init__( self.encoder_seq_length = min( sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length ) - self.decoder_seq_length = min( - sampling_factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length - ) + self.seed_number = seed_number self.sampling_factor = sampling_factor self.distil = distil + self.num_patch = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 def get_config(self): return PatchTSTConfig( prediction_length=self.prediction_length, + patch_length=self.patch_length, + stride=self.stride, + input_size=self.input_size, d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, dropout=self.hidden_dropout_prob, attention_dropout=self.attention_probs_dropout_prob, context_length=self.context_length, - lags_sequence=self.lags_sequence, - num_time_features=self.num_time_features, - num_static_categorical_features=1, - num_static_real_features=1, - cardinality=[self.cardinality], - embedding_dimension=[self.embedding_dimension], - sampling_factor=self.sampling_factor, - distil=self.distil, + activation_function=self.hidden_act, + seed_number=self.seed_number ) def prepare_patchtst_inputs_dict(self, 
config): - _past_length = config.context_length + max(config.lags_sequence) - - static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0]) - static_real_features = floats_tensor([self.batch_size, 1]) + _past_length = config.context_length + # bs, n_vars, num_patch, patch_len - past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features]) - past_values = floats_tensor([self.batch_size, _past_length]) - past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5 + # [bs x seq_len x n_vars] + past_values = floats_tensor([self.batch_size, _past_length, self.input_size]) + # past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5 - # decoder inputs - future_time_features = floats_tensor([self.batch_size, config.prediction_length, config.num_time_features]) - future_values = floats_tensor([self.batch_size, config.prediction_length]) + future_values = floats_tensor([self.batch_size, config.prediction_length, self.input_size]) inputs_dict = { "past_values": past_values, - "static_categorical_features": static_categorical_features, - "static_real_features": static_real_features, - "past_time_features": past_time_features, - "past_observed_mask": past_observed_mask, - "future_time_features": future_time_features, + # "past_observed_mask": past_observed_mask, + # "future_time_features": future_time_features, "future_values": future_values, } return inputs_dict @@ -144,44 +133,32 @@ def prepare_config_and_inputs_for_common(self): config, inputs_dict = self.prepare_config_and_inputs() return config, inputs_dict - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = PatchTSTModel(config=config).to(torch_device).eval() - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = PatchTSTEncoder.from_pretrained(tmpdirname).to(torch_device) - - transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) - enc_input = transformer_inputs[:, : config.context_length, ...] - dec_input = transformer_inputs[:, config.context_length :, ...] 
- - encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = PatchTSTDecoder.from_pretrained(tmpdirname).to(torch_device) - - last_hidden_state_2 = decoder( - inputs_embeds=dec_input, - encoder_hidden_states=encoder_last_hidden_state, - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + # def check_encoder_model_standalone(self, config, inputs_dict): + # model = PatchTSTModel(config=config).to(torch_device).eval() + # outputs = model(**inputs_dict) + # + # encoder_last_hidden_state = outputs.encoder_last_hidden_state + # + # with tempfile.TemporaryDirectory() as tmpdirname: + # encoder = model.get_encoder() + # encoder.save_pretrained(tmpdirname) + # encoder = ChannelAttentionPatchTSTEncoder.from_pretrained(tmpdirname).to(torch_device) + # + # transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) + # # [bs x seq_len x n_vars] => bs, num_patch, n_vars, patch_len = x.shape + # enc_input = transformer_inputs[:, : config.context_length, ...] + # + # encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] + # + # self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (PatchTSTModel, PatchTSTForPrediction) if is_torch_available() else () - all_generative_model_classes = (PatchTSTForPrediction,) if is_torch_available() else () - is_encoder_decoder = True + all_model_classes = (PatchTSTModel, PatchTSTForPrediction, PatchTSTForPretraining) if is_torch_available() else () + all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForPretraining) if is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} + is_encoder_decoder = False test_pruning = False test_head_masking = False test_missing_keys = False @@ -189,6 +166,13 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_inputs_embeds = False test_model_common_attributes = False + + test_resize_embeddings = True + test_resize_position_embeddings = False + test_mismatched_shapes = True + test_model_parallel = False + has_attentions = False + def setUp(self): self.model_tester = PatchTSTModelTester(self) self.config_tester = ConfigTester( @@ -211,10 +195,10 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - + # def test_encoder_model_standalone(self): + # config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + # self.model_tester.check_encoder_model_standalone(*config_and_inputs) +# def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -227,39 +211,22 @@ def check_hidden_states_output(inputs_dict, config, model_class): hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states 
expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers ) self.assertEqual(len(hidden_states), expected_num_layers) - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.context_length - if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: - seq_length = seq_length * self.model_tester.chunk_length - else: - seq_length = self.model_tester.seq_length - + num_patch = self.model_tester.num_patch self.assertListEqual( list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], + [num_patch, self.model_tester.hidden_size], ) - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "prediction_length", seq_len) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True + print('model_class: ', model_class) + check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config @@ -267,23 +234,30 @@ def check_hidden_states_output(inputs_dict, config, model_class): config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) - - # Ignore since we have no tokens embeddings +# +# # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass def test_model_outputs_equivalence(self): pass - +# def test_determinism(self): pass - # # Input is 'static_categorical_features' not 'input_ids' - def test_model_main_input_name(self): - model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1] - self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name) + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -296,217 +270,217 @@ def test_forward_signature(self): expected_arg_names = [ "past_values", - "past_time_features", - "past_observed_mask", - "static_categorical_features", - "static_real_features", + # "past_time_features", + # "past_observed_mask", + # "static_categorical_features", + # "static_real_features", "future_values", - "future_time_features", + # "future_time_features", ] expected_arg_names.extend( [ - "future_observed_mask", - "decoder_attention_mask", - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - "encoder_outputs", - "past_key_values", + # "future_observed_mask", + # 
"decoder_attention_mask", + # "head_mask", + # "decoder_head_mask", + # "cross_attn_head_mask", + # "encoder_outputs", + # "past_key_values", "output_hidden_states", - "output_attentions", - "use_cache", - "return_dict", - ] - if "future_observed_mask" in arg_names - else [ - "decoder_attention_mask", - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - "encoder_outputs", - "past_key_values", - "output_hidden_states", - "output_attentions", - "use_cache", - "return_dict", + # "output_attentions", + # "use_cache", + # "return_dict", ] + # if "future_observed_mask" in arg_names + # else [ + # "decoder_attention_mask", + # "head_mask", + # "decoder_head_mask", + # "cross_attn_head_mask", + # "encoder_outputs", + # "past_key_values", + # "output_hidden_states", + # "output_attentions", + # "use_cache", + # "return_dict", + # ] ) self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - context_length = getattr(self.model_tester, "context_length", seq_len) - prediction_length = getattr(self.model_tester, "prediction_length", seq_len) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, context_length], - ) - out_len = len(outputs) - - correct_outlen = 7 - - if "last_hidden_state" in outputs: - correct_outlen += 1 - - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - if "loss" in outputs: - correct_outlen += 1 - - if "params" in outputs: - correct_outlen += 1 - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - 
encoder_seq_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 2, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, context_length], - ) - - @is_flaky() - def test_retain_grad_hidden_states_attentions(self): - super().test_retain_grad_hidden_states_attentions() - - -def prepare_batch(filename="train-batch.pt"): - file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") - batch = torch.load(file, map_location=torch_device) - return batch - - -@require_torch -@slow -class PatchTSTModelIntegrationTests(unittest.TestCase): - def test_inference_no_head(self): - model = PatchTSTModel.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) - batch = prepare_batch() - - torch.manual_seed(0) - with torch.no_grad(): - output = model( - past_values=batch["past_values"], - past_time_features=batch["past_time_features"], - past_observed_mask=batch["past_observed_mask"], - static_categorical_features=batch["static_categorical_features"], - future_values=batch["future_values"], - future_time_features=batch["future_time_features"], - ).last_hidden_state - expected_shape = torch.Size((64, model.config.context_length, model.config.d_model)) - self.assertEqual(output.shape, expected_shape) - - expected_slice = torch.tensor( - [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]], - device=torch_device, - ) - self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_inference_head(self): - model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) - batch = prepare_batch("val-batch.pt") - - torch.manual_seed(0) - with torch.no_grad(): - output = model( - past_values=batch["past_values"], - past_time_features=batch["past_time_features"], - past_observed_mask=batch["past_observed_mask"], - static_categorical_features=batch["static_categorical_features"], - future_time_features=batch["future_time_features"], - ).encoder_last_hidden_state - - # encoder distils the context length to 1/8th of the original length - expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model)) - self.assertEqual(output.shape, expected_shape) - - expected_slice = torch.tensor( - [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], device=torch_device - ) - self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_seq_to_seq_generation(self): - model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) - batch = prepare_batch("val-batch.pt") - - torch.manual_seed(0) - with torch.no_grad(): - outputs = model.generate( - static_categorical_features=batch["static_categorical_features"], - past_time_features=batch["past_time_features"], - past_values=batch["past_values"], - future_time_features=batch["future_time_features"], - 
past_observed_mask=batch["past_observed_mask"], - ) - expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) - self.assertEqual(outputs.sequences.shape, expected_shape) - - expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) - mean_prediction = outputs.sequences.mean(dim=1) - self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) +# +# def test_attention_outputs(self): +# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() +# config.return_dict = True +# +# seq_len = getattr(self.model_tester, "seq_length", None) +# decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) +# encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) +# context_length = getattr(self.model_tester, "context_length", seq_len) +# prediction_length = getattr(self.model_tester, "prediction_length", seq_len) +# +# for model_class in self.all_model_classes: +# inputs_dict["output_attentions"] = True +# inputs_dict["output_hidden_states"] = False +# config.return_dict = True +# model = model_class(config) +# model.to(torch_device) +# model.eval() +# with torch.no_grad(): +# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) +# attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions +# self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) +# +# # check that output_attentions also work using config +# del inputs_dict["output_attentions"] +# config.output_attentions = True +# model = model_class(config) +# model.to(torch_device) +# model.eval() +# with torch.no_grad(): +# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) +# attentions = outputs.encoder_attentions +# self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) +# +# self.assertListEqual( +# list(attentions[0].shape[-3:]), +# [self.model_tester.num_attention_heads, encoder_seq_length, context_length], +# ) +# out_len = len(outputs) +# +# correct_outlen = 7 +# +# if "last_hidden_state" in outputs: +# correct_outlen += 1 +# +# if "past_key_values" in outputs: +# correct_outlen += 1 # past_key_values have been returned +# +# if "loss" in outputs: +# correct_outlen += 1 +# +# if "params" in outputs: +# correct_outlen += 1 +# +# self.assertEqual(out_len, correct_outlen) +# +# # decoder attentions +# decoder_attentions = outputs.decoder_attentions +# self.assertIsInstance(decoder_attentions, (list, tuple)) +# self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) +# self.assertListEqual( +# list(decoder_attentions[0].shape[-3:]), +# [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length], +# ) +# +# # cross attentions +# cross_attentions = outputs.cross_attentions +# self.assertIsInstance(cross_attentions, (list, tuple)) +# self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) +# self.assertListEqual( +# list(cross_attentions[0].shape[-3:]), +# [ +# self.model_tester.num_attention_heads, +# decoder_seq_length, +# encoder_seq_length, +# ], +# ) +# +# # Check attention is always last and order is fine +# inputs_dict["output_attentions"] = True +# inputs_dict["output_hidden_states"] = True +# model = model_class(config) +# model.to(torch_device) +# model.eval() +# with torch.no_grad(): +# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) +# +# self.assertEqual(out_len + 2, len(outputs)) +# +# 
self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions +# +# self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) +# self.assertListEqual( +# list(self_attentions[0].shape[-3:]), +# [self.model_tester.num_attention_heads, encoder_seq_length, context_length], +# ) +# +# @is_flaky() +# def test_retain_grad_hidden_states_attentions(self): +# super().test_retain_grad_hidden_states_attentions() +# +# +# def prepare_batch(filename="train-batch.pt"): +# file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") +# batch = torch.load(file, map_location=torch_device) +# return batch +# +# +# @require_torch +# @slow +# class PatchTSTModelIntegrationTests(unittest.TestCase): +# def test_inference_no_head(self): +# model = PatchTSTModel.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) +# batch = prepare_batch() +# +# torch.manual_seed(0) +# with torch.no_grad(): +# output = model( +# past_values=batch["past_values"], +# past_time_features=batch["past_time_features"], +# past_observed_mask=batch["past_observed_mask"], +# static_categorical_features=batch["static_categorical_features"], +# future_values=batch["future_values"], +# future_time_features=batch["future_time_features"], +# ).last_hidden_state +# expected_shape = torch.Size((64, model.config.context_length, model.config.d_model)) +# self.assertEqual(output.shape, expected_shape) +# +# expected_slice = torch.tensor( +# [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]], +# device=torch_device, +# ) +# self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) +# +# def test_inference_head(self): +# model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) +# batch = prepare_batch("val-batch.pt") +# +# torch.manual_seed(0) +# with torch.no_grad(): +# output = model( +# past_values=batch["past_values"], +# past_time_features=batch["past_time_features"], +# past_observed_mask=batch["past_observed_mask"], +# static_categorical_features=batch["static_categorical_features"], +# future_time_features=batch["future_time_features"], +# ).encoder_last_hidden_state +# +# # encoder distils the context length to 1/8th of the original length +# expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model)) +# self.assertEqual(output.shape, expected_shape) +# +# expected_slice = torch.tensor( +# [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], device=torch_device +# ) +# self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) +# +# def test_seq_to_seq_generation(self): +# model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) +# batch = prepare_batch("val-batch.pt") +# +# torch.manual_seed(0) +# with torch.no_grad(): +# outputs = model.generate( +# static_categorical_features=batch["static_categorical_features"], +# past_time_features=batch["past_time_features"], +# past_values=batch["past_values"], +# future_time_features=batch["future_time_features"], +# past_observed_mask=batch["past_observed_mask"], +# ) +# expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) +# self.assertEqual(outputs.sequences.shape, expected_shape) +# +# expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) +# mean_prediction = 
outputs.sequences.mean(dim=1) +# self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) From 8b1310ecec2ae78445c3a6de3967476ec3c2c15d Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Fri, 25 Aug 2023 12:48:40 -0400 Subject: [PATCH 010/189] Update ForecastHead to use the config class --- .../models/patchtst/modeling_patchtst.py | 118 ++---------------- .../models/patchtst/test_modeling_patchtst.py | 6 +- 2 files changed, 13 insertions(+), 111 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 1fbd5f204fa7fa..a4fcd4a85af215 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1079,23 +1079,14 @@ class PatchTSTForForecastingOutput(ModelOutput): class ForecastHead(nn.Module): - def __init__(self, - individual: bool, - n_vars: int, - d_model: int, - num_patch: int, - forecast_len: int, - head_dropout: float = 0., - use_cls_token: bool = False, - pooling: str = None, - ): + def __init__(self, config: PatchTSTConfig): super().__init__() - self.individual = individual - self.n_vars = n_vars - self.use_cls_token = use_cls_token - self.pooling = pooling - head_dim = d_model if pooling else d_model * num_patch + self.individual = config.individual + self.n_vars = config.input_size + self.use_cls_token = config.use_cls_token + self.pooling = config.pooling + head_dim = config.d_model if self.pooling else config.d_model * config.num_patch if self.individual: self.linears = nn.ModuleList() @@ -1103,15 +1094,15 @@ def __init__(self, self.flattens = nn.ModuleList() for i in range(self.n_vars): self.flattens.append(nn.Flatten(start_dim=2)) - self.linears.append(nn.Linear(head_dim, forecast_len)) + self.linears.append(nn.Linear(head_dim, config.prediction_length)) self.dropouts.append(nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() ) else: self.flatten = nn.Flatten(start_dim=2) - self.linear = nn.Linear(head_dim, forecast_len) - self.dropout = nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() + self.linear = nn.Linear(head_dim, config.prediction_length) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - def forward(self, x): + def forward(self, x: torch.Tensor): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token @@ -1150,9 +1141,6 @@ class PatchTSTForForecasting(PatchTSTPreTrainedModel): # PatchTST model + classification head def __init__(self, config: PatchTSTConfig): super().__init__(config) - - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) - self.model = PatchTSTModel(config) self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction='mean') @@ -1176,89 +1164,3 @@ def forward(self, hidden_states=model_output.hidden_states ) - -if __name__ == "__main__": - - from transformers import Trainer, TrainingArguments - from torch.utils.data import Dataset - from transformers import AutoModel, AutoConfig - import numpy as np - - class AssetDataset(Dataset): - def __init__(self, x, y, seq_len=10, pred_len=10, is_pred=False): - self.seq_len = seq_len - self.x = x - self.y = y - self.is_pred = is_pred - self.pred_len = pred_len - - def __getitem__(self, index): - s_begin = index - s_end = s_begin + self.seq_len - r_begin = s_end - 1 - r_end = s_end + self.pred_len - - seq_x = self.x[s_begin:s_end] - seq_y = np.array(self.y[r_begin]) - if 
self.is_pred: - seq_y = self.x[s_end:r_end] - - return {'past_values': seq_x, 'future_values': seq_y} - - def __len__(self): - if self.is_pred: - return len(self.x) - self.seq_len - self.pred_len + 1 - return len(self.x) - self.seq_len + 1 - - n_classes = 3 - bs = 200 - n_features = 20 - pred_len = 7 - x = torch.randn(bs, n_features) - y = torch.randint(low=0, high=n_classes, size=(bs, 1))[:, 0] - valid_asset_ds = train_asset_ds = AssetDataset(x, y, seq_len=10, pred_len=pred_len, is_pred=False) - config = PatchTSTConfig( - input_size=n_features, - num_classes=n_classes, - context_length=10, - patch_length=5, - stride=5, - batch_size=50, - standardscale=None, # 'bysample' - context_points=10, - encoder_layers=12, - encoder_attention_heads=8, - d_model=256, - encoder_ffn_dim=1024, - dropout=0.2, - fc_dropout=0, - r=0.4, - prediction_length=pred_len, - ) - # model = PatchTSTForPretraining(config) - # model = PatchTSTForPrediction(config) - model = PatchTSTForClassification(config) - training_args = TrainingArguments( - output_dir='./save_model/', - num_train_epochs=1, - per_device_train_batch_size=5, - per_device_eval_batch_size=5, - report_to=[], - save_strategy='no', - remove_unused_columns=False, - no_cuda=True - ) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_asset_ds, - eval_dataset=valid_asset_ds - ) - trainer.train() - trainer.save_model('./save_model') - # AutoConfig.register("patchtst", PatchTSTConfig) - AutoModel.register(PatchTSTConfig, PatchTSTForClassification) - config = AutoConfig.from_pretrained('./save_model') - model = AutoModel.from_pretrained('./save_model', config=config) - print(model) - diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 34caf2d0253442..efca51b1b4f4db 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -35,7 +35,7 @@ import torch from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig - from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTModel, ChannelAttentionPatchTSTEncoder + from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTModel, ChannelAttentionPatchTSTEncoder @require_torch @@ -155,8 +155,8 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (PatchTSTModel, PatchTSTForPrediction, PatchTSTForPretraining) if is_torch_available() else () - all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForPretraining) if is_torch_available() else () + all_model_classes = (PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () + all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} is_encoder_decoder = False test_pruning = False From 7c09b86b058c356becf5a873736541083eb9096d Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 26 Aug 2023 23:14:33 +0700 Subject: [PATCH 011/189] edit cv_random_masking, add mask to model output --- .../models/patchtst/modeling_patchtst.py | 408 +++++++++--------- 1 file changed, 199 insertions(+), 209 deletions(-) diff --git 
a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a4fcd4a85af215..695c6da96ffec0 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -156,6 +156,176 @@ def coord1d_pos_encoding(q_len, exponential=False, normalize=True): return cpe +def set_seed(x=42): + random.seed(x) + np.random.seed(x) + torch.manual_seed(x) + if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + + +def random_masking( + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = True, + mask_value=0, +): + """random_masking: Mask the input considering the control variables. + + Args: + xb (Tensor): Input to mask [ bs x nvars x num_patch x patch_len] + mask_ratio (float): Mask ratio. + unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + mask_value (int, optional): Value to use for masking. Defaults to 0. + + Returns: + Tensor: xb_mask, masked input, same shape as input + Tensor: Mask tensor of shape [bs x c x n] + """ + bs, nvars, L, D = xb.shape + + len_keep = int(L * (1 - mask_ratio)) + + if channel_consistent_masking: + noise = torch.rand(bs, 1, L, device=xb.device) # noise in [0, 1], bs x 1 x L + noise = noise.repeat(1, nvars, 1) # bs x nvars x L + else: + noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L + + mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] + mask[:, :, :len_keep] = 0 + + # sort noise for each sample + ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + mask = torch.gather(mask, dim=-1, index=ids_restore) + + mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + if unmasked_channel_indices is not None: + mask[:, unmasked_channel_indices, :, :] = 0 + + xb_mask = xb.masked_fill(mask.bool(), mask_value) + return xb_mask, mask[..., 0] + + +class Patch(nn.Module): + """ + A class to patchify the time series sequence into different patches + """ + + def __init__( + self, + seq_len: int, + patch_len: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + ): + super().__init__() + + assert ( + seq_len > patch_len + ), f"Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})" + + self.seq_len = seq_len + self.patch_len = patch_len + self.stride = stride + + # get the number of patches + self.num_patch = (max(seq_len, patch_len) - patch_len) // stride + 1 + tgt_len = patch_len + stride * (self.num_patch - 1) + self.s_begin = seq_len - tgt_len + + def forward(self, x: torch.Tensor): + """ + + Args: + x (torch.Tensor, required): Input of shape [bs x seq_len x n_vars] + Returns: + z: output tensor data [bs x ... x n_vars x num_patch x patch_len] + """ + seq_len = x.shape[-2] + assert seq_len == self.seq_len, f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." + + # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] + z = x.transpose(0, -2)[self.s_begin :] # z: [tgt_len x ... 
x bs x n_vars] + z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution + z = z.unfold( + dimension=-2, size=self.patch_len, step=self.stride + ) # xb: [bs x ... x num_patch x n_vars x patch_len] + z = z.transpose(-2, -3).contiguous() # xb: [bs x ... x n_vars x num_patch x patch_len] + return z + + +class PatchMasking(nn.Module): + def __init__( + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None + ): + """PatchMasking: Class to random or forcast masking. + + Args: + mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. + mask_ratio (float, optional): Mask ratio. + mask_patches (list, optional): List of patch lengths to mask in the end of the data. + mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. + if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. + unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + mask_value (int, optional): Value to use for masking. Defaults to 0. + """ + if seed_number: + set_seed(seed_number) + self.mask_ratio = mask_ratio + self.channel_consistent_masking = channel_consistent_masking + self.mask_type = mask_type + self.mask_patches = mask_patches + self.mask_patch_ratios = mask_patch_ratios + self.unmasked_channel_indices = unmasked_channel_indices + self.mask_value = mask_value + if self.unmasked_channel_indices is not None: + self.unmasked_channel_indices.sort() + + super().__init__() + + def forward(self, x: torch.Tensor): + """ + Input: + x: patched input + 4D: [bs x n_vars x num_patch x patch_len] + + Output: + x_mask: Masked patched input + 4D: [bs x n_vars x num_patch x patch_len] + mask: bool tensor indicating True on masked points + 4D: [bs x n_vars x num_patch] + """ + + if self.mask_type == "random": + x_mask, mask = random_masking( + xb=x, + mask_ratio=self.mask_ratio, + unmasked_channel_indices=self.unmasked_channel_indices, + channel_consistent_masking=self.channel_consistent_masking, + mask_value=self.mask_value, + ) + + else: + raise Exception("Invalid mask type") + + mask = mask.bool() # mask: [bs x n_vars x num_patch] + + return x_mask, mask + + + class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -532,6 +702,29 @@ def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", PATCHTST_START_DOCSTRING, ) + + +class PatchTSTModelOutputWithNoAttention(ModelOutput): + """ + Base class for model's outputs, with potential hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. 
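# --------------------------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): the `patched_input` and `mask`
# fields carried by this output are what a masked-reconstruction objective consumes.
# All tensors and sizes below are toy stand-ins, not values produced by the model.
import torch

bs, nvars, num_patches, patch_length = 2, 3, 4, 8
patched_input = torch.randn(bs, nvars, num_patches, patch_length)    # stand-in for `patched_input`
reconstruction = torch.randn(bs, nvars, num_patches, patch_length)   # stand-in for a pretraining head output
mask = torch.rand(bs, nvars, num_patches) > 0.5                      # True where a patch was masked

per_patch_mse = ((reconstruction - patched_input) ** 2).mean(dim=-1)  # [bs x nvars x num_patches]
masked_loss = (per_patch_mse * mask).sum() / (mask.sum() + 1e-10)     # averaged over masked patches only
# --------------------------------------------------------------------------------------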
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + patched_input + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + patched_input: torch.FloatTensor = None + mask: torch.FloatTensor = None + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): @@ -544,8 +737,7 @@ def __init__(self, config: PatchTSTConfig): mask_patches=config.mask_patches, mask_patch_ratios=config.mask_patch_ratios, channel_consistent_masking=config.channel_consistent_masking, - d_size=config.d_size, - cv_channel_indices=config.cv_channel_indices, + unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, seed_number=config.seed_number ) @@ -561,31 +753,13 @@ def forward(self, output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patch x patch_len] for pretrain - masked_values = self.masking(patched_values) + masked_values, mask = self.masking(patched_values) encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, hidden_states=encoder_output.hidden_states, - patched_input=patched_values) - - -class PatchTSTModelOutputWithNoAttention(ModelOutput): - """ - Base class for model's outputs, with potential hidden states. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - patched_input - """ - - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - patched_input: torch.FloatTensor = None + patched_input=patched_values, + mask=mask + ) class PretrainHead(nn.Module): @@ -607,184 +781,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -def cv_random_masking( - xb: torch.Tensor, - mask_ratio: float, - cv_channel_indices: list = None, - channel_consistent_masking: bool = True, - d_size="4D", - mask_value=0, -): - """cv_random_masking: Mask the input considering the control variables. - - Args: - xb (Tensor): Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] - mask_ratio (float): Mask ratio. 
- cv_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. - d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". - mask_value (int, optional): Value to use for masking. Defaults to 0. - - Returns: - Tensor: xb_mask, masked input, same shape as input - Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x tsg2 x c x n] - """ - if d_size == "4D": - bs, nvars, L, D = xb.shape - - len_keep = int(L * (1 - mask_ratio)) - - if d_size == "4D": - if channel_consistent_masking: - noise = torch.rand(bs, 1, L, device=xb.device) # noise in [0, 1], bs x 1 x L - noise = noise.repeat(1, nvars, 1) # bs x nvars x L - else: - noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L - - mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] - mask[:, :, :len_keep] = 0 - - # sort noise for each sample - ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove - ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] - mask = torch.gather(mask, dim=-1, index=ids_restore) - - if d_size == "4D": - mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] - if cv_channel_indices is not None: - mask[:, cv_channel_indices, :, :] = 0 - - xb_mask = xb.masked_fill(mask.bool(), mask_value) - return xb_mask, mask[..., 0] - - -def set_seed(x=42): - random.seed(x) - np.random.seed(x) - torch.manual_seed(x) - if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) - - -class PatchMasking(nn.Module): - def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = True, - d_size: str = "4D", - cv_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None - ): - """PatchMasking: Class to random or forcast masking. - - Args: - mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. - mask_ratio (float, optional): Mask ratio. - mask_patches (list, optional): List of patch lengths to mask in the end of the data. - mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. - if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - cv_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. - d_size (str, optional): Input data size. Allowed values: 4D, 6D. Defaults to "4D". - mask_value (int, optional): Value to use for masking. Defaults to 0. 
- """ - if seed_number: - set_seed(seed_number) - self.mask_ratio = mask_ratio - self.channel_consistent_masking = channel_consistent_masking - self.d_size = d_size - self.mask_type = mask_type - self.mask_patches = mask_patches - self.mask_patch_ratios = mask_patch_ratios - self.cv_channel_indices = cv_channel_indices - self.mask_value = mask_value - if self.cv_channel_indices is not None: - self.cv_channel_indices.sort() - - super().__init__() - - def forward(self, x: torch.Tensor): - """ - Input: - x: patched input - 4D: [bs x n_vars x num_patch x patch_len] - - Output: - x_mask: Masked patched input - 4D: [bs x n_vars x num_patch x patch_len] - mask: bool tensor indicating True on masked points - 4D: [bs x n_vars x num_patch] - """ - - if self.mask_type == "random": - x_mask, mask = cv_random_masking( - xb=x, - mask_ratio=self.mask_ratio, - cv_channel_indices=self.cv_channel_indices, - channel_consistent_masking=self.channel_consistent_masking, - d_size=self.d_size, - mask_value=self.mask_value, - ) - - else: - raise Exception("Invalid mask type") - - mask = mask.bool() # mask: [bs x n_vars x num_patch] - - return x_mask #, mask - - -class Patch(nn.Module): - """ - A class to patchify the time series sequence into different patches - """ - - def __init__( - self, - seq_len: int, - patch_len: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence - ): - super().__init__() - - assert ( - seq_len > patch_len - ), f"Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})" - - self.seq_len = seq_len - self.patch_len = patch_len - self.stride = stride - - # get the number of patches - self.num_patch = (max(seq_len, patch_len) - patch_len) // stride + 1 - tgt_len = patch_len + stride * (self.num_patch - 1) - self.s_begin = seq_len - tgt_len - - def forward(self, x: torch.Tensor): - """ - - Args: - x (torch.Tensor, required): Input of shape [bs x seq_len x n_vars] - Returns: - z: output tensor data [bs x ... x n_vars x num_patch x patch_len] - """ - seq_len = x.shape[-2] - assert seq_len == self.seq_len, f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." - - # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] - z = x.transpose(0, -2)[self.s_begin :] # z: [tgt_len x ... x bs x n_vars] - z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution - z = z.unfold( - dimension=-2, size=self.patch_len, step=self.stride - ) # xb: [bs x ... x num_patch x n_vars x patch_len] - z = z.transpose(-2, -3).contiguous() # xb: [bs x ... x n_vars x num_patch x patch_len] - return z - - class PatchTSTForPreTrainingOutput(ModelOutput): """ Output type of [`BertForPreTraining`]. @@ -838,7 +834,7 @@ def forward( future_values (y): labels """ model_output = self.model(past_values) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] + x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] or [bs x nvars x (num_patch+1) x patch_len] if use cls_token loss_val = self.loss(x_hat, model_output.patched_input) return PatchTSTForPreTrainingOutput( @@ -909,9 +905,6 @@ class PatchTSTForClassificationOutput(ModelOutput): (classification) loss. 
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -927,7 +920,6 @@ class PatchTSTForClassificationOutput(ModelOutput): loss: Optional[torch.FloatTensor] = None prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -996,10 +988,8 @@ class PatchTSTForPredictionOutput(ModelOutput): Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. From 617db9ae49948be7baac3f699bc33ce4835736f1 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 26 Aug 2023 23:15:08 +0700 Subject: [PATCH 012/189] Update configuration_patchtst.py --- src/transformers/models/patchtst/configuration_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index dae40eee1ee12c..c1547601335353 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -158,7 +158,7 @@ def __init__( mask_patch_ratios: list = [1, 1], channel_consistent_masking: bool = True, d_size: str = "4D", - cv_channel_indices: list = None, + unmasked_channel_indices: list = None, mask_value=0, pooling: str = 'mean', num_classes: int = 1, @@ -229,7 +229,7 @@ def __init__( self.mask_patch_ratios = mask_patch_ratios self.channel_consistent_masking = channel_consistent_masking self.d_size = d_size - self.cv_channel_indices = cv_channel_indices + self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value # Classification From 484dc009323f2227edc7bef58030873520b138af Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 26 Aug 2023 23:57:19 +0700 Subject: [PATCH 013/189] add masked_loss to the pretraining --- .../models/patchtst/modeling_patchtst.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 695c6da96ffec0..6c4dcffdfaedaf 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -134,7 +134,7 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= 
2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - 1 ) - # pv(f'{i:4.0f} {x:5.3f} {cpe.mean():+6.3f}', verbose) + if abs(cpe.mean()) <= eps: break elif cpe.mean() > eps: @@ -789,11 +789,8 @@ class PatchTSTForPreTrainingOutput(ModelOutput): loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, nvars, num_patch, patch_len )`): + Prediction outputs of the modeling head. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -808,8 +805,7 @@ class PatchTSTForPreTrainingOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None + prediction_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -822,7 +818,7 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config) self.head = PretrainHead(config) - self.loss = torch.nn.MSELoss(reduction="mean") + self.loss = torch.nn.MSELoss(reduction=None) def forward( self, past_values: torch.Tensor, @@ -836,10 +832,13 @@ def forward( model_output = self.model(past_values) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] or [bs x nvars x (num_patch+1) x patch_len] if use cls_token + # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) + masked_loss = (loss_val * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) + return PatchTSTForPreTrainingOutput( - loss=loss_val, - prediction_logits=x_hat, + loss=masked_loss, + prediction_outputs=x_hat, hidden_states=model_output.hidden_states ) From b1ef4af8b63a7a1a84fcda6c1b354a57a7ac54df Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 27 Aug 2023 23:48:31 +0700 Subject: [PATCH 014/189] add PatchEmbeddings --- .../models/patchtst/modeling_patchtst.py | 258 ++++++++++++------ 1 file changed, 167 insertions(+), 91 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6c4dcffdfaedaf..df7bc3281a37ab 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -26,7 +26,8 @@ from transformers.modeling_outputs import BaseModelOutputWithNoAttention from transformers.utils import ModelOutput from torch.nn.modules.activation import MultiheadAttention -from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +# from 
transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +from patchtst.configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) @@ -173,7 +174,7 @@ def random_masking( """random_masking: Mask the input considering the control variables. Args: - xb (Tensor): Input to mask [ bs x nvars x num_patch x patch_len] + xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. @@ -201,7 +202,7 @@ def random_masking( ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] mask = torch.gather(mask, dim=-1, index=ids_restore) - mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -209,55 +210,139 @@ def random_masking( return xb_mask, mask[..., 0] -class Patch(nn.Module): +def compute_num_patches(sequence_length, patch_length, stride): + return (max(sequence_length, patch_length) - patch_length) // stride + 1 + + +class Patchify(nn.Module): """ A class to patchify the time series sequence into different patches + Args: + sequence_length (int, required): input sequence length + patch_length (int, required): patch length + stride (int, required): stride between patches + Returns: + z: output tensor data [bs x n_vars x num_patches x patch_length] """ def __init__( self, - seq_len: int, - patch_len: int, + sequence_length: int, + patch_length: int, stride: int, padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - seq_len > patch_len - ), f"Sequence length ({seq_len}) has to be greater than the patch length ({patch_len})" + sequence_length > patch_length + ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" - self.seq_len = seq_len - self.patch_len = patch_len + self.sequence_length = sequence_length + self.patch_length = patch_length self.stride = stride # get the number of patches - self.num_patch = (max(seq_len, patch_len) - patch_len) // stride + 1 - tgt_len = patch_len + stride * (self.num_patch - 1) - self.s_begin = seq_len - tgt_len + self.num_patches = compute_num_patches(sequence_length, patch_length, stride) + new_sequence_length = patch_length + stride * (self.num_patches - 1) + self.s_begin = sequence_length - new_sequence_length - def forward(self, x: torch.Tensor): + def forward(self, past_values: torch.Tensor): """ + Args: + past_values (torch.Tensor, required): Input of shape [bs x sequence_length x n_vars] + Returns: + x: output tensor data [bs x n_vars x num_patches x patch_length] + """ + sequence_length = past_values.shape[-2] + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model ({self.sequence_length})." 
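# --------------------------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): the patching arithmetic used by
# `compute_num_patches` and `Patchify.forward`, applied to a toy tensor. Sizes are
# assumptions chosen only to make the shapes concrete.
import torch

bs, sequence_length, n_vars = 2, 32, 3
patch_length, stride = 8, 8
num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1   # -> 4

past_values = torch.randn(bs, sequence_length, n_vars)
s_begin = sequence_length - (patch_length + stride * (num_patches - 1))           # leading steps to drop
patches = past_values[:, s_begin:, :].unfold(dimension=-2, size=patch_length, step=stride)
patches = patches.transpose(-2, -3).contiguous()
print(patches.shape)   # torch.Size([2, 3, 4, 8]) == [bs, n_vars, num_patches, patch_length]
# --------------------------------------------------------------------------------------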
+ + x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = x.unfold( + dimension=-2, size=self.patch_length, step=self.stride + ) # x: [bs x num_patches x n_vars x patch_length] + x = x.transpose(-2, -3).contiguous() # xb: [bs x n_vars x num_patches x patch_length] + return x + + +class PatchEmbeddings(nn.Module): + """ + A class to patchify the time series sequence into different patches + Args: + sequence_length (int, required): input sequence length + patch_length (int, required): patch length + stride (int, required): stride between patches + Returns: + embeddings: output tensor data [bs x n_vars x num_patches x embed_dim] + """ + def __init__( + self, + sequence_length: int, + patch_length: int, + stride: int, + embed_dim: int + ): + super().__init__() + + assert ( + sequence_length > patch_length + ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" + + # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" + self.sequence_length = sequence_length + self.patch_length = patch_length + self.stride = stride + self.embed_dim = embed_dim + + # get the number of patches + self.num_patches = compute_num_patches(sequence_length, patch_length, stride) + new_sequence_length = patch_length + stride * (self.num_patches - 1) + self.s_begin = sequence_length - new_sequence_length + + # Embedding + self.projection = nn.Conv1d(in_channels=1, + out_channels=embed_dim, + kernel_size=patch_length, + stride=stride, + ) + + def forward(self, past_values: torch.Tensor): + """ Args: - x (torch.Tensor, required): Input of shape [bs x seq_len x n_vars] + past_values (torch.Tensor, required): Input of shape [bs x sequence_length x n_vars] Returns: - z: output tensor data [bs x ... x n_vars x num_patch x patch_len] + embeddings: output tensor data [bs x n_vars x num_patches x emb_dim] """ - seq_len = x.shape[-2] - assert seq_len == self.seq_len, f"Input sequence length ({seq_len}) doesn't match model ({self.seq_len})." + bs, sequence_length, n_vars = past_values.shape + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." + + x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + # convert past_values to shape [bs*n_vars x 1 x sequence_length ] + x = x.transpose(1, 2).reshape(bs*n_vars, 1, -1).contiguous() + # projection + embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] + # reshape + embeddings = embeddings.transpose(1, 2).view(bs, n_vars, -1, self.embed_dim).contiguous() # embeddings: [bs x n_vars x num_patches x emb_dim] + # embeddings = embeddings.flatten(2).transpose(1, 2) + return embeddings - # x = x[:, :, self.s_begin:, :] # xb: [bs x ... x tgt_len x nvars] - z = x.transpose(0, -2)[self.s_begin :] # z: [tgt_len x ... x bs x n_vars] - z = z.transpose(0, -2).contiguous() # z: [bs x ... x tgt_len x n_vars] # TODO: need a better solution - z = z.unfold( - dimension=-2, size=self.patch_len, step=self.stride - ) # xb: [bs x ... x num_patch x n_vars x patch_len] - z = z.transpose(-2, -3).contiguous() # xb: [bs x ... x n_vars x num_patch x patch_len] - return z class PatchMasking(nn.Module): + """ + PatchMasking: Class to random or forcast masking. + + Args: + mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. 
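# --------------------------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): how the per-(sample, channel)
# keep/mask pattern behind this masking module is built, mirroring the `random_masking`
# logic earlier in this file. Sizes are toy assumptions.
import torch

bs, nvars, num_patches = 2, 3, 8
mask_ratio = 0.5
len_keep = int(num_patches * (1 - mask_ratio))          # 4 patches kept per row

noise = torch.rand(bs, nvars, num_patches)              # channel_consistent_masking=False branch
mask = torch.ones(bs, nvars, num_patches)
mask[:, :, :len_keep] = 0                               # 0 = keep, 1 = mask
ids_shuffle = torch.argsort(noise, dim=-1)
ids_restore = torch.argsort(ids_shuffle, dim=-1)
mask = torch.gather(mask, dim=-1, index=ids_restore)    # randomly permute the keep/mask pattern
print(mask.sum(dim=-1))                                 # every row masks num_patches - len_keep = 4 patches
# --------------------------------------------------------------------------------------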
+ mask_ratio (float, optional): Mask ratio. + mask_patches (list, optional): List of patch lengths to mask in the end of the data. + mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. + if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. + unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + mask_value (int, optional): Value to use for masking. Defaults to 0. + """ def __init__( self, mask_type: str = "random", @@ -269,18 +354,7 @@ def __init__( mask_value=0, seed_number: Optional[int] = None ): - """PatchMasking: Class to random or forcast masking. - Args: - mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. - mask_ratio (float, optional): Mask ratio. - mask_patches (list, optional): List of patch lengths to mask in the end of the data. - mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. - if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. - mask_value (int, optional): Value to use for masking. Defaults to 0. - """ if seed_number: set_seed(seed_number) self.mask_ratio = mask_ratio @@ -299,11 +373,11 @@ def forward(self, x: torch.Tensor): """ Input: x: patched input - 4D: [bs x n_vars x num_patch x patch_len] + 4D: [bs x n_vars x num_patches x patch_length] Output: x_mask: Masked patched input - 4D: [bs x n_vars x num_patch x patch_len] + 4D: [bs x n_vars x num_patches x patch_length] mask: bool tensor indicating True on masked points 4D: [bs x n_vars x num_patch] """ @@ -339,9 +413,9 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x seq_len x d_model] + src: tensor [bs x nvars x sequence_length x d_model] Return: - Tensor [bs x nvars x seq_len x d_model] + Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] for mod in self.layers: @@ -394,42 +468,42 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x seq_len x d_model] + src: tensor [bs x nvars x sequence_length x d_model] Return: - Tensor [bs x nvars x seq_len x d_model] + Tensor [bs x nvars x sequence_length x d_model] """ - bs, n_vars, seq_len, d_model = src.shape + bs, n_vars, sequence_length, d_model = src.shape # First sublayer: attention across time - src = src.view(bs*n_vars, seq_len, d_model) # src: [(bs*nvars) x seq_len x d_model] + src = src.view(bs*n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1(self.self_attn(self.norm_sublayer1(src)) ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard 
Transformer from BERT - src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src) ) ) # src: [(bs*nvars) x seq_len x d_model] - src = src.reshape(bs, n_vars, seq_len, d_model) # [bs x nvars x seq_len x d_model] + src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src) ) ) # src: [(bs*nvars) x sequence_length x d_model] + src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time - # [bs x nvars x seq_len x d_model] -> [bs x seq_len x nvars x d_model] -> [(bs*seq_len) x nvars x d_model] - src = src.transpose(2, 1).contiguous().view(bs*seq_len, n_vars, d_model) # [(bs*seq_len) x nvars x d_model] + # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] + src = src.transpose(2, 1).contiguous().view(bs*sequence_length, n_vars, d_model) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2(self.self_attn(self.norm_sublayer2(src)) ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src) ) ) # src: [(bs*seq_len) x nvars x d_model] - src = src.reshape(bs, seq_len, n_vars, d_model).transpose(1,2).contiguous() # src: [bs x nvars x seq_len x d_model] + src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src) ) ) # src: [(bs*sequence_length) x nvars x d_model] + src = src.reshape(bs, sequence_length, n_vars, d_model).transpose(1,2).contiguous() # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view(bs*n_vars, seq_len, d_model) # src: [(bs*nvars) x seq_len x d_model] + src = src.view(bs*n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3(self.ff( self.norm_sublayer3(src) )) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout - src = src.reshape(bs, n_vars, seq_len, d_model) # [bs x nvars x seq_len x d_model] + src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -454,7 +528,7 @@ class ChannelAttentionPatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) self.n_vars = config.input_size - self.num_patch = config.num_patch + self.num_patches = config.num_patches self.patch_length = config.patch_length self.d_model = config.d_model self.shared_embedding = config.shared_embedding @@ -472,9 +546,9 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patch + 1, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patch, 
config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, config.d_model) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -487,13 +561,13 @@ def __init__(self, config: PatchTSTConfig): def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> torch.Tensor: """ - x: tensor [bs x nvars x num_patch x patch_len] + x: tensor [bs x nvars x num_patches x patch_length] return: - tensor [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + tensor [bs x nvars x num_patches x d_model] + or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ - # bs, num_patch, n_vars, patch_len = x.shape - bs, n_vars, num_patch, patch_len = past_values.shape + # bs, num_patches, n_vars, patch_length = x.shape + bs, n_vars, num_patches, patch_length = past_values.shape output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -506,24 +580,26 @@ def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool x_out.append(z) past_values = torch.stack(x_out, dim=1) else: - past_values = self.w_p(past_values) # x: [bs x nvars x num_patch x d_model] + past_values = self.w_p(past_values) # x: [bs x nvars x num_patches x d_model] if self.use_cls_token: - past_values = self.dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patch x d_model] + past_values = self.dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patches x d_model] # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patch+1) x d_model] + past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patches+1) x d_model] else: - past_values = self.dropout(past_values + self.w_pos) # x: [bs x nvars x num_patch x d_model] + past_values = self.dropout(past_values + self.w_pos) # x: [bs x nvars x num_patches x d_model] # Encoder past_values, hidden_states = self.encoder( - past_values, output_hidden_states) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - # return past_values + past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] + # or [bs x nvars x (num_patches+1) x d_model] if use cls_token + # return past_values, hidden_states return BaseModelOutputWithNoAttention( - last_hidden_state=past_values, hidden_states=hidden_states + last_hidden_state=past_values, + hidden_states=hidden_states ) @@ -729,7 +805,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.patching = Patch(config.context_length, patch_len=config.patch_length, stride=config.stride) + self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) if config.mask_input: self.masking = PatchMasking( mask_type=config.mask_type, @@ -752,7 +828,7 @@ def forward(self, output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patch x 
patch_len] for pretrain + patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain masked_values, mask = self.masking(patched_values) encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, @@ -771,11 +847,11 @@ def __init__(self, config): def forward(self, x: torch.Tensor) -> torch.Tensor: """ - x: tensor [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token - output: tensor [bs x nvars x num_patch x patch_len] + x: tensor [bs x nvars x num_patches x d_model] + or [bs x nvars x (num_patches+1) x d_model] if use cls_token + output: tensor [bs x nvars x num_patches x patch_length] """ - x = self.linear(self.dropout(x)) # [bs x nvars x num_patch x patch_len] + x = self.linear(self.dropout(x)) # [bs x nvars x num_patches x patch_length] if self.use_cls_token: x = x[:, :, 1:, :] # remove the first cls token return x @@ -789,7 +865,7 @@ class PatchTSTForPreTrainingOutput(ModelOutput): loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, nvars, num_patch, patch_len )`): + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, nvars, num_patches, patch_length )`): Prediction outputs of the modeling head. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of @@ -826,11 +902,11 @@ def forward( output_hidden_states: Optional[bool] = None ) -> PatchTSTForPreTrainingOutput: """ - past_values (x): tensor [bs x seq_len x n_vars ] + past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels """ - model_output = self.model(past_values) # x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patch x patch_len] or [bs x nvars x (num_patch+1) x patch_len] if use cls_token + model_output = self.model(past_values) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token + x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) @@ -877,7 +953,7 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x): """ - x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: [bs x n_classes] """ if self.use_cls_token: @@ -930,7 +1006,7 @@ def __init__(self, config: PatchTSTConfig): self.n_vars = config.input_size self.use_cls_token = config.use_cls_token self.pooling = config.pooling - head_dimension = config.d_model if config.pooling else config.d_model * config.num_patch + head_dimension = config.d_model if config.pooling else config.d_model * config.num_patches if self.individual: self.linears = nn.ModuleList() @@ -948,8 +1024,8 @@ def __init__(self, config: 
PatchTSTConfig): def forward(self, x: torch.Tensor): """ - x: [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x: [bs x nvars x num_patches x d_model] + or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: [bs x forecast_len x nvars] """ if self.use_cls_token: @@ -960,18 +1036,18 @@ def forward(self, x: torch.Tensor): elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: - y = x # y: [bs x nvars x num_patch x d_model] + y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] for i in range(self.n_vars): - z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patch)] or [bs x d_model)] + z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.linears[i](z) # z: [bs x forecast_len] z = self.dropouts[i](z) x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patch)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) x = self.linear(z) # x: [bs x nvars x forecast_len] @@ -1075,7 +1151,7 @@ def __init__(self, config: PatchTSTConfig): self.n_vars = config.input_size self.use_cls_token = config.use_cls_token self.pooling = config.pooling - head_dim = config.d_model if self.pooling else config.d_model * config.num_patch + head_dim = config.d_model if self.pooling else config.d_model * config.num_patches if self.individual: self.linears = nn.ModuleList() @@ -1093,8 +1169,8 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x: torch.Tensor): """ - x: [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + x: [bs x nvars x num_patches x d_model] + or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: [bs x forecast_len x nvars] """ @@ -1106,18 +1182,18 @@ def forward(self, x: torch.Tensor): elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: - y = x # y: [bs x nvars x num_patch x d_model] + y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] for i in range(self.n_vars): - z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patch)] or [bs x d_model)] + z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.linears[i](z) # z: [bs x forecast_len] z = self.dropouts[i](z) x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patch)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) x = self.linear(z) # x: [bs x nvars x forecast_len] From bc22a87ccb6791b8fb1d5a9a9918b1415797ee64 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 27 Aug 2023 23:48:42 +0700 Subject: [PATCH 015/189] Update configuration_patchtst.py --- src/transformers/models/patchtst/configuration_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index c1547601335353..24d741867fd2d3 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -215,7 +215,7 @@ def __init__( # PatchTST self.patch_length = patch_length self.stride = stride - 
self.num_patch = self._num_patches() + self.num_patches = self._num_patches() self.attention_type = attention_type self.sampling_factor = sampling_factor self.distil = distil From 9799a5bf9c16857085871439c387d371cedbe97e Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 28 Aug 2023 20:12:56 +0700 Subject: [PATCH 016/189] edit loss which considers mask in the pretraining --- .../models/patchtst/modeling_patchtst.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index df7bc3281a37ab..6bddab979e4d28 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -164,11 +164,12 @@ def set_seed(x=42): if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + def random_masking( xb: torch.Tensor, mask_ratio: float, unmasked_channel_indices: list = None, - channel_consistent_masking: bool = True, + channel_consistent_masking: bool = False, mask_value=0, ): """random_masking: Mask the input considering the control variables. @@ -194,14 +195,14 @@ def random_masking( else: noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L - mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] - mask[:, :, :len_keep] = 0 + mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] + mask[:, :, :len_keep] = 0 # sort noise for each sample ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] - mask = torch.gather(mask, dim=-1, index=ids_restore) + mask = torch.gather(mask, dim=-1, index=ids_restore) mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -255,7 +256,7 @@ def forward(self, past_values: torch.Tensor): x: output tensor data [bs x n_vars x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model ({self.sequence_length})." + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." 
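# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the assert above
# guards the unfold-based patchification. A minimal, self-contained version of
# that slicing is sketched below; the helper name is hypothetical, and the
# patch-count formula is the one used elsewhere in this series,
# (max(context_length, patch_length) - patch_length) // stride + 1.
import torch

def sketch_patchify(past_values: torch.Tensor, patch_length: int, stride: int) -> torch.Tensor:
    # past_values: [bs x sequence_length x n_vars]
    sequence_length = past_values.shape[-2]
    num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1
    # drop the oldest time steps so an integer number of strides fits (mirrors s_begin above)
    s_begin = sequence_length - (patch_length + stride * (num_patches - 1))
    x = past_values[:, s_begin:, :]                               # [bs x tgt_len x n_vars]
    x = x.unfold(dimension=1, size=patch_length, step=stride)     # [bs x num_patches x n_vars x patch_length]
    return x.transpose(1, 2).contiguous()                         # [bs x n_vars x num_patches x patch_length]

# e.g. sketch_patchify(torch.randn(2, 32, 7), patch_length=8, stride=8).shape -> [2, 7, 4, 8]
# ---------------------------------------------------------------------------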
x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( @@ -803,10 +804,12 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): - def __init__(self, config: PatchTSTConfig): + def __init__(self, config: PatchTSTConfig, mask_input: bool = False): super().__init__(config) self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) - if config.mask_input: + self.mask_input = mask_input #config.mask_input + + if self.mask_input: self.masking = PatchMasking( mask_type=config.mask_type, mask_ratio=config.mask_ratio, @@ -829,7 +832,10 @@ def forward(self, output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain - masked_values, mask = self.masking(patched_values) + if self.mask_input: + masked_values, mask = self.masking(patched_values) + else: + masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, hidden_states=encoder_output.hidden_states, @@ -891,10 +897,10 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - config.mask_input = True - self.model = PatchTSTModel(config) + # config.mask_input = True + self.model = PatchTSTModel(config=config, mask_input=True) self.head = PretrainHead(config) - self.loss = torch.nn.MSELoss(reduction=None) + self.loss = torch.nn.MSELoss(reduction='none') def forward( self, past_values: torch.Tensor, @@ -910,7 +916,7 @@ def forward( # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) - masked_loss = (loss_val * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) + masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) return PatchTSTForPreTrainingOutput( loss=masked_loss, From 78f317377aa9cc94e5644a52f19580460cfb6eb4 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 28 Aug 2023 20:13:10 +0700 Subject: [PATCH 017/189] remove patch_last option --- src/transformers/models/patchtst/configuration_patchtst.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 24d741867fd2d3..a85c7035eccc36 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -148,7 +148,7 @@ def __init__( positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, - patch_last: bool = True, + individual: bool = False, seed_number= None, mask_input: Optional[bool] = None, @@ -156,7 +156,7 @@ def __init__( mask_ratio=0.5, mask_patches: list = [2, 3], mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = True, + channel_consistent_masking: bool = False, d_size: str = "4D", unmasked_channel_indices: list = None, mask_value=0, @@ -209,7 +209,7 @@ def __init__( 
self.positional_encoding = positional_encoding self.learn_pe = learn_pe self.use_cls_token = use_cls_token - self.patch_last = patch_last + # self.patch_last = patch_last self.individual = individual # PatchTST From 30819f69069b2dd536685ea43474c5fc90abf19f Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Mon, 28 Aug 2023 10:32:17 -0400 Subject: [PATCH 018/189] Add commits from internal repo --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6bddab979e4d28..862cf2253d0585 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -27,7 +27,7 @@ from transformers.utils import ModelOutput from torch.nn.modules.activation import MultiheadAttention # from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig -from patchtst.configuration_patchtst import PatchTSTConfig +from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) From 2060fb07c25f6597e9b29111c272f509f494f8c5 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Mon, 28 Aug 2023 11:30:16 -0400 Subject: [PATCH 019/189] Update ForecastHead --- .../models/patchtst/modeling_patchtst.py | 145 ++++++++++-------- 1 file changed, 78 insertions(+), 67 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 862cf2253d0585..f7d873cfdb4ea5 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -26,14 +26,12 @@ from transformers.modeling_outputs import BaseModelOutputWithNoAttention from transformers.utils import ModelOutput from torch.nn.modules.activation import MultiheadAttention -# from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "PatchTSTConfig" - PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = [ "ibm/patchtst-base", # See all PatchTST models at https://huggingface.co/models?filter=patchtst @@ -132,8 +130,9 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= i = 0 for i in range(100): cpe = ( - 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - - 1 + 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * ( + torch.linspace(0, 1, d_model).reshape(1, -1) ** x) + - 1 ) if abs(cpe.mean()) <= eps: @@ -164,13 +163,12 @@ def set_seed(x=42): if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) - def random_masking( - xb: torch.Tensor, - mask_ratio: float, - unmasked_channel_indices: list = None, - channel_consistent_masking: bool = False, - mask_value=0, + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = False, + mask_value=0, ): """random_masking: Mask the input considering the control variables. 
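# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the hunk above only
# re-indents random_masking, but together with PATCH 016 it defines the pretraining
# recipe: mask random patches, then average the MSE over the masked patches only.
# The two helpers below are hypothetical simplifications of that logic; the final
# masked_fill application step is an assumption (only the mask construction and the
# masked-loss expression appear verbatim in the diffs).
import torch

def sketch_random_patch_mask(xb: torch.Tensor, mask_ratio: float = 0.5, mask_value: float = 0.0):
    # xb: [bs x nvars x num_patches x patch_length]
    bs, nvars, num_patches, _ = xb.shape
    len_keep = int(num_patches * (1 - mask_ratio))
    noise = torch.rand(bs, nvars, num_patches, device=xb.device)   # one score per patch
    mask = torch.ones(bs, nvars, num_patches, device=xb.device)
    mask[:, :, :len_keep] = 0
    ids_shuffle = torch.argsort(noise, dim=-1)                     # small noise -> kept
    ids_restore = torch.argsort(ids_shuffle, dim=-1)
    mask = torch.gather(mask, dim=-1, index=ids_restore)           # 1 marks a masked patch
    xb_masked = xb.masked_fill(mask.unsqueeze(-1).bool(), mask_value)
    return xb_masked, mask                                         # mask: [bs x nvars x num_patches]

def sketch_masked_mse(x_hat: torch.Tensor, target: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # elementwise loss, then average only over masked patches (cf. PATCH 016)
    loss = torch.nn.functional.mse_loss(x_hat, target, reduction="none")
    return (loss.mean(dim=-1) * mask).sum() / (mask.sum() + 1e-10)
# ---------------------------------------------------------------------------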
@@ -227,16 +225,16 @@ class Patchify(nn.Module): """ def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + self, + sequence_length: int, + patch_length: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" self.sequence_length = sequence_length @@ -276,17 +274,18 @@ class PatchEmbeddings(nn.Module): Returns: embeddings: output tensor data [bs x n_vars x num_patches x embed_dim] """ + def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - embed_dim: int + self, + sequence_length: int, + patch_length: int, + stride: int, + embed_dim: int ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" @@ -320,16 +319,16 @@ def forward(self, past_values: torch.Tensor): x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] # convert past_values to shape [bs*n_vars x 1 x sequence_length ] - x = x.transpose(1, 2).reshape(bs*n_vars, 1, -1).contiguous() + x = x.transpose(1, 2).reshape(bs * n_vars, 1, -1).contiguous() # projection - embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] + embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] # reshape - embeddings = embeddings.transpose(1, 2).view(bs, n_vars, -1, self.embed_dim).contiguous() # embeddings: [bs x n_vars x num_patches x emb_dim] + embeddings = embeddings.transpose(1, 2).view(bs, n_vars, -1, + self.embed_dim).contiguous() # embeddings: [bs x n_vars x num_patches x emb_dim] # embeddings = embeddings.flatten(2).transpose(1, 2) return embeddings - class PatchMasking(nn.Module): """ PatchMasking: Class to random or forcast masking. @@ -344,16 +343,17 @@ class PatchMasking(nn.Module): channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. 
""" + def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None ): if seed_number: @@ -400,7 +400,6 @@ def forward(self, x: torch.Tensor): return x_mask, mask - class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -476,35 +475,43 @@ def forward(self, src: torch.Tensor): bs, n_vars, sequence_length, d_model = src.shape # First sublayer: attention across time - src = src.view(bs*n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view(bs * n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path1(self.self_attn(self.norm_sublayer1(src)) ) # Add: residual connection with residual dropout + src = src + self.dropout_path1( + self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src) ) ) # src: [(bs*nvars) x sequence_length x d_model] - src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = self.norm_sublayer1( + src + self.dropout_path1(self.self_attn(src))) # src: [(bs*nvars) x sequence_length x d_model] + src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] - src = src.transpose(2, 1).contiguous().view(bs*sequence_length, n_vars, d_model) # [(bs*sequence_length) x nvars x d_model] + src = src.transpose(2, 1).contiguous().view(bs * sequence_length, n_vars, + d_model) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path2(self.self_attn(self.norm_sublayer2(src)) ) # Add: residual connection with residual dropout + src = src + self.dropout_path2( + self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src) ) ) # src: [(bs*sequence_length) x nvars x d_model] - src = src.reshape(bs, sequence_length, n_vars, d_model).transpose(1,2).contiguous() # src: [bs x nvars x sequence_length x d_model] + src = self.norm_sublayer2( + src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] + src = src.reshape(bs, sequence_length, n_vars, d_model).transpose(1, + 2).contiguous() # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view(bs*n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view(bs * 
n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection - src = src + self.dropout_path3(self.ff( self.norm_sublayer3(src) )) # Add: residual connection with residual dropout + src = src + self.dropout_path3( + self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout - src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = self.norm_sublayer3( + src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout + src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -547,9 +554,11 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, + config.d_model) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, config.d_model) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, + config.d_model) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -560,7 +569,8 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> torch.Tensor: + def forward(self, past_values: torch.Tensor, + output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: """ x: tensor [bs x nvars x num_patches x patch_length] return: @@ -595,7 +605,7 @@ def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool # Encoder past_values, hidden_states = self.encoder( past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] - # or [bs x nvars x (num_patches+1) x d_model] if use cls_token + # or [bs x nvars x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states return BaseModelOutputWithNoAttention( @@ -779,8 +789,6 @@ def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", PATCHTST_START_DOCSTRING, ) - - class PatchTSTModelOutputWithNoAttention(ModelOutput): """ Base class for model's outputs, with potential hidden states. 
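# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the encoder-layer hunks
# above alternate attention over time and attention over channels purely by reshaping
# the [bs x nvars x sequence_length x d_model] tensor. The stand-in module below
# reproduces that reshaping pattern with stock nn.MultiheadAttention; it is a
# simplified assumption, not the actual ChannelAttentionTSTEncoderLayer (which also
# adds norms, dropout paths and a feed-forward sublayer).
import torch
import torch.nn as nn

class SketchTimeThenChannelAttention(nn.Module):
    def __init__(self, d_model: int = 16, num_heads: int = 2):
        super().__init__()
        self.time_attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        self.channel_attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)

    def forward(self, src: torch.Tensor) -> torch.Tensor:
        bs, n_vars, seq_len, d_model = src.shape                   # [bs x nvars x seq x d_model]
        # attention across time: fold the channel dimension into the batch
        x = src.view(bs * n_vars, seq_len, d_model)
        x = x + self.time_attn(x, x, x)[0]
        x = x.view(bs, n_vars, seq_len, d_model)
        # attention across channels: fold the time dimension into the batch
        x = x.transpose(2, 1).contiguous().view(bs * seq_len, n_vars, d_model)
        x = x + self.channel_attn(x, x, x)[0]
        return x.view(bs, seq_len, n_vars, d_model).transpose(1, 2).contiguous()

# e.g. SketchTimeThenChannelAttention()(torch.randn(2, 7, 12, 16)).shape -> [2, 7, 12, 16]
# ---------------------------------------------------------------------------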
@@ -807,7 +815,7 @@ class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig, mask_input: bool = False): super().__init__(config) self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) - self.mask_input = mask_input #config.mask_input + self.mask_input = mask_input # config.mask_input if self.mask_input: self.masking = PatchMasking( @@ -826,12 +834,13 @@ def __init__(self, config: PatchTSTConfig, mask_input: bool = False): def forward(self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor]=None, + future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - patched_values = self.patching(past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain + patched_values = self.patching( + past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain if self.mask_input: masked_values, mask = self.masking(patched_values) else: @@ -903,7 +912,7 @@ def __init__(self, config: PatchTSTConfig): self.loss = torch.nn.MSELoss(reduction='none') def forward( - self, past_values: torch.Tensor, + self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None ) -> PatchTSTForPreTrainingOutput: @@ -911,8 +920,10 @@ def forward( past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels """ - model_output = self.model(past_values) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token - x_hat = self.head(model_output[0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token + model_output = self.model( + past_values) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token + x_hat = self.head(model_output[ + 0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) @@ -1035,14 +1046,14 @@ def forward(self, x: torch.Tensor): output: [bs x forecast_len x nvars] """ if self.use_cls_token: - y = x[:, :, 0, :] # y: [bs x nvars x d_model] + y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: if self.pooling == 'mean': y = x.mean(dim=2) # y: [bs x nvars x d_model] elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: - y = x # y: [bs x nvars x num_patches x d_model] + y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] @@ -1053,7 +1064,7 @@ def forward(self, x: torch.Tensor): x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) x = self.linear(z) # x: [bs x nvars x forecast_len] @@ -1166,7 +1177,7 @@ def __init__(self, config: PatchTSTConfig): for i in range(self.n_vars): self.flattens.append(nn.Flatten(start_dim=2)) self.linears.append(nn.Linear(head_dim, config.prediction_length)) - self.dropouts.append(nn.Dropout(head_dropout) if head_dropout > 0 else nn.Identity() + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else 
nn.Identity() ) else: self.flatten = nn.Flatten(start_dim=2) @@ -1181,14 +1192,14 @@ def forward(self, x: torch.Tensor): """ if self.use_cls_token: - y = x[:, :, 0, :] # y: [bs x nvars x d_model] + y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: if self.pooling == 'mean': y = x.mean(dim=2) # y: [bs x nvars x d_model] elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: - y = x # y: [bs x nvars x num_patches x d_model] + y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] @@ -1199,7 +1210,7 @@ def forward(self, x: torch.Tensor): x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) x = self.linear(z) # x: [bs x nvars x forecast_len] From 271b19bd89fea5db7feb647c0e5a4bc39df8d4c4 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Mon, 28 Aug 2023 18:19:03 -0400 Subject: [PATCH 020/189] Add model weight initilization + unittest --- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/patchtst/__init__.py | 4 + .../models/patchtst/configuration_patchtst.py | 3 +- .../models/patchtst/modeling_patchtst.py | 39 +++- .../models/patchtst/test_modeling_patchtst.py | 188 ++---------------- 5 files changed, 63 insertions(+), 172 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b7cf99b0e0e4ae..d15f166fe6dc54 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -152,6 +152,7 @@ ("openai-gpt", "OpenAIGPTModel"), ("opt", "OPTModel"), ("owlvit", "OwlViTModel"), + ("patchtst", "PatchTSTModel"), ("pegasus", "PegasusModel"), ("pegasus_x", "PegasusXModel"), ("perceiver", "PerceiverModel"), diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 88ed72154b826c..265eef2483805d 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -33,6 +33,7 @@ _import_structure["modeling_patchtst"] = [ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTForPretraining", + "PatchTSTForPrediction" "PatchTSTModel", "PatchTSTPreTrainedModel", ] @@ -51,6 +52,9 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForPretraining, PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForClassification, PatchTSTPreTrainedModel, ) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index a85c7035eccc36..085d886d1bd86d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -148,7 +148,7 @@ def __init__( positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, - + init_std: float = 0.02, individual: bool = False, seed_number= None, mask_input: Optional[bool] = None, @@ -211,6 +211,7 @@ def __init__( self.use_cls_token = use_cls_token # self.patch_last = patch_last self.individual = individual + self.init_std = init_std # PatchTST self.patch_length = patch_length diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f7d873cfdb4ea5..4235d96c7422f2 100755 --- 
a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -169,10 +169,12 @@ def random_masking( unmasked_channel_indices: list = None, channel_consistent_masking: bool = False, mask_value=0, + seed_number: Optional[int] = None ): """random_masking: Mask the input considering the control variables. Args: + seed_number (int, optional): Value to set for the seed number xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. @@ -183,6 +185,9 @@ def random_masking( Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] """ + if seed_number: + set_seed(seed_number) + bs, nvars, L, D = xb.shape len_keep = int(L * (1 - mask_ratio)) @@ -356,8 +361,8 @@ def __init__( seed_number: Optional[int] = None ): - if seed_number: - set_seed(seed_number) + # if seed_number: + # set_seed(seed_number) self.mask_ratio = mask_ratio self.channel_consistent_masking = channel_consistent_masking self.mask_type = mask_type @@ -367,6 +372,7 @@ def __init__( self.mask_value = mask_value if self.unmasked_channel_indices is not None: self.unmasked_channel_indices.sort() + self.seed_number = seed_number super().__init__() @@ -390,6 +396,7 @@ def forward(self, x: torch.Tensor): unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, + seed_number=self.seed_number ) else: @@ -526,6 +533,18 @@ def _init_weights(self, module): """Initialize weights""" if self.config.use_cls_token: torch.nn.init.normal_(self.config.cls_token, std=0.02) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=self.config.init_std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, MultiheadAttention): + module.in_proj_weight.data.normal_(mean=0.0, std=self.config.init_std) + module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) + module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) + def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): @@ -550,7 +569,6 @@ def __init__(self, config: PatchTSTConfig): self.w_p.append(nn.Linear(config.patch_length, config.d_model)) else: self.w_p = nn.Linear(config.patch_length, config.d_model) - # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) @@ -832,6 +850,9 @@ def __init__(self, config: PatchTSTConfig, mask_input: bool = False): self.masking = nn.Identity() self.encoder = ChannelAttentionPatchTSTEncoder(config) + # Initialize weights and apply final processing + self.post_init() + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, @@ -911,6 +932,9 @@ def __init__(self, config: PatchTSTConfig): self.head = PretrainHead(config) self.loss = torch.nn.MSELoss(reduction='none') + # Initialize weights and apply final processing + self.post_init() + def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, @@ -945,6 +969,9 @@ def __init__(self, config: PatchTSTConfig): self.head = ClassificationHead(config) self.loss = nn.CrossEntropyLoss() + # Initialize weights and apply final 
processing + self.post_init() + def forward(self, past_values, future_values=None, output_hidden_states: Optional[bool] = None): model_output = self.model(past_values) y_hat = self.head(model_output[0]) @@ -1110,6 +1137,9 @@ def __init__(self, config: PatchTSTConfig): self.head = PredictionHead(config) self.loss = nn.MSELoss(reduction='mean') + # Initialize weights and apply final processing + self.post_init() + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor], @@ -1227,6 +1257,9 @@ def __init__(self, config: PatchTSTConfig): self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction='mean') + # Initialize weights and apply final processing + self.post_init() + def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor], diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index efca51b1b4f4db..dd2ae7cf8ffb42 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -22,10 +22,10 @@ from huggingface_hub import hf_hub_download from transformers import is_torch_available -from transformers.testing_utils import is_flaky, require_torch, slow, torch_device +from transformers.testing_utils import is_flaky, require_torch, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, _config_zero_init +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -35,7 +35,8 @@ import torch from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig - from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTModel, ChannelAttentionPatchTSTEncoder + from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTModel + # from transformers import PatchTSTConfig, PatchTSTModel, PatchTSTForPrediction @require_torch @@ -112,14 +113,11 @@ def prepare_patchtst_inputs_dict(self, config): # [bs x seq_len x n_vars] past_values = floats_tensor([self.batch_size, _past_length, self.input_size]) - # past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5 future_values = floats_tensor([self.batch_size, config.prediction_length, self.input_size]) inputs_dict = { "past_values": past_values, - # "past_observed_mask": past_observed_mask, - # "future_time_features": future_time_features, "future_values": future_values, } return inputs_dict @@ -133,25 +131,6 @@ def prepare_config_and_inputs_for_common(self): config, inputs_dict = self.prepare_config_and_inputs() return config, inputs_dict - # def check_encoder_model_standalone(self, config, inputs_dict): - # model = PatchTSTModel(config=config).to(torch_device).eval() - # outputs = model(**inputs_dict) - # - # encoder_last_hidden_state = outputs.encoder_last_hidden_state - # - # with tempfile.TemporaryDirectory() as tmpdirname: - # encoder = model.get_encoder() - # encoder.save_pretrained(tmpdirname) - # encoder = ChannelAttentionPatchTSTEncoder.from_pretrained(tmpdirname).to(torch_device) - # - # transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) - # # [bs x seq_len x n_vars] => bs, num_patch, n_vars, patch_len = x.shape - # enc_input = transformer_inputs[:, : config.context_length, ...] 
- # - # encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] - # - # self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): @@ -195,9 +174,6 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - # def test_encoder_model_standalone(self): - # config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - # self.model_tester.check_encoder_model_standalone(*config_and_inputs) # def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): @@ -245,19 +221,19 @@ def test_model_outputs_equivalence(self): def test_determinism(self): pass - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_model_main_input_name(self): + model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) + # The main input is the name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name) - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) + def test_save_load_fast_init_from_base(self): + # super().test_save_load_fast_init_from_base() + pass + + def test_save_load_fast_init_to_base(self): + # super().test_save_load_fast_init_to_base() + pass def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -270,144 +246,20 @@ def test_forward_signature(self): expected_arg_names = [ "past_values", - # "past_time_features", - # "past_observed_mask", - # "static_categorical_features", - # "static_real_features", "future_values", - # "future_time_features", ] expected_arg_names.extend( [ - # "future_observed_mask", - # "decoder_attention_mask", - # "head_mask", - # "decoder_head_mask", - # "cross_attn_head_mask", - # "encoder_outputs", - # "past_key_values", "output_hidden_states", - # "output_attentions", - # "use_cache", - # "return_dict", ] - # if "future_observed_mask" in arg_names - # else [ - # "decoder_attention_mask", - # "head_mask", - # "decoder_head_mask", - # "cross_attn_head_mask", - # "encoder_outputs", - # "past_key_values", - # "output_hidden_states", - # "output_attentions", - # "use_cache", - # "return_dict", - # ] ) self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) -# -# def test_attention_outputs(self): -# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() -# config.return_dict = True -# -# seq_len = getattr(self.model_tester, "seq_length", None) -# decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) -# encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) -# context_length = getattr(self.model_tester, "context_length", seq_len) -# prediction_length = getattr(self.model_tester, "prediction_length", seq_len) -# -# for model_class in self.all_model_classes: -# inputs_dict["output_attentions"] = True -# 
inputs_dict["output_hidden_states"] = False -# config.return_dict = True -# model = model_class(config) -# model.to(torch_device) -# model.eval() -# with torch.no_grad(): -# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) -# attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions -# self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) -# -# # check that output_attentions also work using config -# del inputs_dict["output_attentions"] -# config.output_attentions = True -# model = model_class(config) -# model.to(torch_device) -# model.eval() -# with torch.no_grad(): -# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) -# attentions = outputs.encoder_attentions -# self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) -# -# self.assertListEqual( -# list(attentions[0].shape[-3:]), -# [self.model_tester.num_attention_heads, encoder_seq_length, context_length], -# ) -# out_len = len(outputs) -# -# correct_outlen = 7 -# -# if "last_hidden_state" in outputs: -# correct_outlen += 1 -# -# if "past_key_values" in outputs: -# correct_outlen += 1 # past_key_values have been returned -# -# if "loss" in outputs: -# correct_outlen += 1 -# -# if "params" in outputs: -# correct_outlen += 1 -# -# self.assertEqual(out_len, correct_outlen) -# -# # decoder attentions -# decoder_attentions = outputs.decoder_attentions -# self.assertIsInstance(decoder_attentions, (list, tuple)) -# self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) -# self.assertListEqual( -# list(decoder_attentions[0].shape[-3:]), -# [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length], -# ) -# -# # cross attentions -# cross_attentions = outputs.cross_attentions -# self.assertIsInstance(cross_attentions, (list, tuple)) -# self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) -# self.assertListEqual( -# list(cross_attentions[0].shape[-3:]), -# [ -# self.model_tester.num_attention_heads, -# decoder_seq_length, -# encoder_seq_length, -# ], -# ) -# -# # Check attention is always last and order is fine -# inputs_dict["output_attentions"] = True -# inputs_dict["output_hidden_states"] = True -# model = model_class(config) -# model.to(torch_device) -# model.eval() -# with torch.no_grad(): -# outputs = model(**self._prepare_for_class(inputs_dict, model_class)) -# -# self.assertEqual(out_len + 2, len(outputs)) -# -# self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions -# -# self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) -# self.assertListEqual( -# list(self_attentions[0].shape[-3:]), -# [self.model_tester.num_attention_heads, encoder_seq_length, context_length], -# ) -# -# @is_flaky() -# def test_retain_grad_hidden_states_attentions(self): -# super().test_retain_grad_hidden_states_attentions() + + @is_flaky() + def test_retain_grad_hidden_states_attentions(self): + super().test_retain_grad_hidden_states_attentions() # # # def prepare_batch(filename="train-batch.pt"): From 9325a6a64ecabf07670887c98e4bd91d877f5a8a Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 29 Aug 2023 01:43:11 -0400 Subject: [PATCH 021/189] Update PatchTST unittest to use local import --- src/transformers/__init__.py | 6 ++++++ src/transformers/models/patchtst/__init__.py | 7 ++++--- tests/models/patchtst/test_modeling_patchtst.py | 13 ++----------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git 
a/src/transformers/__init__.py b/src/transformers/__init__.py index 051d4ef647f59c..5b294f4218bb73 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1995,6 +1995,9 @@ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTModel", "PatchTSTPreTrainedModel", + "PatchTSTForPrediction", + "PatchTSTForForecasting", + "PatchTSTForPretraining" ] ) _import_structure["models.instructblip"].extend( @@ -5859,6 +5862,9 @@ from .models.patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForPretraining, PatchTSTPreTrainedModel, ) from .models.instructblip import ( diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 265eef2483805d..35e2a01f166a29 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING # rely on isort to merge the imports -from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { @@ -32,10 +32,11 @@ else: _import_structure["modeling_patchtst"] = [ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", - "PatchTSTForPretraining", - "PatchTSTForPrediction" "PatchTSTModel", "PatchTSTPreTrainedModel", + "PatchTSTForPrediction", + "PatchTSTForForecasting", + "PatchTSTForPretraining" ] diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index dd2ae7cf8ffb42..c99cbe0a74900f 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -33,10 +33,9 @@ if is_torch_available(): import torch + from transformers import PatchTSTConfig + from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining - from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig - from transformers.models.patchtst.modeling_patchtst import PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTModel - # from transformers import PatchTSTConfig, PatchTSTModel, PatchTSTForPrediction @require_torch @@ -227,14 +226,6 @@ def test_model_main_input_name(self): observed_main_input_name = list(model_signature.parameters.keys())[1] self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name) - def test_save_load_fast_init_from_base(self): - # super().test_save_load_fast_init_from_base() - pass - - def test_save_load_fast_init_to_base(self): - # super().test_save_load_fast_init_to_base() - pass - def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() From 0c5deb4126e8688cdc1633351d4aa051c5ba1316 Mon Sep 17 00:00:00 2001 From: Ngoc Diep Do Date: Tue, 29 Aug 2023 13:43:06 +0200 Subject: [PATCH 022/189] PatchTST integration tests for pretraining and prediction --- .../models/patchtst/modeling_patchtst.py | 2 +- .../models/patchtst/test_modeling_patchtst.py | 143 ++++++++---------- 2 files changed, 67 insertions(+), 78 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4235d96c7422f2..87f36dd8eb5867 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1142,7 +1142,7 @@ def __init__(self, config: PatchTSTConfig): 
def forward(self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor], + future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index c99cbe0a74900f..c9a6afa7315757 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -22,7 +22,7 @@ from huggingface_hub import hf_hub_download from transformers import is_torch_available -from transformers.testing_utils import is_flaky, require_torch, torch_device +from transformers.testing_utils import is_flaky, require_torch, torch_device, slow from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor @@ -251,79 +251,68 @@ def test_forward_signature(self): @is_flaky() def test_retain_grad_hidden_states_attentions(self): super().test_retain_grad_hidden_states_attentions() -# -# -# def prepare_batch(filename="train-batch.pt"): -# file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") -# batch = torch.load(file, map_location=torch_device) -# return batch -# -# -# @require_torch -# @slow -# class PatchTSTModelIntegrationTests(unittest.TestCase): -# def test_inference_no_head(self): -# model = PatchTSTModel.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) -# batch = prepare_batch() -# -# torch.manual_seed(0) -# with torch.no_grad(): -# output = model( -# past_values=batch["past_values"], -# past_time_features=batch["past_time_features"], -# past_observed_mask=batch["past_observed_mask"], -# static_categorical_features=batch["static_categorical_features"], -# future_values=batch["future_values"], -# future_time_features=batch["future_time_features"], -# ).last_hidden_state -# expected_shape = torch.Size((64, model.config.context_length, model.config.d_model)) -# self.assertEqual(output.shape, expected_shape) -# -# expected_slice = torch.tensor( -# [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]], -# device=torch_device, -# ) -# self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) -# -# def test_inference_head(self): -# model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) -# batch = prepare_batch("val-batch.pt") -# -# torch.manual_seed(0) -# with torch.no_grad(): -# output = model( -# past_values=batch["past_values"], -# past_time_features=batch["past_time_features"], -# past_observed_mask=batch["past_observed_mask"], -# static_categorical_features=batch["static_categorical_features"], -# future_time_features=batch["future_time_features"], -# ).encoder_last_hidden_state -# -# # encoder distils the context length to 1/8th of the original length -# expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model)) -# self.assertEqual(output.shape, expected_shape) -# -# expected_slice = torch.tensor( -# [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], device=torch_device -# ) -# self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) -# -# def test_seq_to_seq_generation(self): -# model = PatchTSTForPrediction.from_pretrained("huggingface/patchtst-tourism-monthly").to(torch_device) -# batch 
= prepare_batch("val-batch.pt") -# -# torch.manual_seed(0) -# with torch.no_grad(): -# outputs = model.generate( -# static_categorical_features=batch["static_categorical_features"], -# past_time_features=batch["past_time_features"], -# past_values=batch["past_values"], -# future_time_features=batch["future_time_features"], -# past_observed_mask=batch["past_observed_mask"], -# ) -# expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) -# self.assertEqual(outputs.sequences.shape, expected_shape) -# -# expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) -# mean_prediction = outputs.sequences.mean(dim=1) -# self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) + + +def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): + file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") + batch = torch.load(file, map_location=torch_device) + return batch + + +@require_torch +@slow +class PatchTSTModelIntegrationTests(unittest.TestCase): + def test_pretrain_head(self): + model = PatchTSTForPretraining.from_pretrained('diepi/test_patchtst_pretrained_etth1').to(torch_device) + batch = prepare_batch() + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"].to(torch_device) + ).prediction_outputs + num_patch = (max(model.config.context_length, + model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 + expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor([[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], + [[0.0246]], [[0.0090]]], + device=torch_device) + self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) + + # def test_classification_head(self): + # # mock data, test + # model = PatchTSTForClassification.from_pretrained('diepi/test_patchtst_classification_mock').to(torch_device) + # batch = prepare_batch(repo_id="diepi/mock-data", file="test-mock-patchtst.pt") + # + # torch.manual_seed(0) + # with torch.no_grad(): + # output = model( + # past_values=batch["past_values"].to(torch_device) + # ).prediction_logits + # expected_shape = torch.Size([1, model.config.num_classes]) + # self.assertEqual(output.shape, expected_shape) + # + # expected_slice = torch.tensor([[-0.2774, -0.1081, 0.6771]], + # device=torch_device, + # ) + # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) + + def test_prediction_head(self): + model = PatchTSTForPrediction.from_pretrained('diepi/test_patchtst_prediction_etth1').to(torch_device) + batch = prepare_batch(file="test-batch.pt") + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"].to(torch_device), + future_values=batch["future_values"].to(torch_device) + ).prediction_outputs + expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + device=torch_device, + ) + self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From 00c2af6cca1d30a0f180f40ebd2fa214ddb44879 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 29 Aug 2023 23:39:56 -0400 Subject: [PATCH 023/189] Added PatchTSTForRegression + update unittest to 
include label generation --- src/transformers/__init__.py | 10 ++- src/transformers/models/auto/__init__.py | 4 + src/transformers/models/auto/modeling_auto.py | 11 ++- src/transformers/models/patchtst/__init__.py | 5 +- .../models/patchtst/configuration_patchtst.py | 6 ++ .../models/patchtst/modeling_patchtst.py | 90 +++++++++++++++++-- tests/models/bert/test_modeling_bert.py | 3 + .../models/patchtst/test_modeling_patchtst.py | 85 +++++++++++++++--- 8 files changed, 193 insertions(+), 21 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5b294f4218bb73..d6a15e2aeebf93 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1142,6 +1142,8 @@ "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING", "MODEL_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", + "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING", + "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING", "AutoBackbone", "AutoModel", "AutoModelForAudioClassification", @@ -1997,7 +1999,9 @@ "PatchTSTPreTrainedModel", "PatchTSTForPrediction", "PatchTSTForForecasting", - "PatchTSTForPretraining" + "PatchTSTForPretraining", + "PatchTSTForClassification", + "PatchTSTForRegression", ] ) _import_structure["models.instructblip"].extend( @@ -5163,6 +5167,8 @@ MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, AutoBackbone, AutoModel, AutoModelForAudioClassification, @@ -5866,6 +5872,8 @@ PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTPreTrainedModel, + PatchTSTForClassification, + PatchTSTForRegression, ) from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 12d79822fd1d43..6b13313fd6be73 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -76,6 +76,8 @@ "MODEL_WITH_LM_HEAD_MAPPING", "MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING", + "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING", + "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING", "AutoModel", "AutoBackbone", "AutoModelForAudioClassification", @@ -254,6 +256,8 @@ MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, AutoBackbone, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index d15f166fe6dc54..af27c099bceb77 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1110,7 +1110,13 @@ MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ - ("PatchTST", "PatchTSTForClassification"), + ("patchtst", "PatchTSTForClassification"), + ] +) + +MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES = OrderedDict( + [ + ("patchtst", "PatchTSTForRegression"), ] ) @@ -1203,6 +1209,9 @@ CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES ) +MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES +) class AutoModelForMaskGeneration(_BaseAutoModelClass): _model_mapping = MODEL_FOR_MASK_GENERATION_MAPPING diff --git a/src/transformers/models/patchtst/__init__.py 
b/src/transformers/models/patchtst/__init__.py index 35e2a01f166a29..c5836322f55b99 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -36,7 +36,9 @@ "PatchTSTPreTrainedModel", "PatchTSTForPrediction", "PatchTSTForForecasting", - "PatchTSTForPretraining" + "PatchTSTForPretraining", + "PatchTSTForRegression", + "PatchTSTForClassification" ] @@ -56,6 +58,7 @@ PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForClassification, + PatchTSTForRegression, PatchTSTPreTrainedModel, ) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 085d886d1bd86d..4fc0946a51ec74 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -172,6 +172,8 @@ def __init__( is_encoder_decoder: bool = False, encoder_layerdrop: float = 0.1, prediction_length: int = 24, + prediction_range: List = [0, 1], + target_dimension: int = 1, # PatchTST arguments attention_type: str = "prob", @@ -243,6 +245,10 @@ def __init__( # Forcasting self.prediction_length = prediction_length + # Regression + self.target_dimension = target_dimension + self.prediction_range = prediction_range + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) def _num_patches(self): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 87f36dd8eb5867..6932b87ff2792b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -944,8 +944,11 @@ def forward( past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels """ + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) model_output = self.model( - past_values) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token + past_values, output_hidden_states=output_hidden_states) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token x_hat = self.head(model_output[ 0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token @@ -972,13 +975,17 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values, future_values=None, output_hidden_states: Optional[bool] = None): - model_output = self.model(past_values) + def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output[0]) loss_val = None - if future_values is not None: - loss_val = self.loss(y_hat, future_values) + if labels is not None: + loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( loss=loss_val, prediction_logits=y_hat, @@ -1129,7 +1136,7 @@ class PatchTSTForPredictionOutput(ModelOutput): class PatchTSTForPrediction(PatchTSTPreTrainedModel): - # PatchTST model + classification head + # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1250,7 +1257,7 @@ def forward(self, x: torch.Tensor): 
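A minimal usage sketch of the classification path wired up above, assuming the classes and config arguments as they stand at this point in the series; every concrete value below is an arbitrary example rather than something taken from the patch:

import torch
from transformers import PatchTSTConfig, PatchTSTForClassification

# Arbitrary example sizes (assumptions, not values from this patch).
config = PatchTSTConfig(
    input_size=7,
    context_length=32,
    patch_length=8,
    stride=8,
    num_classes=3,
)
model = PatchTSTForClassification(config)

past_values = torch.randn(4, config.context_length, config.input_size)  # [bs x seq_len x n_vars]
labels = torch.randint(0, config.num_classes, (4,))                     # one class id per series

outputs = model(past_values=past_values, labels=labels)
print(outputs.loss, outputs.prediction_logits.shape)                    # logits: [bs x num_classes]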
class PatchTSTForForecasting(PatchTSTPreTrainedModel): - # PatchTST model + classification head + # PatchTST model + Forecasting head def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) @@ -1279,3 +1286,72 @@ def forward(self, hidden_states=model_output.hidden_states ) + +class RegressionHead(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.y_range = config.prediction_range + self.use_cls_token = config.use_cls_token + self.pooling = config.pooling + # self.is_flatten = is_flatten + + self.flatten = nn.Flatten(start_dim=1) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + input_dim = config.input_size * config.d_model + # if is_flatten: input_dim *= num_patch + self.linear = nn.Linear(input_dim, config.target_dimension) + + def forward(self, past_values): + """ + x: [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x output_dim] + """ + + if self.use_cls_token: + past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + elif self.pooling == 'mean': + past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] + elif self.pooling == 'max': + past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] + else: + raise Exception(f'pooling operator {self.pooling} is not implemented yet') + # flatten the input + past_values = self.flatten(past_values) # x: bs x nvars * d_model + y = self.linear(self.dropout(past_values)) # y: bs x output_dim + + if self.y_range: + y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] + + return y + + +class PatchTSTForRegression(PatchTSTPreTrainedModel): + # PatchTST model + Regression head + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + self.model = PatchTSTModel(config) + self.head = RegressionHead(config) + self.loss = nn.MSELoss(reduction='mean') + + # Initialize weights and apply final processing + self.post_init() + + def forward(self, + past_values: torch.Tensor, + labels: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + model_output = self.model(past_values, output_hidden_states=output_hidden_states) + y_hat = self.head(model_output[0]) + loss_val = None + if labels is not None: + loss_val = self.loss(y_hat, labels) + return PatchTSTForForecastingOutput( + loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states + ) + \ No newline at end of file diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 9aec91367d8dda..dd0afca3ec81a2 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -485,6 +485,9 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_save_load(self): + super().test_save_load() + def test_model_various_embeddings(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() for type in ["absolute", "relative_key", "relative_key_query"]: diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index c9a6afa7315757..39951b9ae2f373 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ 
b/tests/models/patchtst/test_modeling_patchtst.py @@ -23,9 +23,10 @@ from transformers import is_torch_available from transformers.testing_utils import is_flaky, require_torch, torch_device, slow - +from transformers.models.auto import get_values +import random from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -33,8 +34,9 @@ if is_torch_available(): import torch - from transformers import PatchTSTConfig - from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining + from transformers import PatchTSTConfig, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING + from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, \ + PatchTSTForClassification, PatchTSTForRegression @@ -61,7 +63,9 @@ def __init__( lags_sequence=[1, 2, 3, 4, 5], sampling_factor=10, distil=False, - seed_number=42 + seed_number=42, + num_classes=2, + target_dimension=2, ): self.parent = parent self.batch_size = batch_size @@ -85,9 +89,11 @@ def __init__( sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length ) self.seed_number = seed_number + self.num_classes = num_classes + self.target_dimension = target_dimension self.sampling_factor = sampling_factor self.distil = distil - self.num_patch = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 def get_config(self): return PatchTSTConfig( @@ -103,7 +109,9 @@ def get_config(self): attention_dropout=self.attention_probs_dropout_prob, context_length=self.context_length, activation_function=self.hidden_act, - seed_number=self.seed_number + seed_number=self.seed_number, + num_classes=self.num_classes, + target_dimension=self.target_dimension ) def prepare_patchtst_inputs_dict(self, config): @@ -133,7 +141,16 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () + all_model_classes = ( + (PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForPretraining, + PatchTSTForClassification, + PatchTSTForRegression) + if is_torch_available() + else () + ) all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} is_encoder_decoder = False @@ -163,6 +180,22 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + # if classification model: + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): + rng = random.Random(self.model_tester.seed_number) + labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_classes, rng=rng) + inputs_dict["labels"] = labels + inputs_dict.pop("future_values") + elif model_class in 
get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): + rng = random.Random(self.model_tester.seed_number) + labels = floats_tensor([self.model_tester.batch_size, self.model_tester.target_dimension], rng=rng) + inputs_dict["labels"] = labels + inputs_dict.pop("future_values") + return inputs_dict + def test_save_load_strict(self): config, _ = self.model_tester.prepare_config_and_inputs() for model_class in self.all_model_classes: @@ -190,7 +223,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): ) self.assertEqual(len(hidden_states), expected_num_layers) - num_patch = self.model_tester.num_patch + num_patch = self.model_tester.num_patches self.assertListEqual( list(hidden_states[0].shape[-2:]), [num_patch, self.model_tester.hidden_size], @@ -211,12 +244,13 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) # # # Ignore since we have no tokens embeddings + def test_resize_tokens_embeddings(self): pass def test_model_outputs_equivalence(self): pass -# + def test_determinism(self): pass @@ -239,7 +273,10 @@ def test_forward_signature(self): "past_values", "future_values", ] - + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or \ + model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): + expected_arg_names.remove("future_values") + expected_arg_names.append("labels") expected_arg_names.extend( [ "output_hidden_states", @@ -316,3 +353,29 @@ def test_prediction_head(self): device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) + + # def test_seq_to_seq_generation(self): + # model = PatchTSTForPrediction.from_pretrained("diepi/test_patchtst_prediction_etth1").to(torch_device) + # batch = prepare_batch("val-batch.pt") + # + # torch.manual_seed(0) + # with torch.no_grad(): + # outputs = model.generate( + # past_values=batch["past_values"].to(torch_device), + # future_values=batch["future_values"].to(torch_device) + # ).prediction_outputs + # expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) + # # self.assertEqual(outputs.sequences.shape, expected_shape) + # # + # # expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) + # # mean_prediction = outputs.sequences.mean(dim=1) + # # self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) + # + # # expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) + # self.assertEqual(outputs.shape, expected_shape) + # + # expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + # device=torch_device, + # ) + # self.assertTrue(torch.allclose(outputs[0, :1, :7], expected_slice, atol=TOLERANCE)) + From 5802f073917d479efe5de230a2e4e69ac759fc6a Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 29 Aug 2023 23:43:34 -0400 Subject: [PATCH 024/189] Revert unrelated model test file --- src/transformers/models/patchtst/modeling_patchtst.py | 1 - tests/models/bert/test_modeling_bert.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6932b87ff2792b..d48c21d5c9889d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1354,4 +1354,3 @@ def forward(self, forecast_outputs=y_hat, 
hidden_states=model_output.hidden_states ) - \ No newline at end of file diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index dd0afca3ec81a2..9aec91367d8dda 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -485,9 +485,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_save_load(self): - super().test_save_load() - def test_model_various_embeddings(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() for type in ["absolute", "relative_key", "relative_key_query"]: From 3a6643804057d3a45bb04956a3212c4067fc899c Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 30 Aug 2023 00:15:38 -0400 Subject: [PATCH 025/189] Combine similar output classes --- .../models/patchtst/modeling_patchtst.py | 55 +++++-------------- .../models/patchtst/test_modeling_patchtst.py | 6 +- 2 files changed, 16 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d48c21d5c9889d..fc2b0ea188dd70 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -893,16 +893,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class PatchTSTForPreTrainingOutput(ModelOutput): +class PatchTSTOutput(ModelOutput): """ - Output type of [`BertForPreTraining`]. + Output type of [`PatchTSTForPredictiontion`]. Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, nvars, num_patches, patch_length )`): - Prediction outputs of the modeling head. + MSE loss. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
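The regression head added earlier in this series is driven the same way, with float `labels` of shape `[batch_size, target_dimension]`; a hedged sketch, using the class and argument names as they appear in the patch and arbitrary sizes:

import torch
from transformers import PatchTSTConfig, PatchTSTForRegression

# Arbitrary example sizes (assumptions, not values from this patch).
config = PatchTSTConfig(input_size=7, context_length=32, patch_length=8, stride=8, target_dimension=2)
model = PatchTSTForRegression(config)

past_values = torch.randn(4, config.context_length, config.input_size)  # [bs x seq_len x n_vars]
labels = torch.randn(4, config.target_dimension)                        # float targets, one row per series

outputs = model(past_values=past_values, labels=labels)
print(outputs.loss)                                                     # mean-squared error against `labels`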
@@ -917,7 +916,7 @@ class PatchTSTForPreTrainingOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_outputs: torch.FloatTensor = None + prediction_output: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -939,7 +938,7 @@ def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None - ) -> PatchTSTForPreTrainingOutput: + ) -> PatchTSTOutput: """ past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels @@ -956,9 +955,9 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTForPreTrainingOutput( + return PatchTSTOutput( loss=masked_loss, - prediction_outputs=x_hat, + prediction_output=x_hat, hidden_states=model_output.hidden_states ) @@ -1107,34 +1106,6 @@ def forward(self, x: torch.Tensor): return x -class PatchTSTForPredictionOutput(ModelOutput): - """ - Output type of [`PatchTSTForPredictiontion`]. - - Args: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
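The masked reconstruction loss computed above only counts patches that were actually masked; a tiny self-contained numeric check of that reduction (toy tensors, not values from any test):

import torch

# Toy shapes: [bs x n_vars x num_patches x patch_length]
x_hat = torch.zeros(1, 1, 3, 4)
target = torch.ones(1, 1, 3, 4)
mask = torch.tensor([[[1.0, 0.0, 1.0]]])   # patches 0 and 2 were masked, patch 1 was not

loss = torch.nn.MSELoss(reduction="none")(x_hat, target)               # per-element squared error, all 1.0 here
masked_loss = (loss.mean(dim=-1) * mask).sum() / (mask.sum() + 1e-10)
print(masked_loss)                                                     # ~1.0: the unmasked patch does not contribute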
- """ - - loss: Optional[torch.FloatTensor] = None - prediction_outputs: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - class PatchTSTForPrediction(PatchTSTPreTrainedModel): # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): @@ -1160,9 +1131,9 @@ def forward(self, loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) - return PatchTSTForPredictionOutput( + return PatchTSTOutput( loss=loss_val, - prediction_outputs=y_hat, + prediction_output=y_hat, hidden_states=model_output.hidden_states ) @@ -1349,8 +1320,8 @@ def forward(self, loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTForForecastingOutput( + return PatchTSTOutput( loss=loss_val, - forecast_outputs=y_hat, + prediction_output=y_hat, hidden_states=model_output.hidden_states ) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 39951b9ae2f373..778bbda0e6bb5a 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -307,7 +307,7 @@ def test_pretrain_head(self): with torch.no_grad(): output = model( past_values=batch["past_values"].to(torch_device) - ).prediction_outputs + ).prediction_output num_patch = (max(model.config.context_length, model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) @@ -345,7 +345,7 @@ def test_prediction_head(self): output = model( past_values=batch["past_values"].to(torch_device), future_values=batch["future_values"].to(torch_device) - ).prediction_outputs + ).prediction_output expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) self.assertEqual(output.shape, expected_shape) @@ -363,7 +363,7 @@ def test_prediction_head(self): # outputs = model.generate( # past_values=batch["past_values"].to(torch_device), # future_values=batch["future_values"].to(torch_device) - # ).prediction_outputs + # ).prediction_output # expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) # # self.assertEqual(outputs.sequences.shape, expected_shape) # # From 00ddf8d81612e5dbd2b59e003b83b38844c8e816 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 30 Aug 2023 14:14:27 +0700 Subject: [PATCH 026/189] update PredictionHead --- .../models/patchtst/modeling_patchtst.py | 66 +++++++------------ 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index fc2b0ea188dd70..ece30f97c4b372 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1052,58 +1052,41 @@ class PatchTSTForClassificationOutput(ModelOutput): class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.individual = config.individual - self.n_vars = config.input_size + + self.target_dimension = config.target_dimension self.use_cls_token = config.use_cls_token self.pooling = config.pooling - head_dimension = config.d_model if config.pooling else config.d_model * config.num_patches - if self.individual: - self.linears = nn.ModuleList() - self.dropouts = nn.ModuleList() - self.flattens = nn.ModuleList() - for i in 
range(self.n_vars): - self.flattens.append(nn.Flatten(start_dim=2)) - self.linears.append(nn.Linear(head_dimension, config.prediction_length)) - self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - ) - else: - self.flatten = nn.Flatten(start_dim=2) - self.linear = nn.Linear(head_dimension, config.prediction_length) - self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + head_dim = config.input_size * config.d_model - def forward(self, x: torch.Tensor): + self.flatten = nn.Flatten(start_dim=1) + self.linear = nn.Linear(head_dim, config.prediction_length * config.target_dimension) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + + def forward(self, x): """ - x: [bs x nvars x num_patches x d_model] - or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: [bs x forecast_len x nvars] + x: [bs x nvars x num_patch x d_model] + or [bs x nvars x (num_patch+1) x d_model] if use cls_token + output: [bs x pred_len x target_dimension] """ + batch_size = x.shape[0] if self.use_cls_token: - y = x[:, :, 0, :] # y: [bs x nvars x d_model] + x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + elif self.pooling == 'mean': + x = x.mean(dim=2) # x: [bs x nvars x d_model] + elif self.pooling == 'max': + x = x.max(dim=2) # x: [bs x nvars x d_model] else: - if self.pooling == 'mean': - y = x.mean(dim=2) # y: [bs x nvars x d_model] - elif self.pooling == 'max': - y = x.max(dim=2) # y: [bs x nvars x d_model] - else: - y = x # y: [bs x nvars x num_patches x d_model] + raise Exception(f'pooling operator {self.pooling} is not implemented yet') - if self.individual: - x_out = [] - for i in range(self.n_vars): - z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] - z = self.linears[i](z) # z: [bs x forecast_len] - z = self.dropouts[i](z) - x_out.append(z) - x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] - else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] - z = self.dropout(z) - x = self.linear(z) # x: [bs x nvars x forecast_len] + # flatten the input + x = self.flatten(x) # x: bs x (nvars * d_model) + y = self.linear(self.dropout(x)) # y: bs x (pred_len * target_dimension) - x = x.transpose(2, 1) # [bs x forecast_len x nvars] + # reshape the data + y = y.reshape(batch_size, -1, self.target_dimension) # [bs x pred_len x target_dimension] + return y - return x class PatchTSTForPrediction(PatchTSTPreTrainedModel): @@ -1122,6 +1105,7 @@ def forward(self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None): + output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) From 78c26f2f7e36236cc2a6ad6699d9e2949d64257b Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 30 Aug 2023 14:14:39 +0700 Subject: [PATCH 027/189] Update configuration_patchtst.py --- .../models/patchtst/configuration_patchtst.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 4fc0946a51ec74..9a3bd99c2d3414 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -163,7 +163,7 @@ def __init__( pooling: str = 'mean', 
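To make the flatten/reshape in the rewritten PredictionHead above concrete, here is a hedged shape walkthrough with arbitrary toy dimensions; the patch-count arithmetic mirrors the formula used in the tests earlier in this series:

import torch
from torch import nn

# Arbitrary toy dimensions (assumptions for illustration only).
bs, n_vars, d_model = 2, 7, 16
context_length, patch_length, stride = 512, 12, 12
prediction_length, target_dimension = 96, 1

num_patches = (max(context_length, patch_length) - patch_length) // stride + 1  # (512 - 12) // 12 + 1 = 42

x = torch.randn(bs, n_vars, num_patches, d_model)       # encoder output
x = x.mean(dim=2)                                       # mean-pool over patches -> [bs, n_vars, d_model]
x = nn.Flatten(start_dim=1)(x)                          # -> [bs, n_vars * d_model]
y = nn.Linear(n_vars * d_model, prediction_length * target_dimension)(x)
y = y.reshape(bs, -1, target_dimension)                 # -> [bs, prediction_length, target_dimension]
print(y.shape)                                          # torch.Size([2, 96, 1])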
num_classes: int = 1, head_dropout: float = 0.0, - proj_dropout: float = 0.0, + # proj_dropout: float = 0.0, qkv_bias: bool = True, num_dynamic_real_features: int = 0, num_static_real_features: int = 0, @@ -211,9 +211,8 @@ def __init__( self.positional_encoding = positional_encoding self.learn_pe = learn_pe self.use_cls_token = use_cls_token - # self.patch_last = patch_last - self.individual = individual self.init_std = init_std + self.qkv_bias = qkv_bias # PatchTST self.patch_length = patch_length @@ -235,14 +234,16 @@ def __init__( self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value - # Classification + # general head params + self.individual = individual self.pooling = pooling - self.num_classes = num_classes self.head_dropout = head_dropout - self.proj_dropout = proj_dropout - self.qkv_bias = qkv_bias - # Forcasting + # Classification + self.num_classes = num_classes + # self.proj_dropout = proj_dropout + + # Forcasting and prediction self.prediction_length = prediction_length # Regression From 5f7c1a06e1d0fe8982d95f8409f0cffc6a9bef22 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 30 Aug 2023 11:20:00 -0400 Subject: [PATCH 028/189] Add Revin --- .../models/patchtst/configuration_patchtst.py | 4 +- .../models/patchtst/modeling_patchtst.py | 121 ++++++++++++++++-- 2 files changed, 113 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 9a3bd99c2d3414..feb6d324d52565 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -150,7 +150,8 @@ def __init__( use_cls_token: bool = False, init_std: float = 0.02, individual: bool = False, - seed_number= None, + seed_number: int = None, + revin: Optional[bool] = True, mask_input: Optional[bool] = None, mask_type: str = "random", mask_ratio=0.5, @@ -213,6 +214,7 @@ def __init__( self.use_cls_token = use_cls_token self.init_std = init_std self.qkv_bias = qkv_bias + self.revin = revin # PatchTST self.patch_length = patch_length diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ece30f97c4b372..a03ed0c17c1ef4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -826,14 +826,75 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): hidden_states: Optional[Tuple[torch.FloatTensor]] = None patched_input: torch.FloatTensor = None mask: torch.FloatTensor = None + revin_mean: torch.FloatTensor = None + revin_std: torch.FloatTensor = None + + +class RevIN(nn.Module): + def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): + """ + :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x n_vars] + :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm input here. 
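Conceptually, the RevIN block being added here standardizes each input series with its own mean and standard deviation before patching and restores them on the way out; a plain-torch sketch of the two modes (toy tensor, `eps` as in the default above):

import torch

x = torch.randn(4, 32, 7)                               # [bs x seq_len x n_vars], toy values
mean = x.mean(dim=1, keepdim=True).detach()
stdev = torch.sqrt(x.var(dim=1, keepdim=True, unbiased=False) + 1e-5).detach()

normed = (x - mean) / stdev                             # "norm" mode: fed to the patcher/encoder
restored = normed * stdev + mean                        # "denorm" mode: applied to the model's outputs
print(torch.allclose(restored, x, atol=1e-5))           # True up to floating-point error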
+ """ + super(RevIN, self).__init__() + self.stdev = None + self.mean = None + self.start_dim = start_dim + self.denorm_channels = denorm_channels + self.eps = eps + + def set_statistics(self, mean, stdev): + self.mean = mean + self.stdev = stdev + + def forward(self, x, mode: str): + if mode == 'norm': + self._get_statistics(x) + x = self._normalize(x) + elif mode == 'denorm': + x = self._denormalize(x) + elif mode == "transform": + x = self._normalize(x) + + else: + raise NotImplementedError + return x + + def _get_statistics(self, x): + dim2reduce = tuple(range(self.start_dim, x.ndim - 1)) + self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() + self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() + + def _normalize(self, x): + x = x - self.mean + x = x / self.stdev + return x + + def _denormalize(self, x): + + if self.denorm_channels is None: + x = x * self.stdev + x = x + self.mean + else: + x = x * self.stdev[..., self.denorm_channels] + x = x + self.mean[..., self.denorm_channels] + + return x # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): - def __init__(self, config: PatchTSTConfig, mask_input: bool = False): + def __init__(self, config: PatchTSTConfig): super().__init__(config) + self.use_revin = config.revin + + if self.use_revin: + self.revin = RevIN() + else: + self.revin = nn.Identity() + self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) - self.mask_input = mask_input # config.mask_input + self.mask_input = config.mask_input if self.mask_input: self.masking = PatchMasking( @@ -860,6 +921,9 @@ def forward(self, output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + + past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] + patched_values = self.patching( past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain if self.mask_input: @@ -870,7 +934,9 @@ def forward(self, return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, hidden_states=encoder_output.hidden_states, patched_input=patched_values, - mask=mask + mask=mask, + revin_mean=self.revin.mean, + revin_stdev=self.revin.stdev ) @@ -926,8 +992,8 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - # config.mask_input = True - self.model = PatchTSTModel(config=config, mask_input=True) + config.mask_input = True + self.model = PatchTSTModel(config=config) self.head = PretrainHead(config) self.loss = torch.nn.MSELoss(reduction='none') @@ -946,10 +1012,14 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model( - past_values, output_hidden_states=output_hidden_states) # x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token - x_hat = self.head(model_output[ - 0]) # tensor [bs x nvars x num_patches x patch_length] or [bs x nvars x (num_patches+1) x patch_length] if use cls_token + + # past_values: [bs x nvars x num_patches x d_model] or + # [bs x nvars x (num_patches+1) x 
d_model] if use cls_token + model_output = self.model(past_values, output_hidden_states=output_hidden_states) + + # model_output[0]: [bs x nvars x num_patches x patch_length] or + # [bs x nvars x (num_patches+1) x patch_length] if use cls_token + x_hat = self.head(model_output[0]) # calculate masked_loss loss_val = self.loss(x_hat, model_output.patched_input) @@ -1097,6 +1167,11 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) self.loss = nn.MSELoss(reduction='mean') + self.use_revin = config.revin + if self.use_revin: + self.revin = RevIN() + else: + self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1112,6 +1187,10 @@ def forward(self, model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) + if self.use_revin: + self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) + y_hat = self.revin(y_hat, mode="denorm") + loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) @@ -1218,6 +1297,11 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction='mean') + self.use_revin = config.revin + if self.use_revin: + self.revin = RevIN() + else: + self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1230,7 +1314,12 @@ def forward(self, output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output[0]) + + y_hat = self.head(model_output.last_hidden_state) + + if self.use_revin: + self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) + y_hat = self.revin(y_hat, mode="denorm") loss_val = None if future_values is not None: @@ -1288,6 +1377,11 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = RegressionHead(config) self.loss = nn.MSELoss(reduction='mean') + self.use_revin = config.revin + if self.use_revin: + self.revin = RevIN() + else: + self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1300,7 +1394,12 @@ def forward(self, output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output[0]) + y_hat = self.head(model_output.last_hidden_state) + + if self.use_revin: + self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) + y_hat = self.revin(y_hat, mode="denorm") + loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) From ac8882e7caefc781b3cb784e483b6e4a76044425 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 30 Aug 2023 23:31:32 +0700 Subject: [PATCH 029/189] small edit to PatchTSTModelOutputWithNoAttention --- .../models/patchtst/modeling_patchtst.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a03ed0c17c1ef4..330c79b71c1f0c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -812,14 +812,20 @@ class 
PatchTSTModelOutputWithNoAttention(ModelOutput): Base class for model's outputs, with potential hidden states. Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - patched_input + patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): + patched input to the Transformer + mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) + Bool masked tensor indicating which patches are masked + revin_mean: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length + revin_std: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + std of the input data (batch_size, sequence_length, num_channels) over the sequence_length """ last_hidden_state: torch.FloatTensor = None @@ -935,11 +941,10 @@ def forward(self, hidden_states=encoder_output.hidden_states, patched_input=patched_values, mask=mask, - revin_mean=self.revin.mean, - revin_stdev=self.revin.stdev + revin_mean=self.revin.mean if self.use_revin else None, + revin_stdev=self.revin.stdev if self.use_revin else None ) - class PretrainHead(nn.Module): def __init__(self, config): super().__init__() From 2457e587e109d47ff6cf99d0e0112a02c0cf5461 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 1 Sep 2023 00:30:42 +0700 Subject: [PATCH 030/189] Update modeling_patchtst.py --- .../models/patchtst/modeling_patchtst.py | 30 ++++--------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 330c79b71c1f0c..891bd1cce54788 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -38,6 +38,7 @@ ] + class PatchTSTAttention(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -850,6 +851,7 @@ def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): self.eps = eps def set_statistics(self, mean, stdev): + # get statistics self.mean = mean self.stdev = stdev @@ -861,7 +863,6 @@ def forward(self, x, mode: str): x = self._denormalize(x) elif mode == "transform": x = self._normalize(x) - else: raise NotImplementedError return x @@ -877,7 +878,7 @@ def _normalize(self, x): return x def _denormalize(self, x): - + # denormalize the data if self.denorm_channels is None: x = x * self.stdev x = x + self.mean @@ -945,7 +946,7 @@ def forward(self, revin_stdev=self.revin.stdev if self.use_revin else None ) -class PretrainHead(nn.Module): +class MaskPretrainHead(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(config.dropout) @@ -992,14 +993,14 @@ class PatchTSTOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = 
None -class PatchTSTForPretraining(PatchTSTPreTrainedModel): +class PatchTSTForMaskPretraining(PatchTSTPreTrainedModel): # PatchTSTModel + Pretraining Head def __init__(self, config: PatchTSTConfig): super().__init__(config) config.mask_input = True self.model = PatchTSTModel(config=config) - self.head = PretrainHead(config) + self.head = MaskPretrainHead(config) self.loss = torch.nn.MSELoss(reduction='none') # Initialize weights and apply final processing @@ -1172,11 +1173,6 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) self.loss = nn.MSELoss(reduction='mean') - self.use_revin = config.revin - if self.use_revin: - self.revin = RevIN() - else: - self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1192,10 +1188,6 @@ def forward(self, model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) - if self.use_revin: - self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) - y_hat = self.revin(y_hat, mode="denorm") - loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) @@ -1356,7 +1348,6 @@ def forward(self, past_values): or [bs x nvars x (num_patch+1) x d_model] if use cls_token output: [bs x output_dim] """ - if self.use_cls_token: past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] elif self.pooling == 'mean': @@ -1382,11 +1373,6 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = RegressionHead(config) self.loss = nn.MSELoss(reduction='mean') - self.use_revin = config.revin - if self.use_revin: - self.revin = RevIN() - else: - self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1401,10 +1387,6 @@ def forward(self, model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) - if self.use_revin: - self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) - y_hat = self.revin(y_hat, mode="denorm") - loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) From f1658b231c29f2e6f13ef0a6a614e6e365062ed6 Mon Sep 17 00:00:00 2001 From: Ngoc Diep Do Date: Thu, 31 Aug 2023 17:17:53 +0200 Subject: [PATCH 031/189] Updating integration test for forecasting --- tests/models/patchtst/test_modeling_patchtst.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 778bbda0e6bb5a..364306fc8d0256 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -313,8 +313,8 @@ def test_pretrain_head(self): expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], - [[0.0246]], [[0.0090]]], + expected_slice = torch.tensor([[[0.0160]], [[0.0148]], [[0.0090]], [[0.0166]], [[0.0099]], + [[0.0053]], [[0.0090]]], device=torch_device) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) @@ -336,8 +336,8 @@ def test_pretrain_head(self): # ) # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) - def 
test_prediction_head(self): - model = PatchTSTForPrediction.from_pretrained('diepi/test_patchtst_prediction_etth1').to(torch_device) + def test_forecasting_head(self): + model = PatchTSTForForecasting.from_pretrained('diepi/test_patchtst_forecasting_etth1').to(torch_device) batch = prepare_batch(file="test-batch.pt") torch.manual_seed(0) @@ -345,11 +345,11 @@ def test_prediction_head(self): output = model( past_values=batch["past_values"].to(torch_device), future_values=batch["future_values"].to(torch_device) - ).prediction_output + ).forecast_outputs expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + expected_slice = torch.tensor([[-0.9027, 0.3814, -0.8322, 0.4250, -0.7183, -0.0635, -0.8747]], device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From 43707d7b964a68da93e398f93d4a45b8fd49b35c Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Fri, 1 Sep 2023 13:43:34 -0400 Subject: [PATCH 032/189] Fix unittest after class structure changed --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.md | 2 + docs/source/en/model_doc/patchtst.md | 24 ++ src/transformers/__init__.py | 56 +-- src/transformers/models/__init__.py | 2 +- src/transformers/models/auto/__init__.py | 4 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/patchtst/__init__.py | 12 +- .../models/patchtst/configuration_patchtst.py | 9 +- .../models/patchtst/modeling_patchtst.py | 353 +++++++++--------- src/transformers/utils/dummy_pt_objects.py | 58 +++ .../models/patchtst/test_modeling_patchtst.py | 85 +++-- utils/check_repo.py | 3 + 19 files changed, 356 insertions(+), 260 deletions(-) diff --git a/README.md b/README.md index 5253b491bae5b2..a7246572381451 100644 --- a/README.md +++ b/README.md @@ -428,6 +428,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. 
**[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_es.md b/README_es.md index bcd84333ef99f8..62085093026a87 100644 --- a/README_es.md +++ b/README_es.md @@ -405,6 +405,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_hd.md b/README_hd.md index d87ef37e8b23bb..5e93de459461a7 100644 --- a/README_hd.md +++ b/README_hd.md @@ -377,6 +377,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. 
**[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया। +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा। 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index c6b9fb0d790e4e..1067b2e57a25ee 100644 --- a/README_ja.md +++ b/README_ja.md @@ -439,6 +439,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. 
Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) diff --git a/README_ko.md b/README_ko.md index 5d2056e4f7207b..202d3d4893561a 100644 --- a/README_ko.md +++ b/README_ko.md @@ -354,6 +354,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 23ecd11c23218d..8fe1633f181115 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -378,6 +378,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。 +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 3c05c1962f5114..9c615363e61a81 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -390,6 +390,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ee2d984c981341..b9f65477b5fe1c 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -194,6 +194,7 @@ The documentation is organized into five sections: 1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[PatchTST](model_doc/patchtst)** (from ) released with the paper []() by . 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. @@ -414,6 +415,7 @@ Flax), PyTorch, and/or TensorFlow. | OpenLlama | ✅ | ❌ | ❌ | | OPT | ✅ | ✅ | ✅ | | OWL-ViT | ✅ | ❌ | ❌ | +| PatchTST | ✅ | ❌ | ❌ | | Pegasus | ✅ | ✅ | ✅ | | PEGASUS-X | ✅ | ❌ | ❌ | | Perceiver | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 14523d65c70f3d..9a30b8294571b0 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -47,4 +47,28 @@ The original code can be found [here](). 
## PatchTSTForPrediction [[autodoc]] PatchTSTForPrediction + - forward + + +## PatchTSTForForecasting + +[[autodoc]] PatchTSTForForecasting + - forward + + +## PatchTSTForClassification + +[[autodoc]] PatchTSTForClassification + - forward + + +## PatchTSTForMaskPretraining + +[[autodoc]] PatchTSTForMaskPretraining + - forward + + +## PatchTSTForRegression + +[[autodoc]] PatchTSTForRegression - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d6a15e2aeebf93..9e4fab76a4eba4 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -372,7 +372,6 @@ ], "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"], "models.informer": ["INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "InformerConfig"], - "models.patchtst": ["PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP", "PatchTSTConfig"], "models.instructblip": [ "INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "InstructBlipConfig", @@ -467,6 +466,7 @@ "OwlViTTextConfig", "OwlViTVisionConfig", ], + "models.patchtst": ["PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP", "PatchTSTConfig"], "models.pegasus": ["PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusConfig", "PegasusTokenizer"], "models.pegasus_x": ["PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusXConfig"], "models.perceiver": ["PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PerceiverConfig", "PerceiverTokenizer"], @@ -1133,6 +1133,8 @@ "MODEL_FOR_TEXT_ENCODING_MAPPING", "MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING", "MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING", + "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING", + "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", @@ -1142,8 +1144,6 @@ "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING", "MODEL_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", - "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING", - "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING", "AutoBackbone", "AutoModel", "AutoModelForAudioClassification", @@ -1992,18 +1992,6 @@ "InformerPreTrainedModel", ] ) - _import_structure["models.patchtst"].extend( - [ - "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", - "PatchTSTModel", - "PatchTSTPreTrainedModel", - "PatchTSTForPrediction", - "PatchTSTForForecasting", - "PatchTSTForPretraining", - "PatchTSTForClassification", - "PatchTSTForRegression", - ] - ) _import_structure["models.instructblip"].extend( [ "INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2420,6 +2408,18 @@ "OwlViTVisionModel", ] ) + _import_structure["models.patchtst"].extend( + [ + "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", + "PatchTSTForClassification", + "PatchTSTForForecasting", + "PatchTSTForMaskPretraining", + "PatchTSTForPrediction", + "PatchTSTForRegression", + "PatchTSTModel", + "PatchTSTPreTrainedModel", + ] + ) _import_structure["models.pegasus"].extend( ["PegasusForCausalLM", "PegasusForConditionalGeneration", "PegasusModel", "PegasusPreTrainedModel"] ) @@ -4477,7 +4477,6 @@ ) from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig from .models.informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig - from .models.patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, InstructBlipConfig, @@ -4562,6 +4561,7 @@ OwlViTTextConfig, OwlViTVisionConfig, ) + from .models.patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig from .models.pegasus import 
PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig, PegasusTokenizer from .models.pegasus_x import PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusXConfig from .models.perceiver import PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP, PerceiverConfig, PerceiverTokenizer @@ -5158,6 +5158,8 @@ MODEL_FOR_TEXT_ENCODING_MAPPING, MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING, MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING, + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, @@ -5167,8 +5169,6 @@ MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, - MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, AutoBackbone, AutoModel, AutoModelForAudioClassification, @@ -5865,16 +5865,6 @@ InformerModel, InformerPreTrainedModel, ) - from .models.patchtst import ( - PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, - PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, - PatchTSTForPretraining, - PatchTSTPreTrainedModel, - PatchTSTForClassification, - PatchTSTForRegression, - ) from .models.instructblip import ( INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST, InstructBlipForConditionalGeneration, @@ -6211,6 +6201,16 @@ OwlViTTextModel, OwlViTVisionModel, ) + from .models.patchtst import ( + PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, + PatchTSTForClassification, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForPrediction, + PatchTSTForRegression, + PatchTSTModel, + PatchTSTPreTrainedModel, + ) from .models.pegasus import ( PegasusForCausalLM, PegasusForConditionalGeneration, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 3b958ac5c1df40..cf2a6ce94d37a3 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -105,7 +105,6 @@ idefics, imagegpt, informer, - patchtst, instructblip, jukebox, layoutlm, @@ -152,6 +151,7 @@ openai, opt, owlvit, + patchtst, pegasus, pegasus_x, perceiver, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 6b13313fd6be73..c606cb6c0f967b 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -249,6 +249,8 @@ MODEL_FOR_TEXT_ENCODING_MAPPING, MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING, MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING, + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, @@ -256,8 +258,6 @@ MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING, - MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, AutoBackbone, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index af27c099bceb77..4b1f7b43685da1 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1213,6 +1213,7 @@ CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES ) + class AutoModelForMaskGeneration(_BaseAutoModelClass): _model_mapping = MODEL_FOR_MASK_GENERATION_MAPPING diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 
c5836322f55b99..8979bed2341ab2 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -36,9 +36,9 @@ "PatchTSTPreTrainedModel", "PatchTSTForPrediction", "PatchTSTForForecasting", - "PatchTSTForPretraining", + "PatchTSTForMaskPretraining", "PatchTSTForRegression", - "PatchTSTForClassification" + "PatchTSTForClassification", ] @@ -53,12 +53,12 @@ else: from .modeling_patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, - PatchTSTForPretraining, - PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, PatchTSTForClassification, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForPrediction, PatchTSTForRegression, + PatchTSTModel, PatchTSTPreTrainedModel, ) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index feb6d324d52565..fbdca6db377155 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -14,7 +14,7 @@ # limitations under the License. """PatchTST model configuration""" -from typing import List, Optional, Union +from typing import List, Optional from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -161,7 +161,7 @@ def __init__( d_size: str = "4D", unmasked_channel_indices: list = None, mask_value=0, - pooling: str = 'mean', + pooling: str = "mean", num_classes: int = 1, head_dropout: float = 0.0, # proj_dropout: float = 0.0, @@ -175,17 +175,15 @@ def __init__( prediction_length: int = 24, prediction_range: List = [0, 1], target_dimension: int = 1, - # PatchTST arguments attention_type: str = "prob", sampling_factor: int = 5, distil: bool = True, **kwargs, ): - # time series specific configuration self.context_length = context_length - self.input_size = input_size # n_vars + self.input_size = input_size # n_vars self.num_time_features = num_time_features self.num_dynamic_real_features = num_dynamic_real_features self.num_static_real_features = num_static_real_features @@ -256,4 +254,3 @@ def __init__( def _num_patches(self): return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 - diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 891bd1cce54788..7b485769d7f0c4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 TSFM team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,19 +14,20 @@ # limitations under the License. 
""" PyTorch PatchTST model.""" -from typing import Optional, Tuple -import torch -from torch import nn import math import random +from typing import Optional, Tuple + import numpy as np +import torch +from torch import nn +from torch.nn.modules.activation import MultiheadAttention -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, logging from transformers.modeling_outputs import BaseModelOutputWithNoAttention -from transformers.utils import ModelOutput -from torch.nn.modules.activation import MultiheadAttention +from transformers.modeling_utils import PreTrainedModel from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +from transformers.utils import ModelOutput, add_start_docstrings, logging + logger = logging.get_logger(__name__) @@ -38,7 +39,6 @@ ] - class PatchTSTAttention(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -85,7 +85,7 @@ def forward(self, x): def positional_encoding(pe, learn_pe, q_len, d_model): # Positional encoding - if pe == None: + if pe is None: w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False @@ -131,9 +131,8 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= i = 0 for i in range(100): cpe = ( - 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * ( - torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - - 1 + 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) + - 1 ) if abs(cpe.mean()) <= eps: @@ -161,16 +160,17 @@ def set_seed(x=42): random.seed(x) np.random.seed(x) torch.manual_seed(x) - if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(x) def random_masking( - xb: torch.Tensor, - mask_ratio: float, - unmasked_channel_indices: list = None, - channel_consistent_masking: bool = False, - mask_value=0, - seed_number: Optional[int] = None + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = False, + mask_value=0, + seed_number: Optional[int] = None, ): """random_masking: Mask the input considering the control variables. @@ -178,13 +178,15 @@ def random_masking( seed_number (int, optional): Value to set for the seed number xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. - unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + unmasked_channel_indices (list, optional): + indices of unmasked channels. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): + When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary + across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. 
Returns: - Tensor: xb_mask, masked input, same shape as input - Tensor: Mask tensor of shape [bs x c x n] + Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] """ if seed_number: set_seed(seed_number) @@ -221,26 +223,25 @@ def compute_num_patches(sequence_length, patch_length, stride): class Patchify(nn.Module): """ - A class to patchify the time series sequence into different patches Args: - sequence_length (int, required): input sequence length - patch_length (int, required): patch length - stride (int, required): stride between patches + A class to patchify the time series sequence into different patches + sequence_length (int, required): input sequence length patch_length (int, required): patch length stride (int, + required): stride between patches Returns: z: output tensor data [bs x n_vars x num_patches x patch_length] """ def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + self, + sequence_length: int, + patch_length: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" self.sequence_length = sequence_length @@ -260,9 +261,11 @@ def forward(self, past_values: torch.Tensor): x: output tensor data [bs x n_vars x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + assert ( + sequence_length == self.sequence_length + ), f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." 
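# A quick standalone check of the patching logic above: `unfold` over the time dimension yields
# exactly `compute_num_patches(...)` patches of length `patch_length`. Values are illustrative only.
import torch

batch_size, sequence_length, n_vars = 2, 32, 3
patch_length, stride = 8, 8
past_values = torch.randn(batch_size, sequence_length, n_vars)

# same formula as compute_num_patches / PatchTSTConfig._num_patches
num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1

patches = past_values.unfold(dimension=-2, size=patch_length, step=stride)
# -> [bs x num_patches x n_vars x patch_length]
patches = patches.transpose(1, 2).contiguous()
# -> [bs x n_vars x num_patches x patch_length], the layout consumed by the encoder
assert patches.shape == (batch_size, n_vars, num_patches, patch_length)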
- x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # x: [bs x num_patches x n_vars x patch_length] @@ -272,26 +275,19 @@ def forward(self, past_values: torch.Tensor): class PatchEmbeddings(nn.Module): """ - A class to patchify the time series sequence into different patches Args: - sequence_length (int, required): input sequence length - patch_length (int, required): patch length - stride (int, required): stride between patches + A class to patchify the time series sequence into different patches + sequence_length (int, required): input sequence length patch_length (int, required): patch length stride (int, + required): stride between patches Returns: embeddings: output tensor data [bs x n_vars x num_patches x embed_dim] """ - def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - embed_dim: int - ): + def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_dim: int): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" @@ -307,11 +303,12 @@ def __init__( self.s_begin = sequence_length - new_sequence_length # Embedding - self.projection = nn.Conv1d(in_channels=1, - out_channels=embed_dim, - kernel_size=patch_length, - stride=stride, - ) + self.projection = nn.Conv1d( + in_channels=1, + out_channels=embed_dim, + kernel_size=patch_length, + stride=stride, + ) def forward(self, past_values: torch.Tensor): """ @@ -321,16 +318,19 @@ def forward(self, past_values: torch.Tensor): embeddings: output tensor data [bs x n_vars x num_patches x emb_dim] """ bs, sequence_length, n_vars = past_values.shape - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." + assert ( + sequence_length == self.sequence_length + ), f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." - x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] # convert past_values to shape [bs*n_vars x 1 x sequence_length ] x = x.transpose(1, 2).reshape(bs * n_vars, 1, -1).contiguous() # projection embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] # reshape - embeddings = embeddings.transpose(1, 2).view(bs, n_vars, -1, - self.embed_dim).contiguous() # embeddings: [bs x n_vars x num_patches x emb_dim] + embeddings = ( + embeddings.transpose(1, 2).view(bs, n_vars, -1, self.embed_dim).contiguous() + ) # embeddings: [bs x n_vars x num_patches x emb_dim] # embeddings = embeddings.flatten(2).transpose(1, 2) return embeddings @@ -345,23 +345,25 @@ class PatchMasking(nn.Module): mask_patches (list, optional): List of patch lengths to mask in the end of the data. mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. 
- unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + unmasked_channel_indices (list, optional): + Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): + When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary + across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. """ def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None, ): - # if seed_number: # set_seed(seed_number) self.mask_ratio = mask_ratio @@ -381,11 +383,11 @@ def forward(self, x: torch.Tensor): """ Input: x: patched input - 4D: [bs x n_vars x num_patches x patch_length] + 4D: [bs x n_vars x num_patches x patch_length] Output: x_mask: Masked patched input - 4D: [bs x n_vars x num_patches x patch_length] + 4D: [bs x n_vars x num_patches x patch_length] mask: bool tensor indicating True on masked points 4D: [bs x n_vars x num_patch] """ @@ -397,7 +399,7 @@ def forward(self, x: torch.Tensor): unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, - seed_number=self.seed_number + seed_number=self.seed_number, ) else: @@ -412,17 +414,11 @@ class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList( - [ - ChannelAttentionTSTEncoderLayer(config) - for i in range(config.encoder_layers) - ] - ) + self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x sequence_length x d_model] - Return: + src: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] @@ -476,8 +472,7 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x sequence_length x d_model] - Return: + src: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ bs, n_vars, sequence_length, d_model = src.shape @@ -487,38 +482,46 @@ def forward(self, src: torch.Tensor): if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer1(src)) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1( - src + self.dropout_path1(self.self_attn(src))) # src: [(bs*nvars) x 
sequence_length x d_model] + src + self.dropout_path1(self.self_attn(src)) + ) # src: [(bs*nvars) x sequence_length x d_model] src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] - src = src.transpose(2, 1).contiguous().view(bs * sequence_length, n_vars, - d_model) # [(bs*sequence_length) x nvars x d_model] + src = ( + src.transpose(2, 1).contiguous().view(bs * sequence_length, n_vars, d_model) + ) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer2(src)) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] - src = src.reshape(bs, sequence_length, n_vars, d_model).transpose(1, - 2).contiguous() # src: [bs x nvars x sequence_length x d_model] + src + self.dropout_path2(self.self_attn(src)) + ) # src: [(bs*sequence_length) x nvars x d_model] + src = ( + src.reshape(bs, sequence_length, n_vars, d_model).transpose(1, 2).contiguous() + ) # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden src = src.view(bs * n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( - self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer3(src)) + ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer3( - src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout + src + self.dropout_path3(self.ff(src)) + ) # Add: residual connection with residual dropout src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -546,7 +549,6 @@ def _init_weights(self, module): module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) - def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): module.gradient_checkpointing = value @@ -573,11 +575,13 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, - config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model + ) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, - config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, config.num_patches, config.d_model + ) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if 
config.positional_dropout > 0 else nn.Identity() @@ -588,11 +592,11 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values: torch.Tensor, - output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: + def forward( + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + ) -> BaseModelOutputWithNoAttention: """ - x: tensor [bs x nvars x num_patches x patch_length] - return: + x: tensor [bs x nvars x num_patches x patch_length] return: tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ @@ -623,14 +627,12 @@ def forward(self, past_values: torch.Tensor, # Encoder past_values, hidden_states = self.encoder( - past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] + past_values, output_hidden_states + ) # x: [bs x nvars x num_patches x d_model] # or [bs x nvars x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states - return BaseModelOutputWithNoAttention( - last_hidden_state=past_values, - hidden_states=hidden_states - ) + return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) PATCHTST_START_DOCSTRING = r""" @@ -817,8 +819,8 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of + the model at the output of each layer plus the optional initial embedding outputs. patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) @@ -841,7 +843,8 @@ class RevIN(nn.Module): def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): """ :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x n_vars] - :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm input here. + :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm + input here. 
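# The reversible instance normalization ("RevIN") in this hunk normalizes each series and channel
# with its own mean/stdev before the encoder and maps predictions back afterwards. A minimal sketch
# of the two modes; the eps value and the reduction over the time dimension are assumptions that
# follow the class shown here, not a drop-in replacement for it.
import torch

x = torch.randn(4, 32, 3)                    # [bs x seq_len x n_vars]
mean = x.mean(dim=1, keepdim=True)
stdev = torch.sqrt(x.var(dim=1, keepdim=True, unbiased=False) + 1e-5)

x_norm = (x - mean) / stdev                  # mode="norm": what the encoder sees
y_hat_norm = x_norm[:, -8:, :]               # stand-in for a model output in normalized space
y_hat = y_hat_norm * stdev + mean            # mode="denorm": predictions back on the original scale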
""" super(RevIN, self).__init__() self.stdev = None @@ -856,10 +859,10 @@ def set_statistics(self, mean, stdev): self.stdev = stdev def forward(self, x, mode: str): - if mode == 'norm': + if mode == "norm": self._get_statistics(x) x = self._normalize(x) - elif mode == 'denorm': + elif mode == "denorm": x = self._denormalize(x) elif mode == "transform": x = self._normalize(x) @@ -889,7 +892,6 @@ def _denormalize(self, x): return x -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -912,7 +914,7 @@ def __init__(self, config: PatchTSTConfig): channel_consistent_masking=config.channel_consistent_masking, unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, - seed_number=config.seed_number + seed_number=config.seed_number, ) else: self.masking = nn.Identity() @@ -921,10 +923,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None): + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -932,19 +936,22 @@ def forward(self, past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] patched_values = self.patching( - past_values) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain + past_values + ) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain if self.mask_input: masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) - return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, - hidden_states=encoder_output.hidden_states, - patched_input=patched_values, - mask=mask, - revin_mean=self.revin.mean if self.use_revin else None, - revin_stdev=self.revin.stdev if self.use_revin else None - ) + return PatchTSTModelOutputWithNoAttention( + last_hidden_state=encoder_output.last_hidden_state, + hidden_states=encoder_output.hidden_states, + patched_input=patched_values, + mask=mask, + revin_mean=self.revin.mean if self.use_revin else None, + revin_stdev=self.revin.stdev if self.use_revin else None, + ) + class MaskPretrainHead(nn.Module): def __init__(self, config): @@ -1001,19 +1008,19 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) self.head = MaskPretrainHead(config) - self.loss = torch.nn.MSELoss(reduction='none') + self.loss = torch.nn.MSELoss(reduction="none") # Initialize weights and apply final processing self.post_init() def forward( - self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, ) -> PatchTSTOutput: """ - 
past_values (x): tensor [bs x sequence_length x n_vars ] - future_values (y): labels + past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1031,11 +1038,7 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput( - loss=masked_loss, - prediction_output=x_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1062,9 +1065,7 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, - prediction_logits=y_hat, - hidden_states=model_output.hidden_states + loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states ) @@ -1079,8 +1080,8 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x): """ - x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: [bs x n_classes] + x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: + [bs x n_classes] """ if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model @@ -1148,12 +1149,12 @@ def forward(self, x): batch_size = x.shape[0] if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == 'mean': + elif self.pooling == "mean": x = x.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": x = x.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f'pooling operator {self.pooling} is not implemented yet') + raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input x = self.flatten(x) # x: bs x (nvars * d_model) @@ -1164,7 +1165,6 @@ def forward(self, x): return y - class PatchTSTForPrediction(PatchTSTPreTrainedModel): # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): @@ -1172,16 +1172,17 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None): - + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1191,11 +1192,7 @@ def forward(self, loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput( - loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) class PatchTSTForForecastingOutput(ModelOutput): @@ -1245,8 +1242,7 @@ def __init__(self, config: 
PatchTSTConfig): for i in range(self.n_vars): self.flattens.append(nn.Flatten(start_dim=2)) self.linears.append(nn.Linear(head_dim, config.prediction_length)) - self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - ) + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: self.flatten = nn.Flatten(start_dim=2) self.linear = nn.Linear(head_dim, config.prediction_length) @@ -1262,9 +1258,9 @@ def forward(self, x: torch.Tensor): if self.use_cls_token: y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: - if self.pooling == 'mean': + if self.pooling == "mean": y = x.mean(dim=2) # y: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": y = x.max(dim=2) # y: [bs x nvars x d_model] else: y = x # y: [bs x nvars x num_patches x d_model] @@ -1293,7 +1289,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = ForecastHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") self.use_revin = config.revin if self.use_revin: self.revin = RevIN() @@ -1303,10 +1299,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None): + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1322,9 +1320,7 @@ def forward(self, if future_values is not None: loss_val = self.loss(y_hat, future_values) return PatchTSTForForecastingOutput( - loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states + loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states ) @@ -1350,12 +1346,12 @@ def forward(self, past_values): """ if self.use_cls_token: past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == 'mean': + elif self.pooling == "mean": past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f'pooling operator {self.pooling} is not implemented yet') + raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input past_values = self.flatten(past_values) # x: bs x nvars * d_model y = self.linear(self.dropout(past_values)) # y: bs x output_dim @@ -1372,15 +1368,14 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = RegressionHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - labels: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None): + def forward( + self, past_values: torch.Tensor, labels: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1390,8 +1385,4 @@ def forward(self, loss_val = None if labels is not 
None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput( - loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index ab7b7c18d62f5a..95d56d3caf25c6 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -624,6 +624,12 @@ def __init__(self, *args, **kwargs): MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING = None +MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING = None + + +MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING = None + + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None @@ -5815,6 +5821,58 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class PatchTSTForClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTForForecasting(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTForMaskPretraining(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTForPrediction(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTForRegression(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PatchTSTPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class PegasusForCausalLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 364306fc8d0256..65bbb309c815a0 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -15,6 +15,7 @@ """ Testing suite for the PyTorch PatchTST model. 
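# The tests in this file call the model classes roughly as in the following standalone sketch.
# It is illustrative only: the argument names follow the configuration and forward signatures in
# this patch, and the chosen values are placeholders rather than recommended settings.
import torch
from transformers import PatchTSTConfig, PatchTSTForPrediction

config = PatchTSTConfig(
    input_size=7,            # number of input channels
    context_length=32,
    patch_length=8,
    stride=8,
    prediction_length=24,
)
model = PatchTSTForPrediction(config)

past_values = torch.randn(4, config.context_length, config.input_size)
future_values = torch.randn(4, config.prediction_length, config.input_size)

outputs = model(past_values=past_values, future_values=future_values)
print(outputs.loss, outputs.prediction_output.shape)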
""" import inspect +import random import tempfile import unittest @@ -22,9 +23,9 @@ from huggingface_hub import hf_hub_download from transformers import is_torch_available -from transformers.testing_utils import is_flaky, require_torch, torch_device, slow from transformers.models.auto import get_values -import random +from transformers.testing_utils import is_flaky, require_torch, slow, torch_device + from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -34,10 +35,18 @@ if is_torch_available(): import torch - from transformers import PatchTSTConfig, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING - from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining, \ - PatchTSTForClassification, PatchTSTForRegression + from transformers import ( + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, + PatchTSTConfig, + PatchTSTForClassification, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForPrediction, + PatchTSTForRegression, + PatchTSTModel, + ) @require_torch @@ -111,7 +120,7 @@ def get_config(self): activation_function=self.hidden_act, seed_number=self.seed_number, num_classes=self.num_classes, - target_dimension=self.target_dimension + target_dimension=self.target_dimension, ) def prepare_patchtst_inputs_dict(self, config): @@ -142,16 +151,20 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, - PatchTSTForPretraining, - PatchTSTForClassification, - PatchTSTForRegression) + ( + PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForClassification, + PatchTSTForRegression, + ) if is_torch_available() else () ) - all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () + all_generative_model_classes = ( + (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () + ) pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} is_encoder_decoder = False test_pruning = False @@ -161,7 +174,6 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_inputs_embeds = False test_model_common_attributes = False - test_resize_embeddings = True test_resize_position_embeddings = False test_mismatched_shapes = True @@ -206,7 +218,7 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) -# + # def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -233,7 +245,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True - print('model_class: ', model_class) + print("model_class: ", model_class) check_hidden_states_output(inputs_dict, config, model_class) @@ -242,8 +254,9 @@ def check_hidden_states_output(inputs_dict, config, model_class): config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, 
model_class) -# -# # Ignore since we have no tokens embeddings + + # + # # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass @@ -273,8 +286,9 @@ def test_forward_signature(self): "past_values", "future_values", ] - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or \ - model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING + ): expected_arg_names.remove("future_values") expected_arg_names.append("labels") expected_arg_names.extend( @@ -290,7 +304,7 @@ def test_retain_grad_hidden_states_attentions(self): super().test_retain_grad_hidden_states_attentions() -def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): +def prepare_batch(repo_id="diepi/test-etth1", file="train-batch.pt"): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch @@ -300,22 +314,21 @@ def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): @slow class PatchTSTModelIntegrationTests(unittest.TestCase): def test_pretrain_head(self): - model = PatchTSTForPretraining.from_pretrained('diepi/test_patchtst_pretrained_etth1').to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained("diepi/test_patchtst_pretrained_etth1").to(torch_device) batch = prepare_batch() torch.manual_seed(0) with torch.no_grad(): - output = model( - past_values=batch["past_values"].to(torch_device) - ).prediction_output - num_patch = (max(model.config.context_length, - model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 + output = model(past_values=batch["past_values"].to(torch_device)).prediction_output + num_patch = ( + max(model.config.context_length, model.config.patch_length) - model.config.patch_length + ) // model.config.stride + 1 expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[[0.0160]], [[0.0148]], [[0.0090]], [[0.0166]], [[0.0099]], - [[0.0053]], [[0.0090]]], - device=torch_device) + expected_slice = torch.tensor( + [[[0.0160]], [[0.0148]], [[0.0090]], [[0.0166]], [[0.0099]], [[0.0053]], [[0.0090]]], device=torch_device + ) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # def test_classification_head(self): @@ -337,21 +350,22 @@ def test_pretrain_head(self): # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) def test_forecasting_head(self): - model = PatchTSTForForecasting.from_pretrained('diepi/test_patchtst_forecasting_etth1').to(torch_device) + model = PatchTSTForForecasting.from_pretrained("./hf_etth_forecasting").to(torch_device) batch = prepare_batch(file="test-batch.pt") torch.manual_seed(0) with torch.no_grad(): output = model( past_values=batch["past_values"].to(torch_device), - future_values=batch["future_values"].to(torch_device) + future_values=batch["future_values"].to(torch_device), ).forecast_outputs expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[-0.9027, 0.3814, -0.8322, 0.4250, -0.7183, -0.0635, -0.8747]], - device=torch_device, - ) + expected_slice = torch.tensor( + [[-0.9027, 0.3814, -0.8322, 0.4250, -0.7183, -0.0635, -0.8747]], + 
device=torch_device, + ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) # def test_seq_to_seq_generation(self): @@ -378,4 +392,3 @@ def test_forecasting_head(self): # device=torch_device, # ) # self.assertTrue(torch.allclose(outputs[0, :1, :7], expected_slice, atol=TOLERANCE)) - diff --git a/utils/check_repo.py b/utils/check_repo.py index c46b82b7c67ecb..d28679c78ef58a 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -176,6 +176,9 @@ "TimeSeriesTransformerForPrediction", "InformerForPrediction", "AutoformerForPrediction", + "PatchTSTForForecasting", + "PatchTSTForMaskPretraining", + "PatchTSTForPrediction", "JukeboxVQVAE", "JukeboxPrior", "SamModel", From a69cb59be055650a513d84b72858e078bf545d7a Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Fri, 1 Sep 2023 14:32:21 -0400 Subject: [PATCH 033/189] docstring updates --- .../models/patchtst/configuration_patchtst.py | 11 ++-- .../models/patchtst/modeling_patchtst.py | 55 +++++++++++++++---- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index fbdca6db377155..e18a98d51d1fe5 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -45,9 +45,6 @@ class PatchTSTConfig(PretrainedConfig): input_size (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. - scaling (`string` or `bool`, *optional* defaults to `"mean"`): - Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the - scaler is set to "mean". num_time_features (`int`, *optional*, defaults to 0): The number of time features in the input time series. num_dynamic_real_features (`int`, *optional*, defaults to 0): @@ -155,11 +152,11 @@ def __init__( mask_input: Optional[bool] = None, mask_type: str = "random", mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], + mask_patches: List[int] = [2, 3], + mask_patch_ratios: List[int] = [1, 1], channel_consistent_masking: bool = False, d_size: str = "4D", - unmasked_channel_indices: list = None, + unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, pooling: str = "mean", num_classes: int = 1, @@ -173,7 +170,7 @@ def __init__( is_encoder_decoder: bool = False, encoder_layerdrop: float = 0.1, prediction_length: int = 24, - prediction_range: List = [0, 1], + prediction_range: List[int] = [0, 1], target_dimension: int = 1, # PatchTST arguments attention_type: str = "prob", diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7b485769d7f0c4..285fa961b532c9 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -175,7 +175,6 @@ def random_masking( """random_masking: Mask the input considering the control variables. Args: - seed_number (int, optional): Value to set for the seed number xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. unmasked_channel_indices (list, optional): @@ -184,6 +183,7 @@ def random_masking( When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. 
mask_value (int, optional): Value to use for masking. Defaults to 0. + seed_number (int, optional): Value to set for the random seed. Returns: Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] @@ -351,6 +351,7 @@ class PatchMasking(nn.Module): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. + seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. """ def __init__( @@ -576,11 +577,17 @@ def __init__(self, config: PatchTSTConfig): if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) self.w_pos = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model + config.positional_encoding, + config.learn_pe, + config.num_patches + 1, + config.d_model, ) else: self.w_pos = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches, config.d_model + config.positional_encoding, + config.learn_pe, + config.num_patches, + config.d_model, ) # Positional dropout @@ -596,7 +603,8 @@ def forward( self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ - x: tensor [bs x nvars x num_patches x patch_length] return: + past_values: tensor [bs x nvars x num_patches x patch_length] output_hidden_states (bool, optional): Boolean + indicating if hidden states should be outtput return: tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ @@ -807,7 +815,7 @@ def forward( @add_start_docstrings( - "The bare PatchTST Model outputting raw hidden-states without any specific head on top.", + "The bare PatchTST Model outputting raw hidden-states without any specific head.", PATCHTST_START_DOCSTRING, ) class PatchTSTModelOutputWithNoAttention(ModelOutput): @@ -902,7 +910,11 @@ def __init__(self, config: PatchTSTConfig): else: self.revin = nn.Identity() - self.patching = Patchify(config.context_length, patch_length=config.patch_length, stride=config.stride) + self.patching = Patchify( + config.context_length, + patch_length=config.patch_length, + stride=config.stride, + ) self.mask_input = config.mask_input if self.mask_input: @@ -1038,7 +1050,11 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput( + loss=masked_loss, + prediction_output=x_hat, + hidden_states=model_output.hidden_states, + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1065,7 +1081,9 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states + loss=loss_val, + prediction_logits=y_hat, + hidden_states=model_output.hidden_states, ) @@ -1192,7 +1210,11 @@ def forward( loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput( + loss=loss_val, + prediction_output=y_hat, + 
hidden_states=model_output.hidden_states, + ) class PatchTSTForForecastingOutput(ModelOutput): @@ -1320,7 +1342,9 @@ def forward( if future_values is not None: loss_val = self.loss(y_hat, future_values) return PatchTSTForForecastingOutput( - loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states + loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states, ) @@ -1374,7 +1398,10 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, labels: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + labels: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None, ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1385,4 +1412,8 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput( + loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + ) From 2be37c5823df50424ea65cfcba79e168f511b382 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 2 Sep 2023 01:17:20 +0700 Subject: [PATCH 034/189] change input_size to num_input_channels --- .../models/patchtst/configuration_patchtst.py | 18 +- .../models/patchtst/modeling_patchtst.py | 435 +++++++++--------- .../models/patchtst/test_modeling_patchtst.py | 115 ++--- 3 files changed, 270 insertions(+), 298 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e18a98d51d1fe5..df870b35f2c05d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -14,7 +14,7 @@ # limitations under the License. """PatchTST model configuration""" -from typing import List, Optional +from typing import List, Optional, Union from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -42,7 +42,7 @@ class PatchTSTConfig(PretrainedConfig): context_length (`int`, *optional*, defaults to `prediction_length`): The context length for the encoder. If `None`, the context length will be the same as the `prediction_length`. - input_size (`int`, *optional*, defaults to 1): + num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. 
num_time_features (`int`, *optional*, defaults to 0): @@ -122,7 +122,7 @@ class PatchTSTConfig(PretrainedConfig): def __init__( self, - input_size: int = 1, + num_input_channels: int = 1, context_length: int = 32, patch_length: int = 8, stride: int = 8, @@ -158,7 +158,7 @@ def __init__( d_size: str = "4D", unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, - pooling: str = "mean", + pooling: str = 'mean', num_classes: int = 1, head_dropout: float = 0.0, # proj_dropout: float = 0.0, @@ -170,17 +170,18 @@ def __init__( is_encoder_decoder: bool = False, encoder_layerdrop: float = 0.1, prediction_length: int = 24, - prediction_range: List[int] = [0, 1], - target_dimension: int = 1, + prediction_range: List = [0, 1], + num_output_channels: int = 1, # PatchTST arguments attention_type: str = "prob", sampling_factor: int = 5, distil: bool = True, **kwargs, ): + # time series specific configuration self.context_length = context_length - self.input_size = input_size # n_vars + self.num_input_channels = num_input_channels # n_vars self.num_time_features = num_time_features self.num_dynamic_real_features = num_dynamic_real_features self.num_static_real_features = num_static_real_features @@ -244,10 +245,11 @@ def __init__( self.prediction_length = prediction_length # Regression - self.target_dimension = target_dimension + self.num_output_channels = num_output_channels self.prediction_range = prediction_range super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) def _num_patches(self): return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 285fa961b532c9..6abbfd08839bb2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 TSFM team. All rights reserved. +# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,20 +14,19 @@ # limitations under the License. 
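# Illustrative note (added for clarity; not part of the original patch): how the renamed
# configuration arguments interact with the `_num_patches` helper defined in the
# configuration file above. The concrete values are assumptions for the example, and the
# snippet presumes the PatchTSTConfig class introduced in this series is importable.
from transformers import PatchTSTConfig

config = PatchTSTConfig(num_input_channels=7, context_length=32, patch_length=8, stride=8)
# (max(context_length, patch_length) - patch_length) // stride + 1  ->  (32 - 8) // 8 + 1 == 4
assert config._num_patches() == 4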
""" PyTorch PatchTST model.""" -import math -import random from typing import Optional, Tuple - -import numpy as np import torch from torch import nn -from torch.nn.modules.activation import MultiheadAttention +import math +import random +import numpy as np -from transformers.modeling_outputs import BaseModelOutputWithNoAttention from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, logging +from transformers.modeling_outputs import BaseModelOutputWithNoAttention +from transformers.utils import ModelOutput +from torch.nn.modules.activation import MultiheadAttention from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig -from transformers.utils import ModelOutput, add_start_docstrings, logging - logger = logging.get_logger(__name__) @@ -39,6 +38,7 @@ ] + class PatchTSTAttention(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -85,7 +85,7 @@ def forward(self, x): def positional_encoding(pe, learn_pe, q_len, d_model): # Positional encoding - if pe is None: + if pe == None: w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False @@ -131,8 +131,9 @@ def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps= i = 0 for i in range(100): cpe = ( - 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * (torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - - 1 + 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * ( + torch.linspace(0, 1, d_model).reshape(1, -1) ** x) + - 1 ) if abs(cpe.mean()) <= eps: @@ -160,33 +161,30 @@ def set_seed(x=42): random.seed(x) np.random.seed(x) torch.manual_seed(x) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(x) + if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) def random_masking( - xb: torch.Tensor, - mask_ratio: float, - unmasked_channel_indices: list = None, - channel_consistent_masking: bool = False, - mask_value=0, - seed_number: Optional[int] = None, + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = False, + mask_value=0, + seed_number: Optional[int] = None ): """random_masking: Mask the input considering the control variables. Args: xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. - unmasked_channel_indices (list, optional): - indices of unmasked channels. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): - When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to True. + unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. seed_number (int, optional): Value to set for the random seed. 
Returns: - Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] + Tensor: xb_mask, masked input, same shape as input + Tensor: Mask tensor of shape [bs x c x n] """ if seed_number: set_seed(seed_number) @@ -223,25 +221,26 @@ def compute_num_patches(sequence_length, patch_length, stride): class Patchify(nn.Module): """ - Args: A class to patchify the time series sequence into different patches - sequence_length (int, required): input sequence length patch_length (int, required): patch length stride (int, - required): stride between patches + Args: + sequence_length (int, required): input sequence length + patch_length (int, required): patch length + stride (int, required): stride between patches Returns: - z: output tensor data [bs x n_vars x num_patches x patch_length] + z: output tensor data [bs x num_input_channels x num_patches x patch_length] """ def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + self, + sequence_length: int, + patch_length: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" self.sequence_length = sequence_length @@ -256,38 +255,43 @@ def __init__( def forward(self, past_values: torch.Tensor): """ Args: - past_values (torch.Tensor, required): Input of shape [bs x sequence_length x n_vars] + past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] Returns: - x: output tensor data [bs x n_vars x num_patches x patch_length] + x: output tensor data [bs x num_input_channels x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert ( - sequence_length == self.sequence_length - ), f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." 
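# Illustrative sketch (added for clarity; not part of the original patch): the unfold call
# below is what turns a [bs x sequence_length x num_input_channels] series into patches of
# shape [bs x num_input_channels x num_patches x patch_length]. The toy sizes are assumptions
# chosen only to make the shapes concrete.
import torch

bs, sequence_length, num_input_channels = 2, 32, 7
patch_length, stride = 8, 8
# same formula as compute_num_patches: (max(seq_len, patch_len) - patch_len) // stride + 1
num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1  # -> 4

series = torch.randn(bs, sequence_length, num_input_channels)
patches = series.unfold(dimension=-2, size=patch_length, step=stride)  # [bs x num_patches x num_input_channels x patch_length]
patches = patches.transpose(-2, -3).contiguous()                       # [bs x num_input_channels x num_patches x patch_length]
assert patches.shape == (bs, num_input_channels, num_patches, patch_length)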
- x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( dimension=-2, size=self.patch_length, step=self.stride - ) # x: [bs x num_patches x n_vars x patch_length] - x = x.transpose(-2, -3).contiguous() # xb: [bs x n_vars x num_patches x patch_length] + ) # x: [bs x num_patches x num_input_channels x patch_length] + x = x.transpose(-2, -3).contiguous() # xb: [bs x num_input_channels x num_patches x patch_length] return x class PatchEmbeddings(nn.Module): """ - Args: A class to patchify the time series sequence into different patches - sequence_length (int, required): input sequence length patch_length (int, required): patch length stride (int, - required): stride between patches + Args: + sequence_length (int, required): input sequence length + patch_length (int, required): patch length + stride (int, required): stride between patches Returns: - embeddings: output tensor data [bs x n_vars x num_patches x embed_dim] + embeddings: output tensor data [bs x num_input_channels x num_patches x embed_dim] """ - def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_dim: int): + def __init__( + self, + sequence_length: int, + patch_length: int, + stride: int, + embed_dim: int + ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" @@ -303,34 +307,30 @@ def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_d self.s_begin = sequence_length - new_sequence_length # Embedding - self.projection = nn.Conv1d( - in_channels=1, - out_channels=embed_dim, - kernel_size=patch_length, - stride=stride, - ) + self.projection = nn.Conv1d(in_channels=1, + out_channels=embed_dim, + kernel_size=patch_length, + stride=stride, + ) def forward(self, past_values: torch.Tensor): """ Args: - past_values (torch.Tensor, required): Input of shape [bs x sequence_length x n_vars] + past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] Returns: - embeddings: output tensor data [bs x n_vars x num_patches x emb_dim] + embeddings: output tensor data [bs x num_input_channels x num_patches x emb_dim] """ - bs, sequence_length, n_vars = past_values.shape - assert ( - sequence_length == self.sequence_length - ), f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." + bs, sequence_length, num_input_channels = past_values.shape + assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." 
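# Illustrative sketch (added for clarity; not part of the original patch): the Conv1d
# projection used by PatchEmbeddings embeds each channel independently and yields
# [bs x num_input_channels x num_patches x embed_dim]. The toy sizes below are assumptions.
import torch
from torch import nn

bs, sequence_length, num_input_channels = 2, 32, 7
patch_length, stride, embed_dim = 8, 8, 16
projection = nn.Conv1d(in_channels=1, out_channels=embed_dim, kernel_size=patch_length, stride=stride)

series = torch.randn(bs, sequence_length, num_input_channels)
x = series.transpose(1, 2).reshape(bs * num_input_channels, 1, sequence_length)  # one channel per row
emb = projection(x)                                                  # [bs*num_input_channels x embed_dim x num_patches]
emb = emb.transpose(1, 2).reshape(bs, num_input_channels, -1, embed_dim)
assert emb.shape == (bs, num_input_channels, 4, embed_dim)           # num_patches = (32 - 8) // 8 + 1 = 4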
- x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] - # convert past_values to shape [bs*n_vars x 1 x sequence_length ] - x = x.transpose(1, 2).reshape(bs * n_vars, 1, -1).contiguous() + x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + # convert past_values to shape [bs*num_input_channels x 1 x sequence_length ] + x = x.transpose(1, 2).reshape(bs * num_input_channels, 1, -1).contiguous() # projection - embeddings = self.projection(x) # embeddings: [bs*n_vars x emb_dim x num_patches] + embeddings = self.projection(x) # embeddings: [bs*num_input_channels x emb_dim x num_patches] # reshape - embeddings = ( - embeddings.transpose(1, 2).view(bs, n_vars, -1, self.embed_dim).contiguous() - ) # embeddings: [bs x n_vars x num_patches x emb_dim] + embeddings = embeddings.transpose(1, 2).view(bs, num_input_channels, -1, + self.embed_dim).contiguous() # embeddings: [bs x num_input_channels x num_patches x emb_dim] # embeddings = embeddings.flatten(2).transpose(1, 2) return embeddings @@ -345,26 +345,24 @@ class PatchMasking(nn.Module): mask_patches (list, optional): List of patch lengths to mask in the end of the data. mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, optional): - Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): - When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to True. + unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. 
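    Illustrative example (added for clarity; not part of the original patch). It only shows the
    effect of a patch-level boolean mask with the default mask_value of 0; it does not reproduce
    the exact sampling performed by random_masking, and the toy shapes and the 0.4 ratio are
    assumptions:

        import torch

        patches = torch.randn(2, 7, 4, 8)                    # [bs x num_input_channels x num_patches x patch_length]
        score = torch.rand(2, 7, 4)
        mask = score < 0.4                                    # bool [bs x num_input_channels x num_patches]
        masked = patches.masked_fill(mask.unsqueeze(-1), 0)   # masked patches are filled with mask_value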
""" def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None, + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None ): + # if seed_number: # set_seed(seed_number) self.mask_ratio = mask_ratio @@ -384,13 +382,13 @@ def forward(self, x: torch.Tensor): """ Input: x: patched input - 4D: [bs x n_vars x num_patches x patch_length] + 4D: [bs x num_input_channels x num_patches x patch_length] Output: x_mask: Masked patched input - 4D: [bs x n_vars x num_patches x patch_length] + 4D: [bs x num_input_channels x num_patches x patch_length] mask: bool tensor indicating True on masked points - 4D: [bs x n_vars x num_patch] + 4D: [bs x num_input_channels x num_patch] """ if self.mask_type == "random": @@ -400,13 +398,13 @@ def forward(self, x: torch.Tensor): unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, - seed_number=self.seed_number, + seed_number=self.seed_number ) else: raise Exception("Invalid mask type") - mask = mask.bool() # mask: [bs x n_vars x num_patch] + mask = mask.bool() # mask: [bs x num_input_channels x num_patch] return x_mask, mask @@ -415,11 +413,17 @@ class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) + self.layers = nn.ModuleList( + [ + ChannelAttentionTSTEncoderLayer(config) + for i in range(config.encoder_layers) + ] + ) def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x sequence_length x d_model] Return: + src: tensor [bs x nvars x sequence_length x d_model] + Return: Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] @@ -473,57 +477,50 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x sequence_length x d_model] Return: + src: tensor [bs x nvars x sequence_length x d_model] + Return: Tensor [bs x nvars x sequence_length x d_model] """ - bs, n_vars, sequence_length, d_model = src.shape + bs, num_input_channels, sequence_length, d_model = src.shape # First sublayer: attention across time - src = src.view(bs * n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src)) - ) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1( - src + self.dropout_path1(self.self_attn(src)) - ) # src: [(bs*nvars) x sequence_length x d_model] - src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src + 
self.dropout_path1(self.self_attn(src))) # src: [(bs*nvars) x sequence_length x d_model] + src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] - src = ( - src.transpose(2, 1).contiguous().view(bs * sequence_length, n_vars, d_model) - ) # [(bs*sequence_length) x nvars x d_model] + src = src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, + d_model) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src)) - ) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src)) - ) # src: [(bs*sequence_length) x nvars x d_model] - src = ( - src.reshape(bs, sequence_length, n_vars, d_model).transpose(1, 2).contiguous() - ) # src: [bs x nvars x sequence_length x d_model] + src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] + src = src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, + 2).contiguous() # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view(bs * n_vars, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( - self.ff(self.norm_sublayer3(src)) - ) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer3( - src + self.dropout_path3(self.ff(src)) - ) # Add: residual connection with residual dropout - src = src.reshape(bs, n_vars, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout + src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -550,6 +547,7 @@ def _init_weights(self, module): module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) + def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): module.gradient_checkpointing = value @@ -558,7 +556,7 @@ def _set_gradient_checkpointing(self, module, value=False): class ChannelAttentionPatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.n_vars = config.input_size + self.num_input_channels = config.num_input_channels self.num_patches = config.num_patches self.patch_length = config.patch_length self.d_model = config.d_model @@ -569,26 +567,18 @@ def __init__(self, config: PatchTSTConfig): # Input encoding: projection of feature vectors onto a 
d-dim vector space if not config.shared_embedding: self.w_p = nn.ModuleList() - for _ in range(self.n_vars): + for _ in range(self.num_input_channels): self.w_p.append(nn.Linear(config.patch_length, config.d_model)) else: self.w_p = nn.Linear(config.patch_length, config.d_model) # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding( - config.positional_encoding, - config.learn_pe, - config.num_patches + 1, - config.d_model, - ) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, + config.d_model) else: - self.w_pos = positional_encoding( - config.positional_encoding, - config.learn_pe, - config.num_patches, - config.d_model, - ) + self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, + config.d_model) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -599,17 +589,16 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None - ) -> BaseModelOutputWithNoAttention: + def forward(self, past_values: torch.Tensor, + output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: """ - past_values: tensor [bs x nvars x num_patches x patch_length] output_hidden_states (bool, optional): Boolean - indicating if hidden states should be outtput return: + x: tensor [bs x nvars x num_patches x patch_length] + return: tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ - # bs, num_patches, n_vars, patch_length = x.shape - bs, n_vars, num_patches, patch_length = past_values.shape + # bs, num_patches, num_input_channels, patch_length = x.shape + bs, num_input_channels, num_patches, patch_length = past_values.shape output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -617,7 +606,7 @@ def forward( # Input encoding if not self.shared_embedding: x_out = [] - for i in range(n_vars): + for i in range(num_input_channels): z = self.w_p[i](past_values[:, i, :, :]) x_out.append(z) past_values = torch.stack(x_out, dim=1) @@ -635,12 +624,14 @@ def forward( # Encoder past_values, hidden_states = self.encoder( - past_values, output_hidden_states - ) # x: [bs x nvars x num_patches x d_model] + past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] # or [bs x nvars x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states - return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) + return BaseModelOutputWithNoAttention( + last_hidden_state=past_values, + hidden_states=hidden_states + ) PATCHTST_START_DOCSTRING = r""" @@ -661,7 +652,7 @@ def forward( PATCHTST_INPUTS_DOCSTRING = r""" Args: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`): + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`): Past values of the time series, that serve as context in order to predict the future. The sequence size of this tensor must be larger than the `context_length` of the model, since the model will use the larger size to construct lag features, i.e. 
additional values from the past which are added in order to serve as "extra @@ -677,7 +668,7 @@ def forward( Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`. - For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of + For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of variates in the time series per time step. past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`): Required time features, which the model internally will add to `past_values`. These could be things like @@ -695,7 +686,7 @@ def forward( must but known at prediction time. The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. - past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*): + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`, *optional*): Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in `[0, 1]`: @@ -715,7 +706,7 @@ def forward( Static real features are features which have the same value for all time steps (static over time). A typical example of a static real feature is promotion information. - future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, input_size)`, *optional*): + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, num_input_channels)`, *optional*): Future values of the time series, that serve as labels for the model. The `future_values` is what the Transformer needs during training to learn to output, given the `past_values`. @@ -726,7 +717,7 @@ def forward( Optionally, during training any missing values need to be replaced with zeros and indicated via the `future_observed_mask`. - For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number of + For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of variates in the time series per time step. future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`): Required time features for the prediction window, which the model internally will add to `future_values`. @@ -745,7 +736,7 @@ def forward( must but known at prediction time. The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. - future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*): + future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`, *optional*): Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected in `[0, 1]`: @@ -827,8 +818,8 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): Sequence of hidden-states at the output of the last layer of the model. 
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of - the model at the output of each layer plus the optional initial embedding outputs. + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) @@ -850,9 +841,8 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): class RevIN(nn.Module): def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): """ - :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x n_vars] - :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm - input here. + :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x num_input_channels] + :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm input here. """ super(RevIN, self).__init__() self.stdev = None @@ -867,10 +857,10 @@ def set_statistics(self, mean, stdev): self.stdev = stdev def forward(self, x, mode: str): - if mode == "norm": + if mode == 'norm': self._get_statistics(x) x = self._normalize(x) - elif mode == "denorm": + elif mode == 'denorm': x = self._denormalize(x) elif mode == "transform": x = self._normalize(x) @@ -900,6 +890,7 @@ def _denormalize(self, x): return x +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -926,7 +917,7 @@ def __init__(self, config: PatchTSTConfig): channel_consistent_masking=config.channel_consistent_masking, unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, - seed_number=config.seed_number, + seed_number=config.seed_number ) else: self.masking = nn.Identity() @@ -935,12 +926,10 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward( - self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - ): + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -948,22 +937,19 @@ def forward( past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] patched_values = self.patching( - past_values - ) # patched_values: [bs x n_vars x num_patches x patch_length] for pretrain + past_values) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain if self.mask_input: 
masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) - return PatchTSTModelOutputWithNoAttention( - last_hidden_state=encoder_output.last_hidden_state, - hidden_states=encoder_output.hidden_states, - patched_input=patched_values, - mask=mask, - revin_mean=self.revin.mean if self.use_revin else None, - revin_stdev=self.revin.stdev if self.use_revin else None, - ) - + return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, + hidden_states=encoder_output.hidden_states, + patched_input=patched_values, + mask=mask, + revin_mean=self.revin.mean if self.use_revin else None, + revin_stdev=self.revin.stdev if self.use_revin else None + ) class MaskPretrainHead(nn.Module): def __init__(self, config): @@ -1020,19 +1006,19 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) self.head = MaskPretrainHead(config) - self.loss = torch.nn.MSELoss(reduction="none") + self.loss = torch.nn.MSELoss(reduction='none') # Initialize weights and apply final processing self.post_init() def forward( - self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + self, past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None ) -> PatchTSTOutput: """ - past_values (x): tensor [bs x sequence_length x n_vars ] future_values (y): labels + past_values (x): tensor [bs x sequence_length x num_input_channels ] + future_values (y): labels """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1053,7 +1039,7 @@ def forward( return PatchTSTOutput( loss=masked_loss, prediction_output=x_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) @@ -1083,7 +1069,7 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] return PatchTSTForClassificationOutput( loss=loss_val, prediction_logits=y_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) @@ -1094,12 +1080,12 @@ def __init__(self, config: PatchTSTConfig): self.pooling = config.pooling self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - self.linear = nn.Linear(config.input_size * config.d_model, config.num_classes) + self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_classes) def forward(self, x): """ - x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: - [bs x n_classes] + x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token + output: [bs x n_classes] """ if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model @@ -1148,41 +1134,42 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.target_dimension = config.target_dimension + self.num_output_channels = config.num_output_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling - head_dim = config.input_size * config.d_model + head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) - self.linear = nn.Linear(head_dim, 
config.prediction_length * config.target_dimension) + self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token - output: [bs x pred_len x target_dimension] + output: [bs x pred_len x num_output_channels] """ batch_size = x.shape[0] if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == "mean": + elif self.pooling == 'mean': x = x.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == "max": + elif self.pooling == 'max': x = x.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") + raise Exception(f'pooling operator {self.pooling} is not implemented yet') # flatten the input x = self.flatten(x) # x: bs x (nvars * d_model) - y = self.linear(self.dropout(x)) # y: bs x (pred_len * target_dimension) + y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) # reshape the data - y = y.reshape(batch_size, -1, self.target_dimension) # [bs x pred_len x target_dimension] + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y + class PatchTSTForPrediction(PatchTSTPreTrainedModel): # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): @@ -1190,17 +1177,16 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) - self.loss = nn.MSELoss(reduction="mean") + self.loss = nn.MSELoss(reduction='mean') # Initialize weights and apply final processing self.post_init() - def forward( - self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - ): + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None): + output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1213,7 +1199,7 @@ def forward( return PatchTSTOutput( loss=loss_val, prediction_output=y_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) @@ -1252,7 +1238,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__() self.individual = config.individual - self.n_vars = config.input_size + self.num_input_channels = config.num_input_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling head_dim = config.d_model if self.pooling else config.d_model * config.num_patches @@ -1261,10 +1247,11 @@ def __init__(self, config: PatchTSTConfig): self.linears = nn.ModuleList() self.dropouts = nn.ModuleList() self.flattens = nn.ModuleList() - for i in range(self.n_vars): + for i in range(self.num_input_channels): self.flattens.append(nn.Flatten(start_dim=2)) self.linears.append(nn.Linear(head_dim, config.prediction_length)) - self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + ) else: self.flatten = nn.Flatten(start_dim=2) self.linear = nn.Linear(head_dim, config.prediction_length) @@ -1280,16 +1267,16 @@ def forward(self, x: torch.Tensor): if self.use_cls_token: y = x[:, :, 0, :] # y: 
[bs x nvars x d_model] else: - if self.pooling == "mean": + if self.pooling == 'mean': y = x.mean(dim=2) # y: [bs x nvars x d_model] - elif self.pooling == "max": + elif self.pooling == 'max': y = x.max(dim=2) # y: [bs x nvars x d_model] else: y = x # y: [bs x nvars x num_patches x d_model] if self.individual: x_out = [] - for i in range(self.n_vars): + for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.linears[i](z) # z: [bs x forecast_len] z = self.dropouts[i](z) @@ -1311,7 +1298,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = ForecastHead(config) - self.loss = nn.MSELoss(reduction="mean") + self.loss = nn.MSELoss(reduction='mean') self.use_revin = config.revin if self.use_revin: self.revin = RevIN() @@ -1321,12 +1308,10 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward( - self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None, - ): + def forward(self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1344,7 +1329,7 @@ def forward( return PatchTSTForForecastingOutput( loss=loss_val, forecast_outputs=y_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) @@ -1358,9 +1343,9 @@ def __init__(self, config: PatchTSTConfig): self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - input_dim = config.input_size * config.d_model - # if is_flatten: input_dim *= num_patch - self.linear = nn.Linear(input_dim, config.target_dimension) + head_dim = config.num_input_channels * config.d_model + # if is_flatten: head_dim *= num_patch + self.linear = nn.Linear(head_dim, config.num_output_channels) def forward(self, past_values): """ @@ -1370,12 +1355,12 @@ def forward(self, past_values): """ if self.use_cls_token: past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == "mean": + elif self.pooling == 'mean': past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == "max": + elif self.pooling == 'max': past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") + raise Exception(f'pooling operator {self.pooling} is not implemented yet') # flatten the input past_values = self.flatten(past_values) # x: bs x nvars * d_model y = self.linear(self.dropout(past_values)) # y: bs x output_dim @@ -1392,17 +1377,15 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = RegressionHead(config) - self.loss = nn.MSELoss(reduction="mean") + self.loss = nn.MSELoss(reduction='mean') # Initialize weights and apply final processing self.post_init() - def forward( - self, - past_values: torch.Tensor, - labels: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None, - ): + def forward(self, + past_values: torch.Tensor, + labels: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None): output_hidden_states = ( output_hidden_states if output_hidden_states is 
not None else self.config.output_hidden_states ) @@ -1415,5 +1398,5 @@ def forward( return PatchTSTOutput( loss=loss_val, prediction_output=y_hat, - hidden_states=model_output.hidden_states, + hidden_states=model_output.hidden_states ) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 65bbb309c815a0..911d5160db46c7 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -15,7 +15,6 @@ """ Testing suite for the PyTorch PatchTST model. """ import inspect -import random import tempfile import unittest @@ -23,9 +22,9 @@ from huggingface_hub import hf_hub_download from transformers import is_torch_available +from transformers.testing_utils import is_flaky, require_torch, torch_device, slow from transformers.models.auto import get_values -from transformers.testing_utils import is_flaky, require_torch, slow, torch_device - +import random from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -35,18 +34,10 @@ if is_torch_available(): import torch + from transformers import PatchTSTConfig, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING + from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining, \ + PatchTSTForClassification, PatchTSTForRegression - from transformers import ( - MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, - PatchTSTConfig, - PatchTSTForClassification, - PatchTSTForForecasting, - PatchTSTForMaskPretraining, - PatchTSTForPrediction, - PatchTSTForRegression, - PatchTSTModel, - ) @require_torch @@ -59,7 +50,7 @@ def __init__( context_length=14, patch_length=5, stride=5, - input_size=1, + num_input_channels=1, num_time_features=1, is_training=True, hidden_size=16, @@ -74,7 +65,7 @@ def __init__( distil=False, seed_number=42, num_classes=2, - target_dimension=2, + num_output_channels=2, ): self.parent = parent self.batch_size = batch_size @@ -82,7 +73,7 @@ def __init__( self.context_length = context_length self.patch_length = patch_length self.stride = stride - self.input_size = input_size + self.num_input_channels = num_input_channels self.num_time_features = num_time_features self.lags_sequence = lags_sequence self.is_training = is_training @@ -99,7 +90,7 @@ def __init__( ) self.seed_number = seed_number self.num_classes = num_classes - self.target_dimension = target_dimension + self.num_output_channels = num_output_channels self.sampling_factor = sampling_factor self.distil = distil self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 @@ -109,7 +100,7 @@ def get_config(self): prediction_length=self.prediction_length, patch_length=self.patch_length, stride=self.stride, - input_size=self.input_size, + num_input_channels=self.num_input_channels, d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, encoder_attention_heads=self.num_attention_heads, @@ -120,17 +111,17 @@ def get_config(self): activation_function=self.hidden_act, seed_number=self.seed_number, num_classes=self.num_classes, - target_dimension=self.target_dimension, + num_output_channels=self.num_output_channels ) def prepare_patchtst_inputs_dict(self, config): _past_length = config.context_length - # bs, n_vars, num_patch, patch_len + # bs, num_input_channels, 
num_patch, patch_len - # [bs x seq_len x n_vars] - past_values = floats_tensor([self.batch_size, _past_length, self.input_size]) + # [bs x seq_len x num_input_channels] + past_values = floats_tensor([self.batch_size, _past_length, self.num_input_channels]) - future_values = floats_tensor([self.batch_size, config.prediction_length, self.input_size]) + future_values = floats_tensor([self.batch_size, config.prediction_length, self.num_input_channels]) inputs_dict = { "past_values": past_values, @@ -151,20 +142,16 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - ( - PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, - PatchTSTForMaskPretraining, - PatchTSTForClassification, - PatchTSTForRegression, - ) + (PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForClassification, + PatchTSTForRegression) if is_torch_available() else () ) - all_generative_model_classes = ( - (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () - ) + all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} is_encoder_decoder = False test_pruning = False @@ -174,6 +161,7 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_inputs_embeds = False test_model_common_attributes = False + test_resize_embeddings = True test_resize_position_embeddings = False test_mismatched_shapes = True @@ -203,7 +191,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict.pop("future_values") elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): rng = random.Random(self.model_tester.seed_number) - labels = floats_tensor([self.model_tester.batch_size, self.model_tester.target_dimension], rng=rng) + labels = floats_tensor([self.model_tester.batch_size, self.model_tester.num_output_channels], rng=rng) inputs_dict["labels"] = labels inputs_dict.pop("future_values") return inputs_dict @@ -218,7 +206,7 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - # +# def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -245,7 +233,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True - print("model_class: ", model_class) + print('model_class: ', model_class) check_hidden_states_output(inputs_dict, config, model_class) @@ -254,9 +242,8 @@ def check_hidden_states_output(inputs_dict, config, model_class): config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) - - # - # # Ignore since we have no tokens embeddings +# +# # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass @@ -286,9 +273,8 @@ def test_forward_signature(self): "past_values", "future_values", ] - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING - ): + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) 
or \ + model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): expected_arg_names.remove("future_values") expected_arg_names.append("labels") expected_arg_names.extend( @@ -304,7 +290,7 @@ def test_retain_grad_hidden_states_attentions(self): super().test_retain_grad_hidden_states_attentions() -def prepare_batch(repo_id="diepi/test-etth1", file="train-batch.pt"): +def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch @@ -314,21 +300,22 @@ def prepare_batch(repo_id="diepi/test-etth1", file="train-batch.pt"): @slow class PatchTSTModelIntegrationTests(unittest.TestCase): def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained("diepi/test_patchtst_pretrained_etth1").to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained('diepi/test_patchtst_pretrained_etth1').to(torch_device) batch = prepare_batch() torch.manual_seed(0) with torch.no_grad(): - output = model(past_values=batch["past_values"].to(torch_device)).prediction_output - num_patch = ( - max(model.config.context_length, model.config.patch_length) - model.config.patch_length - ) // model.config.stride + 1 - expected_shape = torch.Size([64, model.config.input_size, num_patch, model.config.patch_length]) + output = model( + past_values=batch["past_values"].to(torch_device) + ).prediction_output + num_patch = (max(model.config.context_length, + model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 + expected_shape = torch.Size([64, model.config.num_input_channels, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[[0.0160]], [[0.0148]], [[0.0090]], [[0.0166]], [[0.0099]], [[0.0053]], [[0.0090]]], device=torch_device - ) + expected_slice = torch.tensor([[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], + [[0.0246]], [[0.0090]]], + device=torch_device) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # def test_classification_head(self): @@ -349,23 +336,22 @@ def test_pretrain_head(self): # ) # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) - def test_forecasting_head(self): - model = PatchTSTForForecasting.from_pretrained("./hf_etth_forecasting").to(torch_device) + def test_prediction_head(self): + model = PatchTSTForPrediction.from_pretrained('diepi/test_patchtst_prediction_etth1').to(torch_device) batch = prepare_batch(file="test-batch.pt") torch.manual_seed(0) with torch.no_grad(): output = model( past_values=batch["past_values"].to(torch_device), - future_values=batch["future_values"].to(torch_device), - ).forecast_outputs - expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) + future_values=batch["future_values"].to(torch_device) + ).prediction_output + expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[-0.9027, 0.3814, -0.8322, 0.4250, -0.7183, -0.0635, -0.8747]], - device=torch_device, - ) + expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + device=torch_device, + ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) # def test_seq_to_seq_generation(self): @@ -385,10 +371,11 @@ def test_forecasting_head(self): # # 
mean_prediction = outputs.sequences.mean(dim=1) # # self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) # - # # expected_shape = torch.Size([64, model.config.prediction_length, model.config.input_size]) + # # expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) # self.assertEqual(outputs.shape, expected_shape) # # expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], # device=torch_device, # ) # self.assertTrue(torch.allclose(outputs[0, :1, :7], expected_slice, atol=TOLERANCE)) + From 22adead4c72d8b13e2f03b445d0b88baaae8853c Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Fri, 1 Sep 2023 14:59:11 -0400 Subject: [PATCH 035/189] more formatting --- .../models/patchtst/modeling_patchtst.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6abbfd08839bb2..7b796500b755de 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -222,10 +222,12 @@ def compute_num_patches(sequence_length, patch_length, stride): class Patchify(nn.Module): """ A class to patchify the time series sequence into different patches + Args: - sequence_length (int, required): input sequence length - patch_length (int, required): patch length - stride (int, required): stride between patches + sequence_length (int, required): input sequence length. + patch_length (int, required): patch length. + stride (int, required): stride between patches. + Returns: z: output tensor data [bs x num_input_channels x num_patches x patch_length] """ @@ -256,6 +258,7 @@ def forward(self, past_values: torch.Tensor): """ Args: past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] + Returns: x: output tensor data [bs x num_input_channels x num_patches x patch_length] """ @@ -274,9 +277,10 @@ class PatchEmbeddings(nn.Module): """ A class to patchify the time series sequence into different patches Args: - sequence_length (int, required): input sequence length - patch_length (int, required): patch length - stride (int, required): stride between patches + sequence_length (int, required): input sequence length. + patch_length (int, required): patch length. + stride (int, required): stride between patches. + Returns: embeddings: output tensor data [bs x num_input_channels x num_patches x embed_dim] """ @@ -592,7 +596,10 @@ def __init__(self, config: PatchTSTConfig): def forward(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: """ - x: tensor [bs x nvars x num_patches x patch_length] + Args: + past_values: tensor [bs x nvars x num_patches x patch_length]. + output_hidden_states (bool, optional): Indicates if hidden states should be output. 
+ return: tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token @@ -890,7 +897,6 @@ def _denormalize(self, x): return x -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerModel with TimeSeriesTransformer->PatchTST,TIME_SERIES_TRANSFORMER->PATCHTST,time-series-transformer->patchtst,TimeSeries->PatchTST class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) From 76adaaefa4dff5d597f12e28cee5d9b2af7633f0 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sat, 2 Sep 2023 23:10:52 +0700 Subject: [PATCH 036/189] Remove some unused params --- .../models/patchtst/configuration_patchtst.py | 44 +++++++------------ .../models/patchtst/test_modeling_patchtst.py | 5 --- 2 files changed, 15 insertions(+), 34 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index df870b35f2c05d..66220c802fc8c6 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -28,6 +28,7 @@ class PatchTSTConfig(PretrainedConfig): + model_type = "patchtst" r""" This is the configuration class to store the configuration of an [`PatchTSTModel`]. It is used to instantiate an PatchTST model according to the specified arguments, defining the model architecture. @@ -89,13 +90,7 @@ class PatchTSTConfig(PretrainedConfig): init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. use_cache (`bool`, *optional*, defaults to `True`): - Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. - attention_type (`str`, *optional*, defaults to "prob"): - Attention used in encoder. This can be set to "prob" (PatchTST's ProbAttention) or "full" (vanilla - transformer's canonical self-attention). - sampling_factor (`int`, *optional*, defaults to 5): - ProbSparse sampling factor (only makes affect when `attention_type`="prob"). It is used to control the - reduced query matrix (Q_reduce) input length. + Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. distil (`bool`, *optional*, defaults to `True`): Whether to use distilling in encoder. 
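The patch count used throughout these diffs follows one formula (see `compute_num_patches` and `PatchTSTConfig._num_patches` elsewhere in this series). A minimal sketch, assuming the documented defaults `context_length=32`, `patch_length=8`, `stride=8`; the helper name here is illustrative and not part of the committed API:

```python
def num_patches(context_length: int, patch_length: int, stride: int) -> int:
    # Same formula as compute_num_patches / PatchTSTConfig._num_patches in this series.
    return (max(context_length, patch_length) - patch_length) // stride + 1

# With the defaults above: (32 - 8) // 8 + 1 = 4 patches per input channel.
print(num_patches(32, 8, 8))  # 4
```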
@@ -113,7 +108,6 @@ class PatchTSTConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "patchtst" attribute_map = { "hidden_size": "d_model", "num_attention_heads": "encoder_attention_heads", @@ -122,10 +116,18 @@ class PatchTSTConfig(PretrainedConfig): def __init__( self, + # time series specific configuration num_input_channels: int = 1, context_length: int = 32, + num_dynamic_real_features: int = 0, + num_static_real_features: int = 0, + num_static_categorical_features: int = 0, + num_time_features: int = 0, + is_encoder_decoder: bool = False, + # PatchTST arguments patch_length: int = 8, stride: int = 8, + # Transformer architecture configuration encoder_layers: int = 3, d_model: int = 128, encoder_attention_heads: int = 16, @@ -149,33 +151,23 @@ def __init__( individual: bool = False, seed_number: int = None, revin: Optional[bool] = True, + qkv_bias: bool = True, + # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", mask_ratio=0.5, mask_patches: List[int] = [2, 3], mask_patch_ratios: List[int] = [1, 1], channel_consistent_masking: bool = False, - d_size: str = "4D", unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, - pooling: str = 'mean', + # head + pooling: str = "mean", num_classes: int = 1, head_dropout: float = 0.0, - # proj_dropout: float = 0.0, - qkv_bias: bool = True, - num_dynamic_real_features: int = 0, - num_static_real_features: int = 0, - num_static_categorical_features: int = 0, - num_time_features: int = 0, - is_encoder_decoder: bool = False, - encoder_layerdrop: float = 0.1, prediction_length: int = 24, prediction_range: List = [0, 1], num_output_channels: int = 1, - # PatchTST arguments - attention_type: str = "prob", - sampling_factor: int = 5, - distil: bool = True, **kwargs, ): @@ -194,7 +186,6 @@ def __init__( self.encoder_layers = encoder_layers self.dropout = dropout self.attention_dropout = attention_dropout - self.encoder_layerdrop = encoder_layerdrop self.shared_embedding = shared_embedding self.channel_attention = channel_attention self.norm = norm @@ -216,11 +207,8 @@ def __init__( self.patch_length = patch_length self.stride = stride self.num_patches = self._num_patches() - self.attention_type = attention_type - self.sampling_factor = sampling_factor - self.distil = distil - # Masking + # Mask pretraining self.seed_number = seed_number self.mask_input = mask_input self.mask_type = mask_type @@ -228,7 +216,6 @@ def __init__( self.mask_patches = mask_patches self.mask_patch_ratios = mask_patch_ratios self.channel_consistent_masking = channel_consistent_masking - self.d_size = d_size self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value @@ -239,7 +226,6 @@ def __init__( # Classification self.num_classes = num_classes - # self.proj_dropout = proj_dropout # Forcasting and prediction self.prediction_length = prediction_length diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 911d5160db46c7..8208070116999a 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -61,7 +61,6 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, lags_sequence=[1, 2, 3, 4, 5], - sampling_factor=10, distil=False, seed_number=42, num_classes=2, @@ -85,13 +84,9 @@ def __init__( self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob - 
self.encoder_seq_length = min( - sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length - ) self.seed_number = seed_number self.num_classes = num_classes self.num_output_channels = num_output_channels - self.sampling_factor = sampling_factor self.distil = distil self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 From 1b078c7d442667215f0b2d1156fc7ea3676d3bf9 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Sat, 2 Sep 2023 21:28:23 -0400 Subject: [PATCH 037/189] Add a comment for pretrained models --- .../models/patchtst/modeling_patchtst.py | 143 ++---------------- .../models/patchtst/test_modeling_patchtst.py | 55 +------ 2 files changed, 18 insertions(+), 180 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7b796500b755de..03e028b9c8e99e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 Amazon and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 TSFM team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -665,54 +665,14 @@ def forward(self, past_values: torch.Tensor, to construct lag features, i.e. additional values from the past which are added in order to serve as "extra context". - The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if no - `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest - look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length of - the past. + The `sequence_length` here is equal to `config.context_length` The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as - `static_categorical_features`, `static_real_features`, `past_time_features` and lags). + `static_categorical_features`, `static_real_features`). - Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`. - - For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of - variates in the time series per time step. - past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`): - Required time features, which the model internally will add to `past_values`. These could be things like - "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These - could also be so-called "age" features, which basically help the model know "at which point in life" a - time-series is. Age features have small values for distant past time steps and increase monotonically the - more we approach the current time step. Holiday features are also a good example of time features. - - These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where - the position encodings are learned from scratch internally as parameters of the model, the Time Series - Transformer requires to provide additional time features. The Time Series Transformer only learns - additional embeddings for `static_categorical_features`. 
- - Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features - must but known at prediction time. - - The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. - past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`, *optional*): - Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in - `[0, 1]`: - - - 1 for values that are **observed**, - - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - - static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): - Optional static categorical features for which the model will learn an embedding, which it will add to the - values of the time series. - - Static categorical features are features which have the same value for all time steps (static over time). - - A typical example of a static categorical feature is a time series ID. - static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*): - Optional static real features which the model will add to the values of the time series. - - Static real features are features which have the same value for all time steps (static over time). - - A typical example of a static real feature is promotion information. + For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the + number of variates in the time series per time step. + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, num_input_channels)`, *optional*): Future values of the time series, that serve as labels for the model. The `future_values` is what the Transformer needs during training to learn to output, given the `past_values`. @@ -721,94 +681,11 @@ def forward(self, past_values: torch.Tensor, See the demo notebook and code snippets for details. - Optionally, during training any missing values need to be replaced with zeros and indicated via the - `future_observed_mask`. - - For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of - variates in the time series per time step. - future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`): - Required time features for the prediction window, which the model internally will add to `future_values`. - These could be things like "month of year", "day of the month", etc. encoded as vectors (for instance as - Fourier features). These could also be so-called "age" features, which basically help the model know "at - which point in life" a time-series is. Age features have small values for distant past time steps and - increase monotonically the more we approach the current time step. Holiday features are also a good example - of time features. - - These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where - the position encodings are learned from scratch internally as parameters of the model, the Time Series - Transformer requires to provide additional time features. The Time Series Transformer only learns - additional embeddings for `static_categorical_features`. 
- - Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these features - must but known at prediction time. - - The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. - future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`, *optional*): - Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected - in `[0, 1]`: - - - 1 for values that are **observed**, - - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - - This mask is used to filter out missing values for the final loss calculation. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to - make sure the model can only look at previous inputs in order to predict the future. - head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*) - `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of - hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. + For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the + number of variates in the time series per time step. + output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + Whether or not to return the hidden states of all layers. """ diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 8208070116999a..8a444f1eecd5d3 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -285,17 +285,20 @@ def test_retain_grad_hidden_states_attentions(self): super().test_retain_grad_hidden_states_attentions() -def prepare_batch(repo_id="diepi/test-etth1", file='train-batch.pt'): +# Note: Publishing of this dataset is under internal review. The dataset is not yet downloadable. +def prepare_batch(repo_id="ibm/etth1", file="train-batch.pt"): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch +# Note: Publishing of pretrained weights is under internal review. Pretrained model is not yet downloadable. @require_torch @slow class PatchTSTModelIntegrationTests(unittest.TestCase): + # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained('diepi/test_patchtst_pretrained_etth1').to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst_pretrained_etth1").to(torch_device) batch = prepare_batch() torch.manual_seed(0) @@ -313,26 +316,10 @@ def test_pretrain_head(self): device=torch_device) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) - # def test_classification_head(self): - # # mock data, test - # model = PatchTSTForClassification.from_pretrained('diepi/test_patchtst_classification_mock').to(torch_device) - # batch = prepare_batch(repo_id="diepi/mock-data", file="test-mock-patchtst.pt") - # - # torch.manual_seed(0) - # with torch.no_grad(): - # output = model( - # past_values=batch["past_values"].to(torch_device) - # ).prediction_logits - # expected_shape = torch.Size([1, model.config.num_classes]) - # self.assertEqual(output.shape, expected_shape) - # - # expected_slice = torch.tensor([[-0.2774, -0.1081, 0.6771]], - # device=torch_device, - # ) - # self.assertTrue(torch.allclose(output, expected_slice, atol=TOLERANCE)) - + # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
def test_prediction_head(self): - model = PatchTSTForPrediction.from_pretrained('diepi/test_patchtst_prediction_etth1').to(torch_device) + model = PatchTSTForPrediction.from_pretrained("ibm/patchtst_prediction_etth1").to(torch_device) + batch = prepare_batch(file="test-batch.pt") torch.manual_seed(0) @@ -348,29 +335,3 @@ def test_prediction_head(self): device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) - - # def test_seq_to_seq_generation(self): - # model = PatchTSTForPrediction.from_pretrained("diepi/test_patchtst_prediction_etth1").to(torch_device) - # batch = prepare_batch("val-batch.pt") - # - # torch.manual_seed(0) - # with torch.no_grad(): - # outputs = model.generate( - # past_values=batch["past_values"].to(torch_device), - # future_values=batch["future_values"].to(torch_device) - # ).prediction_output - # expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) - # # self.assertEqual(outputs.sequences.shape, expected_shape) - # # - # # expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) - # # mean_prediction = outputs.sequences.mean(dim=1) - # # self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) - # - # # expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) - # self.assertEqual(outputs.shape, expected_shape) - # - # expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], - # device=torch_device, - # ) - # self.assertTrue(torch.allclose(outputs[0, :1, :7], expected_slice, atol=TOLERANCE)) - From f718c042d6a08323d270dd521271464c6fde5202 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 3 Sep 2023 23:50:34 +0700 Subject: [PATCH 038/189] add channel_attention option add channel_attention option and remove unused positional encoders. 
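A minimal sketch of what the new `channel_attention` option does to the tensor layout, with illustrative sizes; it mirrors the reshapes in `ChannelAttentionTSTEncoderLayer` below but is not the committed code:

```python
import torch

bs, nvars, seq_len, d_model = 2, 7, 12, 16
src = torch.randn(bs, nvars, seq_len, d_model)

# Sublayer 1: attention across time, one sequence per (sample, channel) pair.
across_time = src.view(bs * nvars, seq_len, d_model)  # [(bs*nvars) x seq_len x d_model]

# Optional sublayer 2 (enabled by config.channel_attention): attention across
# channels at each time step.
across_channels = (
    src.transpose(2, 1).contiguous().view(bs * seq_len, nvars, d_model)
)  # [(bs*seq_len) x nvars x d_model]

print(across_time.shape, across_channels.shape)
```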
--- .../models/patchtst/modeling_patchtst.py | 86 +++++-------------- 1 file changed, 23 insertions(+), 63 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 03e028b9c8e99e..c77a94a4e1c0fb 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -89,26 +89,15 @@ def positional_encoding(pe, learn_pe, q_len, d_model): w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False - elif pe == "zero": - w_pos = torch.empty((q_len, 1)) - nn.init.uniform_(w_pos, -0.02, 0.02) elif pe == "zeros": w_pos = torch.empty((q_len, d_model)) nn.init.uniform_(w_pos, -0.02, 0.02) - elif pe == "normal" or pe == "gauss": + elif pe == "normal": w_pos = torch.zeros((q_len, 1)) torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) elif pe == "uniform": w_pos = torch.zeros((q_len, 1)) nn.init.uniform_(w_pos, a=0.0, b=0.1) - elif pe == "lin1d": - w_pos = coord1d_pos_encoding(q_len, exponential=False, normalize=True) - elif pe == "exp1d": - w_pos = coord1d_pos_encoding(q_len, exponential=True, normalize=True) - elif pe == "lin2d": - w_pos = coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True) - elif pe == "exp2d": - w_pos = coord2d_pos_encoding(q_len, d_model, exponential=True, normalize=True) elif pe == "sincos": pos_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) @@ -120,43 +109,11 @@ def positional_encoding(pe, learn_pe, q_len, d_model): w_pos = pos_enc else: raise ValueError( - f"{pe} is not a valid pe (positional encoder. Available types: 'gauss'=='normal', \ - 'zeros', 'zero', uniform', 'lin1d', 'exp1d', 'lin2d', 'exp2d', 'sincos', None.)" + f"{pe} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." 
) return nn.Parameter(w_pos, requires_grad=learn_pe) -def coord2d_pos_encoding(q_len, d_model, exponential=False, normalize=True, eps=1e-3, verbose=False): - x = 0.5 if exponential else 1 - i = 0 - for i in range(100): - cpe = ( - 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** x) * ( - torch.linspace(0, 1, d_model).reshape(1, -1) ** x) - - 1 - ) - - if abs(cpe.mean()) <= eps: - break - elif cpe.mean() > eps: - x += 0.001 - else: - x -= 0.001 - i += 1 - if normalize: - cpe = cpe - cpe.mean() - cpe = cpe / (cpe.std() * 10) - return cpe - - -def coord1d_pos_encoding(q_len, exponential=False, normalize=True): - cpe = 2 * (torch.linspace(0, 1, q_len).reshape(-1, 1) ** (0.5 if exponential else 1)) - 1 - if normalize: - cpe = cpe - cpe.mean() - cpe = cpe / (cpe.std() * 10) - return cpe - - def set_seed(x=42): random.seed(x) np.random.seed(x) @@ -444,6 +401,7 @@ class ChannelAttentionTSTEncoderLayer(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() + self.channel_attention = config.channel_attention # Multi-Head attention self.self_attn = PatchTSTAttention(config) @@ -455,11 +413,12 @@ def __init__(self, config: PatchTSTConfig): self.norm_sublayer1 = nn.LayerNorm(config.d_model) # Add & Norm of the sublayer 2 - self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() - if "batch" in config.norm.lower(): - self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) - else: - self.norm_sublayer2 = nn.LayerNorm(config.d_model) + if self.channel_attention: + self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() + if "batch" in config.norm.lower(): + self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + else: + self.norm_sublayer2 = nn.LayerNorm(config.d_model) # Position-wise Feed-Forward self.ff = nn.Sequential( @@ -501,18 +460,19 @@ def forward(self, src: torch.Tensor): # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] - src = src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, - d_model) # [(bs*sequence_length) x nvars x d_model] - if self.pre_norm: - ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout - else: - ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] - src = src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, - 2).contiguous() # src: [bs x nvars x sequence_length x d_model] + if self.channel_attention: + src = src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, + d_model) # [(bs*sequence_length) x nvars x d_model] + if self.pre_norm: + ## Norm and Multi-Head attention and Add residual connection + src = src + self.dropout_path2( + self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + else: + ## Multi-Head attention and Add residual connection and Norm + src = self.norm_sublayer2( + src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] + src = src.reshape(bs, sequence_length, 
num_input_channels, d_model).transpose(1, + 2).contiguous() # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] @@ -521,7 +481,7 @@ def forward(self, src: torch.Tensor): src = src + self.dropout_path3( self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout else: - ## Position-wise Feed-Forward and Add residual connection and Norm - Standard Transformer from BERT + ## Position-wise Feed-Forward and Add residual connection and Norm src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] From 3c09a3393c0fd5a75fb2c47a1a12497b3ead2f60 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 5 Sep 2023 19:14:13 -0400 Subject: [PATCH 039/189] Update PatchTST models to use HF's MultiHeadAttention module --- .../models/patchtst/configuration_patchtst.py | 25 +- .../models/patchtst/modeling_patchtst.py | 523 +++++++++++------- .../models/patchtst/test_modeling_patchtst.py | 82 +-- 3 files changed, 383 insertions(+), 247 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 66220c802fc8c6..71efa3e480f6b6 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -14,7 +14,7 @@ # limitations under the License. """PatchTST model configuration""" -from typing import List, Optional, Union +from typing import List, Optional from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -24,6 +24,7 @@ PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP = { "ibm/patchtst-base": "https://huggingface.co/ibm/patchtst-base/resolve/main/config.json", + # See all PatchTST models at https://huggingface.co/ibm/models?filter=patchtst } @@ -32,6 +33,7 @@ class PatchTSTConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of an [`PatchTSTModel`]. It is used to instantiate an PatchTST model according to the specified arguments, defining the model architecture. + [ibm/patchtst](https://huggingface.co/ibm/patchtst) architecture. Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -90,7 +92,7 @@ class PatchTSTConfig(PretrainedConfig): init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. use_cache (`bool`, *optional*, defaults to `True`): - Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. + Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. distil (`bool`, *optional*, defaults to `True`): Whether to use distilling in encoder. 
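For orientation, a sketch of the per-head shape bookkeeping used by the BartAttention-style module this commit introduces in `modeling_patchtst.py`, assuming the defaults `d_model=128` and `encoder_attention_heads=16`; the concrete numbers and tensor names are illustrative only:

```python
import torch

embed_dim, num_heads, bsz, tgt_len = 128, 16, 2, 10
head_dim = embed_dim // num_heads  # 8
scaling = head_dim ** -0.5

# Queries are pre-scaled, then attention is computed per head with batched matmuls.
q = torch.randn(bsz * num_heads, tgt_len, head_dim) * scaling
k = torch.randn(bsz * num_heads, tgt_len, head_dim)
v = torch.randn(bsz * num_heads, tgt_len, head_dim)

attn_weights = torch.softmax(torch.bmm(q, k.transpose(1, 2)), dim=-1)
attn_output = torch.bmm(attn_weights, v)  # [bsz*num_heads, tgt_len, head_dim]
print(attn_output.shape)
```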
@@ -119,11 +121,6 @@ def __init__( # time series specific configuration num_input_channels: int = 1, context_length: int = 32, - num_dynamic_real_features: int = 0, - num_static_real_features: int = 0, - num_static_categorical_features: int = 0, - num_time_features: int = 0, - is_encoder_decoder: bool = False, # PatchTST arguments patch_length: int = 8, stride: int = 8, @@ -143,7 +140,6 @@ def __init__( bias: bool = True, activation_function: str = "gelu", pre_norm: bool = False, - store_attn: bool = False, positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, @@ -151,7 +147,6 @@ def __init__( individual: bool = False, seed_number: int = None, revin: Optional[bool] = True, - qkv_bias: bool = True, # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", @@ -170,14 +165,9 @@ def __init__( num_output_channels: int = 1, **kwargs, ): - # time series specific configuration self.context_length = context_length - self.num_input_channels = num_input_channels # n_vars - self.num_time_features = num_time_features - self.num_dynamic_real_features = num_dynamic_real_features - self.num_static_real_features = num_static_real_features - self.num_static_categorical_features = num_static_categorical_features + self.num_input_channels = num_input_channels # n_vars # Transformer architecture configuration self.d_model = d_model @@ -195,12 +185,10 @@ def __init__( self.bias = bias self.activation_function = activation_function self.pre_norm = pre_norm - self.store_attention = store_attn self.positional_encoding = positional_encoding self.learn_pe = learn_pe self.use_cls_token = use_cls_token self.init_std = init_std - self.qkv_bias = qkv_bias self.revin = revin # PatchTST @@ -234,8 +222,7 @@ def __init__( self.num_output_channels = num_output_channels self.prediction_range = prediction_range - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + super().__init__(**kwargs) def _num_patches(self): return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 - diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c77a94a4e1c0fb..4289fdb2d41fbd 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -14,19 +14,20 @@ # limitations under the License. 
""" PyTorch PatchTST model.""" -from typing import Optional, Tuple -import torch -from torch import nn import math import random +from typing import Optional, Tuple + import numpy as np +import torch +from torch import nn +from torch.nn.modules.activation import MultiheadAttention -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, logging from transformers.modeling_outputs import BaseModelOutputWithNoAttention -from transformers.utils import ModelOutput -from torch.nn.modules.activation import MultiheadAttention +from transformers.modeling_utils import PreTrainedModel from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig +from transformers.utils import ModelOutput, add_start_docstrings, logging + logger = logging.get_logger(__name__) @@ -38,27 +39,159 @@ ] - +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->PatchTST class PatchTSTAttention(nn.Module): - def __init__(self, config: PatchTSTConfig): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder - self.self_attn = MultiheadAttention( - embed_dim=config.d_model, - num_heads=config.encoder_attention_heads, - dropout=config.attention_dropout, - bias=config.bias, - add_bias_kv=True, - add_zero_attn=False, - batch_first=True, - ) + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def forward(self, src: torch.Tensor) -> torch.Tensor: - """ - src: Tensor [bs x q_len x d_model] - """ - src, _ = self.self_attn(src, src, src, need_weights=False) - return src + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + 
key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value def get_activation_fn(activation): @@ -85,7 +218,7 @@ def forward(self, x): def positional_encoding(pe, learn_pe, q_len, d_model): # Positional encoding - if pe == None: + if pe is None: w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe nn.init.uniform_(w_pos, -0.02, 0.02) learn_pe = False @@ -118,30 +251,33 @@ def set_seed(x=42): random.seed(x) np.random.seed(x) torch.manual_seed(x) - if torch.cuda.is_available(): torch.cuda.manual_seed_all(x) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(x) def random_masking( - xb: torch.Tensor, - mask_ratio: float, - unmasked_channel_indices: list = None, - channel_consistent_masking: bool = False, - mask_value=0, - seed_number: Optional[int] = None + xb: torch.Tensor, + mask_ratio: float, + unmasked_channel_indices: list = None, + channel_consistent_masking: bool = False, + mask_value=0, + seed_number: Optional[int] = None, ): """random_masking: Mask the input considering the control variables. Args: xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. - unmasked_channel_indices (list, optional): indices of unmasked channels. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + unmasked_channel_indices (list, optional): + indices of unmasked channels. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): + When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary + across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. seed_number (int, optional): Value to set for the random seed. 
Returns: - Tensor: xb_mask, masked input, same shape as input - Tensor: Mask tensor of shape [bs x c x n] + Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] """ if seed_number: set_seed(seed_number) @@ -190,16 +326,16 @@ class Patchify(nn.Module): """ def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence + self, + sequence_length: int, + patch_length: int, + stride: int, + padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" self.sequence_length = sequence_length @@ -220,9 +356,11 @@ def forward(self, past_values: torch.Tensor): x: output tensor data [bs x num_input_channels x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + assert ( + sequence_length == self.sequence_length + ), f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." - x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # x: [bs x num_patches x num_input_channels x patch_length] @@ -232,27 +370,20 @@ def forward(self, past_values: torch.Tensor): class PatchEmbeddings(nn.Module): """ - A class to patchify the time series sequence into different patches Args: - sequence_length (int, required): input sequence length. - patch_length (int, required): patch length. - stride (int, required): stride between patches. + A class to patchify the time series sequence into different patches + sequence_length (int, required): input sequence length. patch_length (int, required): patch length. stride + (int, required): stride between patches. 
Returns: embeddings: output tensor data [bs x num_input_channels x num_patches x embed_dim] """ - def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - embed_dim: int - ): + def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_dim: int): super().__init__() assert ( - sequence_length > patch_length + sequence_length > patch_length ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" @@ -268,11 +399,12 @@ def __init__( self.s_begin = sequence_length - new_sequence_length # Embedding - self.projection = nn.Conv1d(in_channels=1, - out_channels=embed_dim, - kernel_size=patch_length, - stride=stride, - ) + self.projection = nn.Conv1d( + in_channels=1, + out_channels=embed_dim, + kernel_size=patch_length, + stride=stride, + ) def forward(self, past_values: torch.Tensor): """ @@ -282,16 +414,19 @@ def forward(self, past_values: torch.Tensor): embeddings: output tensor data [bs x num_input_channels x num_patches x emb_dim] """ bs, sequence_length, num_input_channels = past_values.shape - assert sequence_length == self.sequence_length, f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." + assert ( + sequence_length == self.sequence_length + ), f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." - x = past_values[:, self.s_begin:, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] # convert past_values to shape [bs*num_input_channels x 1 x sequence_length ] x = x.transpose(1, 2).reshape(bs * num_input_channels, 1, -1).contiguous() # projection embeddings = self.projection(x) # embeddings: [bs*num_input_channels x emb_dim x num_patches] # reshape - embeddings = embeddings.transpose(1, 2).view(bs, num_input_channels, -1, - self.embed_dim).contiguous() # embeddings: [bs x num_input_channels x num_patches x emb_dim] + embeddings = ( + embeddings.transpose(1, 2).view(bs, num_input_channels, -1, self.embed_dim).contiguous() + ) # embeddings: [bs x num_input_channels x num_patches x emb_dim] # embeddings = embeddings.flatten(2).transpose(1, 2) return embeddings @@ -306,24 +441,26 @@ class PatchMasking(nn.Module): mask_patches (list, optional): List of patch lengths to mask in the end of the data. mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. + unmasked_channel_indices (list, optional): + Control Variable channel indices. These channels will not be masked. Defaults to None. + channel_consistent_masking (bool, optional): + When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary + across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. 
seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. """ def __init__( - self, - mask_type: str = "random", - mask_ratio=0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value=0, - seed_number: Optional[int] = None + self, + mask_type: str = "random", + mask_ratio=0.5, + mask_patches: list = [2, 3], + mask_patch_ratios: list = [1, 1], + channel_consistent_masking: bool = False, + unmasked_channel_indices: list = None, + mask_value=0, + seed_number: Optional[int] = None, ): - # if seed_number: # set_seed(seed_number) self.mask_ratio = mask_ratio @@ -343,11 +480,11 @@ def forward(self, x: torch.Tensor): """ Input: x: patched input - 4D: [bs x num_input_channels x num_patches x patch_length] + 4D: [bs x num_input_channels x num_patches x patch_length] Output: x_mask: Masked patched input - 4D: [bs x num_input_channels x num_patches x patch_length] + 4D: [bs x num_input_channels x num_patches x patch_length] mask: bool tensor indicating True on masked points 4D: [bs x num_input_channels x num_patch] """ @@ -359,7 +496,7 @@ def forward(self, x: torch.Tensor): unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, - seed_number=self.seed_number + seed_number=self.seed_number, ) else: @@ -374,17 +511,11 @@ class ChannelAttentionTSTEncoder(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList( - [ - ChannelAttentionTSTEncoderLayer(config) - for i in range(config.encoder_layers) - ] - ) + self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x sequence_length x d_model] - Return: + src: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] @@ -403,7 +534,13 @@ def __init__(self, config: PatchTSTConfig): self.channel_attention = config.channel_attention # Multi-Head attention - self.self_attn = PatchTSTAttention(config) + # self.self_attn = PatchTSTAttention(config) + + self.self_attn = PatchTSTAttention( + embed_dim=config.d_model, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) # Add & Norm of the sublayer 1 self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() @@ -436,54 +573,64 @@ def __init__(self, config: PatchTSTConfig): self.norm_sublayer3 = nn.LayerNorm(config.d_model) self.pre_norm = config.pre_norm - self.store_attn = config.store_attention def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x sequence_length x d_model] - Return: + src: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ bs, num_input_channels, sequence_length, d_model = src.shape # First sublayer: attention across time - src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view( + bs * num_input_channels, sequence_length, d_model + ) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src))) # Add: residual connection with residual 
dropout + self.self_attn(self.norm_sublayer1(src)[0]) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1( - src + self.dropout_path1(self.self_attn(src))) # src: [(bs*nvars) x sequence_length x d_model] + src + self.dropout_path1(self.self_attn(src)[0]) + ) # src: [(bs*nvars) x sequence_length x d_model] src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] if self.channel_attention: - src = src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, - d_model) # [(bs*sequence_length) x nvars x d_model] + src = ( + src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, d_model) + ) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src))) # Add: residual connection with residual dropout + self.self_attn(self.norm_sublayer2(src)[0]) + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src))) # src: [(bs*sequence_length) x nvars x d_model] - src = src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, - 2).contiguous() # src: [bs x nvars x sequence_length x d_model] + src + self.dropout_path2(self.self_attn(src)[0]) + ) # src: [(bs*sequence_length) x nvars x d_model] + src = ( + src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() + ) # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view(bs * num_input_channels, sequence_length, d_model) # src: [(bs*nvars) x sequence_length x d_model] + src = src.view( + bs * num_input_channels, sequence_length, d_model + ) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( - self.ff(self.norm_sublayer3(src))) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer3(src)) + ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm src = self.norm_sublayer3( - src + self.dropout_path3(self.ff(src))) # Add: residual connection with residual dropout + src + self.dropout_path3(self.ff(src)) + ) # Add: residual connection with residual dropout src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src @@ -511,7 +658,6 @@ def _init_weights(self, module): module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) - def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): module.gradient_checkpointing = value @@ -538,11 +684,13 @@ def __init__(self, config: PatchTSTConfig): # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches + 1, 
- config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model + ) else: - self.w_pos = positional_encoding(config.positional_encoding, config.learn_pe, config.num_patches, - config.d_model) + self.w_pos = positional_encoding( + config.positional_encoding, config.learn_pe, config.num_patches, config.d_model + ) # Positional dropout self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() @@ -553,8 +701,9 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values: torch.Tensor, - output_hidden_states: Optional[bool] = None) -> BaseModelOutputWithNoAttention: + def forward( + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + ) -> BaseModelOutputWithNoAttention: """ Args: past_values: tensor [bs x nvars x num_patches x patch_length]. @@ -591,14 +740,12 @@ def forward(self, past_values: torch.Tensor, # Encoder past_values, hidden_states = self.encoder( - past_values, output_hidden_states) # x: [bs x nvars x num_patches x d_model] + past_values, output_hidden_states + ) # x: [bs x nvars x num_patches x d_model] # or [bs x nvars x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states - return BaseModelOutputWithNoAttention( - last_hidden_state=past_values, - hidden_states=hidden_states - ) + return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) PATCHTST_START_DOCSTRING = r""" @@ -632,7 +779,7 @@ def forward(self, past_values: torch.Tensor, For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of variates in the time series per time step. - + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)` or `(batch_size, prediction_length, num_input_channels)`, *optional*): Future values of the time series, that serve as labels for the model. The `future_values` is what the Transformer needs during training to learn to output, given the `past_values`. @@ -645,7 +792,7 @@ def forward(self, past_values: torch.Tensor, number of variates in the time series per time step. output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. + Whether or not to return the hidden states of all layers. """ @@ -662,8 +809,8 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of + the model at the output of each layer plus the optional initial embedding outputs. 
patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) @@ -685,8 +832,9 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): class RevIN(nn.Module): def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): """ - :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x num_input_channels] - :denorm_channels if the denorm input shape has less number of channels, mention the channels in the denorm input here. + :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x + num_input_channels] :denorm_channels if the denorm input shape has less number of channels, mention the + channels in the denorm input here. """ super(RevIN, self).__init__() self.stdev = None @@ -701,10 +849,10 @@ def set_statistics(self, mean, stdev): self.stdev = stdev def forward(self, x, mode: str): - if mode == 'norm': + if mode == "norm": self._get_statistics(x) x = self._normalize(x) - elif mode == 'denorm': + elif mode == "denorm": x = self._denormalize(x) elif mode == "transform": x = self._normalize(x) @@ -760,7 +908,7 @@ def __init__(self, config: PatchTSTConfig): channel_consistent_masking=config.channel_consistent_masking, unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, - seed_number=config.seed_number + seed_number=config.seed_number, ) else: self.masking = nn.Identity() @@ -769,10 +917,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None): + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -780,19 +930,22 @@ def forward(self, past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] patched_values = self.patching( - past_values) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain + past_values + ) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain if self.mask_input: masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) - return PatchTSTModelOutputWithNoAttention(last_hidden_state=encoder_output.last_hidden_state, - hidden_states=encoder_output.hidden_states, - patched_input=patched_values, - mask=mask, - revin_mean=self.revin.mean if self.use_revin else None, - revin_stdev=self.revin.stdev if self.use_revin else None - ) + return PatchTSTModelOutputWithNoAttention( + last_hidden_state=encoder_output.last_hidden_state, + hidden_states=encoder_output.hidden_states, + patched_input=patched_values, + mask=mask, + revin_mean=self.revin.mean if self.use_revin else None, + revin_stdev=self.revin.stdev if self.use_revin else None, + ) + class MaskPretrainHead(nn.Module): def __init__(self, config): @@ -849,19 +1002,19 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) self.head = 
MaskPretrainHead(config) - self.loss = torch.nn.MSELoss(reduction='none') + self.loss = torch.nn.MSELoss(reduction="none") # Initialize weights and apply final processing self.post_init() def forward( - self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, ) -> PatchTSTOutput: """ - past_values (x): tensor [bs x sequence_length x num_input_channels ] - future_values (y): labels + past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -879,11 +1032,7 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput( - loss=masked_loss, - prediction_output=x_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -910,9 +1059,7 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, - prediction_logits=y_hat, - hidden_states=model_output.hidden_states + loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states ) @@ -927,8 +1074,8 @@ def __init__(self, config: PatchTSTConfig): def forward(self, x): """ - x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: [bs x n_classes] + x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: + [bs x n_classes] """ if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model @@ -996,12 +1143,12 @@ def forward(self, x): batch_size = x.shape[0] if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == 'mean': + elif self.pooling == "mean": x = x.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": x = x.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f'pooling operator {self.pooling} is not implemented yet') + raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input x = self.flatten(x) # x: bs x (nvars * d_model) @@ -1012,7 +1159,6 @@ def forward(self, x): return y - class PatchTSTForPrediction(PatchTSTPreTrainedModel): # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): @@ -1020,16 +1166,17 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PredictionHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None): - + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if 
output_hidden_states is not None else self.config.output_hidden_states ) @@ -1039,11 +1186,7 @@ def forward(self, loss_val = None if future_values is not None: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput( - loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) class PatchTSTForForecastingOutput(ModelOutput): @@ -1093,8 +1236,7 @@ def __init__(self, config: PatchTSTConfig): for i in range(self.num_input_channels): self.flattens.append(nn.Flatten(start_dim=2)) self.linears.append(nn.Linear(head_dim, config.prediction_length)) - self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - ) + self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: self.flatten = nn.Flatten(start_dim=2) self.linear = nn.Linear(head_dim, config.prediction_length) @@ -1110,9 +1252,9 @@ def forward(self, x: torch.Tensor): if self.use_cls_token: y = x[:, :, 0, :] # y: [bs x nvars x d_model] else: - if self.pooling == 'mean': + if self.pooling == "mean": y = x.mean(dim=2) # y: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": y = x.max(dim=2) # y: [bs x nvars x d_model] else: y = x # y: [bs x nvars x num_patches x d_model] @@ -1141,7 +1283,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = ForecastHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.MSELoss(reduction="mean") self.use_revin = config.revin if self.use_revin: self.revin = RevIN() @@ -1151,10 +1293,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - future_values: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None): + def forward( + self, + past_values: torch.Tensor, + future_values: Optional[torch.Tensor], + output_hidden_states: Optional[bool] = None, + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1170,9 +1314,7 @@ def forward(self, if future_values is not None: loss_val = self.loss(y_hat, future_values) return PatchTSTForForecastingOutput( - loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states + loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states ) @@ -1198,12 +1340,12 @@ def forward(self, past_values): """ if self.use_cls_token: past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] - elif self.pooling == 'mean': + elif self.pooling == "mean": past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] - elif self.pooling == 'max': + elif self.pooling == "max": past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] else: - raise Exception(f'pooling operator {self.pooling} is not implemented yet') + raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input past_values = self.flatten(past_values) # x: bs x nvars * d_model y = self.linear(self.dropout(past_values)) # y: bs x output_dim @@ -1220,15 +1362,14 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) self.head = RegressionHead(config) - self.loss = nn.MSELoss(reduction='mean') + self.loss = 
nn.MSELoss(reduction="mean") # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - labels: Optional[torch.Tensor], - output_hidden_states: Optional[bool] = None): + def forward( + self, past_values: torch.Tensor, labels: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None + ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1238,8 +1379,4 @@ def forward(self, loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput( - loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 8a444f1eecd5d3..f3a045ef756de4 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -15,16 +15,16 @@ """ Testing suite for the PyTorch PatchTST model. """ import inspect +import random import tempfile import unittest -import numpy as np from huggingface_hub import hf_hub_download from transformers import is_torch_available -from transformers.testing_utils import is_flaky, require_torch, torch_device, slow from transformers.models.auto import get_values -import random +from transformers.testing_utils import is_flaky, require_torch, slow, torch_device + from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -34,10 +34,18 @@ if is_torch_available(): import torch - from transformers import PatchTSTConfig, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING - from transformers import PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining, \ - PatchTSTForClassification, PatchTSTForRegression + from transformers import ( + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, + PatchTSTConfig, + PatchTSTForClassification, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForPrediction, + PatchTSTForRegression, + PatchTSTModel, + ) @require_torch @@ -106,7 +114,7 @@ def get_config(self): activation_function=self.hidden_act, seed_number=self.seed_number, num_classes=self.num_classes, - num_output_channels=self.num_output_channels + num_output_channels=self.num_output_channels, ) def prepare_patchtst_inputs_dict(self, config): @@ -137,18 +145,21 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (PatchTSTModel, - PatchTSTForPrediction, - PatchTSTForForecasting, - PatchTSTForMaskPretraining, - PatchTSTForClassification, - PatchTSTForRegression) + ( + PatchTSTModel, + PatchTSTForPrediction, + PatchTSTForForecasting, + PatchTSTForMaskPretraining, + PatchTSTForClassification, + PatchTSTForRegression, + ) if is_torch_available() else () ) - all_generative_model_classes = (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () + all_generative_model_classes = ( + (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () + ) pipeline_model_mapping = 
{"feature-extraction": PatchTSTModel} if is_torch_available() else {} - is_encoder_decoder = False test_pruning = False test_head_masking = False test_missing_keys = False @@ -156,7 +167,6 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase test_inputs_embeds = False test_model_common_attributes = False - test_resize_embeddings = True test_resize_position_embeddings = False test_mismatched_shapes = True @@ -201,7 +211,7 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) -# + # def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -211,7 +221,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + hidden_states = outputs.hidden_states expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers @@ -228,7 +238,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True - print('model_class: ', model_class) + print("model_class: ", model_class) check_hidden_states_output(inputs_dict, config, model_class) @@ -237,8 +247,9 @@ def check_hidden_states_output(inputs_dict, config, model_class): config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) -# -# # Ignore since we have no tokens embeddings + + # + # # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass @@ -268,8 +279,9 @@ def test_forward_signature(self): "past_values", "future_values", ] - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or \ - model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): + if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( + MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING + ): expected_arg_names.remove("future_values") expected_arg_names.append("labels") expected_arg_names.extend( @@ -303,17 +315,16 @@ def test_pretrain_head(self): torch.manual_seed(0) with torch.no_grad(): - output = model( - past_values=batch["past_values"].to(torch_device) - ).prediction_output - num_patch = (max(model.config.context_length, - model.config.patch_length) - model.config.patch_length) // model.config.stride + 1 + output = model(past_values=batch["past_values"].to(torch_device)).prediction_output + num_patch = ( + max(model.config.context_length, model.config.patch_length) - model.config.patch_length + ) // model.config.stride + 1 expected_shape = torch.Size([64, model.config.num_input_channels, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], - [[0.0246]], [[0.0090]]], - device=torch_device) + expected_slice = torch.tensor( + [[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], [[0.0246]], [[0.0090]]], device=torch_device + ) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
@@ -326,12 +337,13 @@ def test_prediction_head(self): with torch.no_grad(): output = model( past_values=batch["past_values"].to(torch_device), - future_values=batch["future_values"].to(torch_device) + future_values=batch["future_values"].to(torch_device), ).prediction_output expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], - device=torch_device, - ) + expected_slice = torch.tensor( + [[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + device=torch_device, + ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From 3bada036d8dd0349ef7f2a75c59a17d37c80bc5b Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Thu, 7 Sep 2023 14:21:23 -0400 Subject: [PATCH 040/189] Update paper + github urls --- README.md | 2 +- README_es.md | 2 +- README_hd.md | 2 +- README_ja.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.md | 2 +- docs/source/en/model_doc/patchtst.md | 6 +++--- src/transformers/models/patchtst/modeling_patchtst.py | 1 + 10 files changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index a7246572381451..471a64174c3296 100644 --- a/README.md +++ b/README.md @@ -428,7 +428,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_es.md b/README_es.md index 62085093026a87..67e6559faee2cf 100644 --- a/README_es.md +++ b/README_es.md @@ -405,7 +405,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_hd.md b/README_hd.md index 5e93de459461a7..914064bb151e9a 100644 --- a/README_hd.md +++ b/README_hd.md @@ -377,7 +377,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. 
**[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया। -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM से) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. द्वाराअनुसंधान पत्र [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) के साथ जारी किया गया 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा। 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index 1067b2e57a25ee..b0026ec09b3326 100644 --- a/README_ja.md +++ b/README_ja.md @@ -439,7 +439,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM から) Yuqi Nie, Nam H. 
Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. から公開された研究論文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) diff --git a/README_ko.md b/README_ko.md index 202d3d4893561a..722199cb2950a3 100644 --- a/README_ko.md +++ b/README_ko.md @@ -354,7 +354,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM 에서 제공)은 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.의 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf)논문과 함께 발표했습니다. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. 
Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 8fe1633f181115..59e0c58dcf69cd 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -378,7 +378,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。 -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (来自 IBM) 伴随论文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 由 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 9c615363e61a81..34b2eee82cb8f4 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -390,7 +390,7 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index b9f65477b5fe1c..d059a059bcac27 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -194,7 +194,7 @@ The documentation is organized into five sections: 1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](model_doc/patchtst)** (from ) released with the paper []() by . +1. **[PatchTST](model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. 
**[PEGASUS-X](model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 9a30b8294571b0..9d08bdd628f0ba 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -18,19 +18,19 @@ rendered properly in your Markdown viewer. ## Overview -The PatchTST model was proposed in []() by . +The PatchTST model was proposed in [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. The abstract from the paper is the following: -** +*We propose an efficient design of Transformer-based models for multivariate time series forecasting and self-supervised representation learning. It is based on two key components: (i) segmentation of time series into subseries-level patches which are served as input tokens to Transformer; (ii) channel-independence where each channel contains a single univariate time series that shares the same embedding and Transformer weights across all the series. Patching design naturally has three-fold benefit: local semantic information is retained in the embedding; computation and memory usage of the attention maps are quadratically reduced given the same look-back window; and the model can attend longer history. Our channel-independent patch time series Transformer (PatchTST) can improve the long-term forecasting accuracy significantly when compared with that of SOTA Transformer-based models. We also apply our model to self-supervised pre-training tasks and attain excellent fine-tuning performance, which outperforms supervised training on large datasets. Transferring of masked pre-trained representation on one dataset to others also produces SOTA forecasting accuracy.* Tips: This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +The original code can be found [here](https://github.com/yuqinie98/PatchTST). 
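As a rough illustration of the patching step described in the abstract, a minimal sketch is shown below. The lengths 512/16/16 are assumed values, and `unfold` is only one way to materialize the patches, not necessarily the exact implementation in this repository; the number of patches follows the same formula used in the PatchTST tests, and the resulting tensor layout matches the `[batch x channels x num_patches x patch_length]` convention used throughout `modeling_patchtst.py`.

```python
import torch

context_length, patch_length, stride = 512, 16, 16  # assumed, illustrative values
num_patches = (max(context_length, patch_length) - patch_length) // stride + 1  # -> 32

past_values = torch.randn(2, context_length, 7)      # [batch x time x channels]
patches = past_values.transpose(1, 2).unfold(-1, patch_length, stride)
# patches: [batch x channels x num_patches x patch_length], i.e. 32 "words" of length 16 per channel
```
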
## PatchTSTConfig diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4289fdb2d41fbd..c230fc10622095 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1289,6 +1289,7 @@ def __init__(self, config: PatchTSTConfig): self.revin = RevIN() else: self.revin = nn.Identity() + config.pooling = None # Initialize weights and apply final processing self.post_init() From 55065926064955ff3de149f23d831ff421423fcd Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Thu, 7 Sep 2023 14:39:59 -0400 Subject: [PATCH 041/189] Fix hidden_state return value --- README.md | 2 +- docs/source/en/index.md | 2 +- src/transformers/models/patchtst/modeling_patchtst.py | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 471a64174c3296..37400c7d14d93b 100644 --- a/README.md +++ b/README.md @@ -428,7 +428,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index d059a059bcac27..f56263447b10d8 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -194,7 +194,7 @@ The documentation is organized into five sections: 1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. +1. **[PatchTST](model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c230fc10622095..14ff3b2aadeb36 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -520,9 +520,8 @@ def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None """ all_hidden_states = [] for mod in self.layers: - if output_hidden_states: - src = mod(src) - all_hidden_states.append(src) + src = mod(src) + all_hidden_states.append(src) if output_hidden_states: return src, all_hidden_states return src, None @@ -1289,7 +1288,6 @@ def __init__(self, config: PatchTSTConfig): self.revin = RevIN() else: self.revin = nn.Identity() - config.pooling = None # Initialize weights and apply final processing self.post_init() From bd2a1c542e8aefff85370c07da4e37ab096133de Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Sun, 10 Sep 2023 21:49:58 -0400 Subject: [PATCH 042/189] Update integration test to use PatchTSTForForecasting --- docs/source/en/model_doc/patchtst.md | 2 +- .../models/patchtst/modeling_patchtst.py | 15 ++++++++------- tests/models/patchtst/test_modeling_patchtst.py | 12 ++++++------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 9d08bdd628f0ba..209e50a6b12480 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -29,7 +29,7 @@ Tips: -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +This model was contributed by [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), and [kashif](https://huggingface.co/kashif). The original code can be found [here](https://github.com/yuqinie98/PatchTST). 
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 14ff3b2aadeb36..15a588c158fecf 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -23,10 +23,10 @@ from torch import nn from torch.nn.modules.activation import MultiheadAttention -from transformers.modeling_outputs import BaseModelOutputWithNoAttention -from transformers.modeling_utils import PreTrainedModel -from transformers.models.patchtst.configuration_patchtst import PatchTSTConfig -from transformers.utils import ModelOutput, add_start_docstrings, logging +from ...modeling_outputs import BaseModelOutputWithNoAttention +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, add_start_docstrings, logging +from .configuration_patchtst import PatchTSTConfig logger = logging.get_logger(__name__) @@ -519,10 +519,11 @@ def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] - for mod in self.layers: - src = mod(src) - all_hidden_states.append(src) + if output_hidden_states: + for mod in self.layers: + src = mod(src) + all_hidden_states.append(src) return src, all_hidden_states return src, None diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index f3a045ef756de4..32d9e5fdcedde8 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -298,7 +298,7 @@ def test_retain_grad_hidden_states_attentions(self): # Note: Publishing of this dataset is under internal review. The dataset is not yet downloadable. -def prepare_batch(repo_id="ibm/etth1", file="train-batch.pt"): +def prepare_batch(repo_id="ibm/etth-forecast-dev", file="train-batch.pt"): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch @@ -310,7 +310,7 @@ def prepare_batch(repo_id="ibm/etth1", file="train-batch.pt"): class PatchTSTModelIntegrationTests(unittest.TestCase): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst_pretrained_etth1").to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst-etth-pretrain-dev").to(torch_device) batch = prepare_batch() torch.manual_seed(0) @@ -323,13 +323,13 @@ def test_pretrain_head(self): self.assertEqual(output.shape, expected_shape) expected_slice = torch.tensor( - [[[-0.0170]], [[0.0163]], [[0.0090]], [[0.0139]], [[0.0067]], [[0.0246]], [[0.0090]]], device=torch_device + [[[0.0100]], [[0.0242]], [[0.0128]], [[0.0125]], [[-0.0160]], [[0.0395]], [[0.0135]]], device=torch_device ) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
def test_prediction_head(self): - model = PatchTSTForPrediction.from_pretrained("ibm/patchtst_prediction_etth1").to(torch_device) + model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth-forecasting-dev").to(torch_device) batch = prepare_batch(file="test-batch.pt") @@ -338,12 +338,12 @@ def test_prediction_head(self): output = model( past_values=batch["past_values"].to(torch_device), future_values=batch["future_values"].to(torch_device), - ).prediction_output + ).forecast_outputs expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) self.assertEqual(output.shape, expected_shape) expected_slice = torch.tensor( - [[-0.8200, 0.3741, -0.7543, 0.3971, -0.6659, -0.0124, -0.8308]], + [[0.2781, 0.4699, 0.4292, 0.4278, -0.2669, 0.4660, -0.8898]], device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From 76416156b5478e6cd397b686e19319ae5a344eb2 Mon Sep 17 00:00:00 2001 From: diepi Date: Mon, 11 Sep 2023 11:22:07 +0200 Subject: [PATCH 043/189] Adding dataclass decorator for model output classes --- src/transformers/models/patchtst/modeling_patchtst.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 15a588c158fecf..98fa57a103f896 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -17,6 +17,7 @@ import math import random from typing import Optional, Tuple +from dataclasses import dataclass import numpy as np import torch @@ -796,6 +797,7 @@ def forward( """ +@dataclass @add_start_docstrings( "The bare PatchTST Model outputting raw hidden-states without any specific head.", PATCHTST_START_DOCSTRING, @@ -817,7 +819,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): Bool masked tensor indicating which patches are masked revin_mean: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length - revin_std: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + revin_stdev: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) std of the input data (batch_size, sequence_length, num_channels) over the sequence_length """ @@ -826,7 +828,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): patched_input: torch.FloatTensor = None mask: torch.FloatTensor = None revin_mean: torch.FloatTensor = None - revin_std: torch.FloatTensor = None + revin_stdev: torch.FloatTensor = None class RevIN(nn.Module): @@ -966,6 +968,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x +@dataclass class PatchTSTOutput(ModelOutput): """ Output type of [`PatchTSTForPredictiontion`]. @@ -1091,6 +1094,7 @@ def forward(self, x): return y +@dataclass class PatchTSTForClassificationOutput(ModelOutput): """ Output type of [`PatchTSTForClassification`]. @@ -1189,6 +1193,7 @@ def forward( return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) +@dataclass class PatchTSTForForecastingOutput(ModelOutput): """ Output type of [`PatchTSTForPredictiontion`]. 
From a14053f4c25c6e9b3d265cf6f406c21fd46121e0 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Tue, 12 Sep 2023 11:30:29 -0400 Subject: [PATCH 044/189] Run fixup script --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 98fa57a103f896..87582e874dc563 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -16,8 +16,8 @@ import math import random -from typing import Optional, Tuple from dataclasses import dataclass +from typing import Optional, Tuple import numpy as np import torch From 2b704b450a7628e1abab15e8774aedfe9932858c Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Wed, 13 Sep 2023 13:59:04 -0400 Subject: [PATCH 045/189] Rename model repos for integration test --- tests/models/patchtst/test_modeling_patchtst.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 32d9e5fdcedde8..83c457d9c43fd1 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -298,7 +298,7 @@ def test_retain_grad_hidden_states_attentions(self): # Note: Publishing of this dataset is under internal review. The dataset is not yet downloadable. -def prepare_batch(repo_id="ibm/etth-forecast-dev", file="train-batch.pt"): +def prepare_batch(repo_id="ibm/etth1-forecast-test", file="train-batch.pt"): file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") batch = torch.load(file, map_location=torch_device) return batch @@ -310,7 +310,7 @@ def prepare_batch(repo_id="ibm/etth-forecast-dev", file="train-batch.pt"): class PatchTSTModelIntegrationTests(unittest.TestCase): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst-etth-pretrain-dev").to(torch_device) + model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst-etth1-pretrain").to(torch_device) batch = prepare_batch() torch.manual_seed(0) @@ -329,7 +329,7 @@ def test_pretrain_head(self): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_prediction_head(self): - model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth-forecasting-dev").to(torch_device) + model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) batch = prepare_batch(file="test-batch.pt") From d46e0c8bb1d4cea7ebf0ce32a2567cdd1ed8cf5b Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 13 Sep 2023 16:19:23 -0400 Subject: [PATCH 046/189] edit argument explanation --- .../models/patchtst/configuration_patchtst.py | 136 +++++++++++------- 1 file changed, 82 insertions(+), 54 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 71efa3e480f6b6..11b320ef85e866 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -39,63 +39,89 @@ class PatchTSTConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - prediction_length (`int`): - The prediction length for the decoder. 
In other words, the prediction horizon of the model. This value is - typically dictated by the dataset and we recommend to set it appropriately. - context_length (`int`, *optional*, defaults to `prediction_length`): - The context length for the encoder. If `None`, the context length will be the same as the - `prediction_length`. num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. - num_time_features (`int`, *optional*, defaults to 0): - The number of time features in the input time series. - num_dynamic_real_features (`int`, *optional*, defaults to 0): - The number of dynamic real valued features. - num_static_categorical_features (`int`, *optional*, defaults to 0): - The number of static categorical features. - num_static_real_features (`int`, *optional*, defaults to 0): - The number of static real valued features. - embedding_dimension (`list[int]`, *optional*): - The dimension of the embedding for each of the static categorical features. Should be a list of integers, - having the same length as `num_static_categorical_features`. Cannot be `None` if - `num_static_categorical_features` is > 0. - d_model (`int`, *optional*, defaults to 64): - Dimensionality of the transformer layers. + context_length (`int`, defaults to 32): + The context length for the encoder. + + patch_length (`int`, *optional*, defaults to 1): + Define the patch length of the patchification process. Default to 1 + stride (`int`, *optional*, defaults to 1): + define the stride of the patchification process. Default to 1 + encoder_layers (`int`, *optional*, defaults to 2): Number of encoder layers. - decoder_layers (`int`, *optional*, defaults to 2): - Number of decoder layers. - encoder_attention_heads (`int`, *optional*, defaults to 2): + d_model (`int`, *optional*, defaults to 64): + Dimensionality of the transformer layers. + encoder_attention_heads (`int`, *optional*, defaults to 4): Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (`int`, *optional*, defaults to 2): - Number of attention heads for each attention layer in the Transformer decoder. - encoder_ffn_dim (`int`, *optional*, defaults to 32): + shared_embedding (`bool`, *optional*, defaults to True): + Sharing the input embedding across all channels. + channel_attention (`bool`, *optional*, defaults to False): + Activate channel attention block in the Transformer to allow channels to attend each other. + encoder_ffn_dim (`int`, *optional*, defaults to 256): Dimension of the "intermediate" (often named feed-forward) layer in encoder. - decoder_ffn_dim (`int`, *optional*, defaults to 32): - Dimension of the "intermediate" (often named feed-forward) layer in decoder. - activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and - `"relu"` are supported. - dropout (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the encoder, and decoder. - encoder_layerdrop (`float`, *optional*, defaults to 0.1): - The dropout probability for the attention and fully connected layers for each encoder layer. - decoder_layerdrop (`float`, *optional*, defaults to 0.1): - The dropout probability for the attention and fully connected layers for each decoder layer. 
- attention_dropout (`float`, *optional*, defaults to 0.1): + norm (`str` , *optional*, defaults to `"BatchNorm"`): + Normalization at each Transformer layer. Can be `"BatchNorm"` or `"LayerNorm"`. + attention_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for the attention probabilities. - activation_dropout (`float`, *optional*, defaults to 0.1): + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the encoder, and decoder. + positional_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability in the positional embedding layer. + dropout_path (`float`, *optional*, defaults to 0.0): + The dropout path in the residual block. + ff_dropout (`float`, *optional*, defaults to 0.0): The dropout probability used between the two layers of the feed-forward networks. - num_parallel_samples (`int`, *optional*, defaults to 100): - The number of samples to generate in parallel for each time step of inference. + bias (`bool`, *optional*, defaults to True): + Consider bias in the feed-forward networks. + activation_function (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. + positional_encoding (`str`, *optional*, defaults to `"sincos"`): + Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. + learn_pe (`bool`, *optional*, defaults to False): + Whether the positional encoding is updated during training. + use_cls_token (`bool`, *optional*, defaults to False): + Whether cls token is used. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. - use_cache (`bool`, *optional*, defaults to `True`): - Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. - distil (`bool`, *optional*, defaults to `True`): - Whether to use distilling in encoder. - + shared_projection (`bool`, *optional*, defaults to True): + Sharing the projection layer across different channels in the forecast head. + seed_number (`int`, *optional*, defaults to None): + Use seed number for random masking. + revin (`bool`, *optional*, defaults to True): + Apply reverse instance normalization on each input batch. + + mask_input (`bool`, *optional*, defaults to False): + Apply masking during the pretraining. + mask_type (`str`, *optional*, defaults to `"random"`): + Masking type. Only `"random"` is currently supported. + mask_ratio (`float`, *optional*, defaults to 0.5): + Masking ratio is applied to mask the input data during pretraining. + channel_consistent_masking (`bool`, *optional*, defaults to False): + If channel consistent masking is True, all the channels will have the same masking. + unmasked_channel_indices (`list`, *optional*, defaults to None): + Channels are not masked during pretraining. + mask_value (`int`, *optional*, defaults to 0): + Mask value to set. + + pooling (`str`, *optional*, defaults to `"mean"`): + Pooling in the latent representation. `"mean"`, `"max"` and None are supported. + num_classes (`int`, *optional*, defaults to 1): + Number of classes is defined for classification task. + head_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for head. + prediction_length (`int`): + The prediction length for the encoder. In other words, the prediction horizon of the model. + prediction_length (`int`): + The prediction length for the encoder. 
In other words, the prediction horizon of the model. + num_output_channels (`int`, *optional*, defaults to 1): + Number of output channels. + prediction_range (`list`, *optional*, defaults to None): + The range of prediction values can be set to enforce the model to produce values within a range. + + Example: ```python @@ -122,12 +148,12 @@ def __init__( num_input_channels: int = 1, context_length: int = 32, # PatchTST arguments - patch_length: int = 8, - stride: int = 8, + patch_length: int = 1, + stride: int = 1, # Transformer architecture configuration encoder_layers: int = 3, - d_model: int = 128, - encoder_attention_heads: int = 16, + d_model: int = 64, + encoder_attention_heads: int = 4, shared_embedding: bool = True, channel_attention: bool = False, encoder_ffn_dim: int = 256, @@ -144,13 +170,13 @@ def __init__( learn_pe: bool = False, use_cls_token: bool = False, init_std: float = 0.02, - individual: bool = False, + shared_projection: bool = True, seed_number: int = None, revin: Optional[bool] = True, # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", - mask_ratio=0.5, + mask_ratio: float = 0.5, mask_patches: List[int] = [2, 3], mask_patch_ratios: List[int] = [1, 1], channel_consistent_masking: bool = False, @@ -161,8 +187,8 @@ def __init__( num_classes: int = 1, head_dropout: float = 0.0, prediction_length: int = 24, - prediction_range: List = [0, 1], num_output_channels: int = 1, + prediction_range: List = None, **kwargs, ): # time series specific configuration @@ -208,10 +234,12 @@ def __init__( self.mask_value = mask_value # general head params - self.individual = individual self.pooling = pooling self.head_dropout = head_dropout + # Forecast head + self.shared_projection = shared_projection + # Classification self.num_classes = num_classes From 5c240ddcecf64ac4bd4731e455ca4095b4b91a63 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 13 Sep 2023 16:20:42 -0400 Subject: [PATCH 047/189] change individual option to shared_projection --- .../models/patchtst/modeling_patchtst.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 87582e874dc563..74d66f7d81638b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -682,6 +682,7 @@ def __init__(self, config: PatchTSTConfig): self.w_p.append(nn.Linear(config.patch_length, config.d_model)) else: self.w_p = nn.Linear(config.patch_length, config.d_model) + # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) @@ -694,7 +695,7 @@ def __init__(self, config: PatchTSTConfig): ) # Positional dropout - self.dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() + self.positional_dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() # Encoder self.encoder = ChannelAttentionTSTEncoder(config) @@ -731,13 +732,13 @@ def forward( past_values = self.w_p(past_values) # x: [bs x nvars x num_patches x d_model] if self.use_cls_token: - past_values = self.dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patches x d_model] + past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patches x d_model] # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens 
= cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patches+1) x d_model] else: - past_values = self.dropout(past_values + self.w_pos) # x: [bs x nvars x num_patches x d_model] + past_values = self.positional_dropout(past_values + self.w_pos) # x: [bs x nvars x num_patches x d_model] # Encoder past_values, hidden_states = self.encoder( @@ -1228,13 +1229,13 @@ class ForecastHead(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.individual = config.individual + self.shared_projection = config.shared_projection self.num_input_channels = config.num_input_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling head_dim = config.d_model if self.pooling else config.d_model * config.num_patches - if self.individual: + if not self.shared_projection: self.linears = nn.ModuleList() self.dropouts = nn.ModuleList() self.flattens = nn.ModuleList() @@ -1264,7 +1265,7 @@ def forward(self, x: torch.Tensor): else: y = x # y: [bs x nvars x num_patches x d_model] - if self.individual: + if not self.shared_projection: x_out = [] for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] From 2916ec09de53f308b9d447ede630bd46dda65566 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 14 Sep 2023 19:51:09 +0200 Subject: [PATCH 048/189] style --- .../models/patchtst/configuration_patchtst.py | 36 +++++++++---------- .../models/patchtst/modeling_patchtst.py | 7 ++-- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 11b320ef85e866..c044692824d95a 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -43,17 +43,15 @@ class PatchTSTConfig(PretrainedConfig): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. context_length (`int`, defaults to 32): - The context length for the encoder. - + The context length for the encoder. patch_length (`int`, *optional*, defaults to 1): - Define the patch length of the patchification process. Default to 1 + Define the patch length of the patchification process. Default to 1. stride (`int`, *optional*, defaults to 1): - define the stride of the patchification process. Default to 1 - + define the stride of the patchification process. Default to 1. encoder_layers (`int`, *optional*, defaults to 2): Number of encoder layers. d_model (`int`, *optional*, defaults to 64): - Dimensionality of the transformer layers. + Dimensionality of the transformer layers. encoder_attention_heads (`int`, *optional*, defaults to 4): Number of attention heads for each attention layer in the Transformer encoder. shared_embedding (`bool`, *optional*, defaults to True): @@ -69,9 +67,9 @@ class PatchTSTConfig(PretrainedConfig): dropout (`float`, *optional*, defaults to 0.0): The dropout probability for all fully connected layers in the encoder, and decoder. positional_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability in the positional embedding layer. + The dropout probability in the positional embedding layer. dropout_path (`float`, *optional*, defaults to 0.0): - The dropout path in the residual block. 
+ The dropout path in the residual block. ff_dropout (`float`, *optional*, defaults to 0.0): The dropout probability used between the two layers of the feed-forward networks. bias (`bool`, *optional*, defaults to True): @@ -79,49 +77,47 @@ class PatchTSTConfig(PretrainedConfig): activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. positional_encoding (`str`, *optional*, defaults to `"sincos"`): - Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. + Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. learn_pe (`bool`, *optional*, defaults to False): - Whether the positional encoding is updated during training. + Whether the positional encoding is updated during training. use_cls_token (`bool`, *optional*, defaults to False): Whether cls token is used. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. shared_projection (`bool`, *optional*, defaults to True): - Sharing the projection layer across different channels in the forecast head. + Sharing the projection layer across different channels in the forecast head. seed_number (`int`, *optional*, defaults to None): Use seed number for random masking. revin (`bool`, *optional*, defaults to True): Apply reverse instance normalization on each input batch. - mask_input (`bool`, *optional*, defaults to False): Apply masking during the pretraining. mask_type (`str`, *optional*, defaults to `"random"`): Masking type. Only `"random"` is currently supported. mask_ratio (`float`, *optional*, defaults to 0.5): - Masking ratio is applied to mask the input data during pretraining. + Masking ratio is applied to mask the input data during pretraining. channel_consistent_masking (`bool`, *optional*, defaults to False): If channel consistent masking is True, all the channels will have the same masking. unmasked_channel_indices (`list`, *optional*, defaults to None): Channels are not masked during pretraining. mask_value (`int`, *optional*, defaults to 0): Mask value to set. - pooling (`str`, *optional*, defaults to `"mean"`): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. num_classes (`int`, *optional*, defaults to 1): Number of classes is defined for classification task. head_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for head. + The dropout probability for head. prediction_length (`int`): - The prediction length for the encoder. In other words, the prediction horizon of the model. + The prediction length for the encoder. In other words, the prediction horizon of the model. prediction_length (`int`): - The prediction length for the encoder. In other words, the prediction horizon of the model. + The prediction length for the encoder. In other words, the prediction horizon of the model. num_output_channels (`int`, *optional*, defaults to 1): Number of output channels. prediction_range (`list`, *optional*, defaults to None): - The range of prediction values can be set to enforce the model to produce values within a range. - - + The range of prediction values can be set to enforce the model to produce values within a range. 
+ + Example: ```python diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 74d66f7d81638b..ce4c7e9be773b3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -695,7 +695,9 @@ def __init__(self, config: PatchTSTConfig): ) # Positional dropout - self.positional_dropout = nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() + self.positional_dropout = ( + nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() + ) # Encoder self.encoder = ChannelAttentionTSTEncoder(config) @@ -732,7 +734,8 @@ def forward( past_values = self.w_p(past_values) # x: [bs x nvars x num_patches x d_model] if self.use_cls_token: - past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) # x: [bs x nvars x num_patches x d_model] + # x: [bs x nvars x num_patches x d_model] + past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples From 208b83c1aef3308d7a66bba2b46f80a4b06f0911 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Thu, 14 Sep 2023 16:13:38 -0400 Subject: [PATCH 049/189] Rename integration test + import cleanup --- src/transformers/models/patchtst/modeling_patchtst.py | 7 +------ tests/models/patchtst/test_modeling_patchtst.py | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ce4c7e9be773b3..a776fc80886fe3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -22,7 +22,6 @@ import numpy as np import torch from torch import nn -from torch.nn.modules.activation import MultiheadAttention from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel @@ -35,7 +34,7 @@ _CONFIG_FOR_DOC = "PatchTSTConfig" PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "ibm/patchtst-base", + "ibm/patchtst-etth1-pretrain", # See all PatchTST models at https://huggingface.co/models?filter=patchtst ] @@ -654,10 +653,6 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=self.config.init_std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, MultiheadAttention): - module.in_proj_weight.data.normal_(mean=0.0, std=self.config.init_std) - module.bias_k.data.normal_(mean=0.0, std=self.config.init_std) - module.bias_v.data.normal_(mean=0.0, std=self.config.init_std) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (ChannelAttentionPatchTSTEncoder)): diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 83c457d9c43fd1..fb8767390b3b33 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -328,7 +328,7 @@ def test_pretrain_head(self): self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
- def test_prediction_head(self): + def test_forecast_head(self): model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) batch = prepare_batch(file="test-batch.pt") From ba7290719322c256047fdd0b16a1c943aff2bc98 Mon Sep 17 00:00:00 2001 From: Gift Sinthong Date: Thu, 14 Sep 2023 18:11:15 -0400 Subject: [PATCH 050/189] Fix outpu_hidden_states return value --- src/transformers/models/patchtst/modeling_patchtst.py | 11 ++++++----- tests/models/patchtst/test_modeling_patchtst.py | 5 +++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a776fc80886fe3..24e42977ce2161 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -520,12 +520,13 @@ def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None """ all_hidden_states = [] - if output_hidden_states: - for mod in self.layers: - src = mod(src) + for mod in self.layers: + src = mod(src) + if output_hidden_states: all_hidden_states.append(src) - return src, all_hidden_states - return src, None + if output_hidden_states is None: + return src, None + return src, all_hidden_states class ChannelAttentionTSTEncoderLayer(nn.Module): diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index fb8767390b3b33..4f3cb2f1f465bc 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -323,7 +323,8 @@ def test_pretrain_head(self): self.assertEqual(output.shape, expected_shape) expected_slice = torch.tensor( - [[[0.0100]], [[0.0242]], [[0.0128]], [[0.0125]], [[-0.0160]], [[0.0395]], [[0.0135]]], device=torch_device + [[[-0.5409]], [[0.3093]], [[-0.3759]], [[0.5068]], [[-0.8387]], [[0.0937]], [[0.2809]]], + device=torch_device, ) self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) @@ -343,7 +344,7 @@ def test_forecast_head(self): self.assertEqual(output.shape, expected_shape) expected_slice = torch.tensor( - [[0.2781, 0.4699, 0.4292, 0.4278, -0.2669, 0.4660, -0.8898]], + [[0.3228, 0.4320, 0.4591, 0.4066, -0.3461, 0.3094, -0.8426]], device=torch_device, ) self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) From eb96b0266346995fed79a4d501a30e8802b86b0f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 15 Sep 2023 13:35:25 +0200 Subject: [PATCH 051/189] removed unused mode --- src/transformers/models/patchtst/modeling_patchtst.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 24e42977ce2161..5bf7e64befab38 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -856,8 +856,6 @@ def forward(self, x, mode: str): x = self._normalize(x) elif mode == "denorm": x = self._denormalize(x) - elif mode == "transform": - x = self._normalize(x) else: raise NotImplementedError return x From 474e981e217cafa6e14a37be019cfc46f6a0b491 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 19 Sep 2023 10:07:21 +0200 Subject: [PATCH 052/189] added std, mean and nops scaler --- .../models/patchtst/configuration_patchtst.py | 11 +- .../models/patchtst/modeling_patchtst.py | 203 ++++++++++++------ 2 files changed, 141 insertions(+), 73 deletions(-) 
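The diff that follows replaces the RevIN module with scalers copied from the time series transformer model: a std scaler (subtract the per-series mean and divide by the standard deviation over the sequence dimension), a mean scaler (divide by the mean absolute value, with the location fixed at zero) and a no-op scaler, each driven by an observed-value mask. A compressed, function-level sketch of the first two is below; the real `PatchTSTStdScaler`/`PatchTSTMeanScaler` classes additionally handle `keepdim`, `default_scale` and minimum-scale corner cases, so the helper names and defaults here are illustrative assumptions only.

```python
# Simplified sketch of mask-aware mean/std scaling over the time dimension.
import torch


def std_scale(data: torch.Tensor, observed: torch.Tensor, dim: int = 1, eps: float = 1e-5):
    """Standardise along `dim` using only the observed positions."""
    denom = observed.sum(dim, keepdim=True).clamp_min(1.0)
    loc = (data * observed).sum(dim, keepdim=True) / denom
    var = (((data - loc) * observed) ** 2).sum(dim, keepdim=True) / denom
    scale = torch.sqrt(var + eps)
    return (data - loc) / scale, loc, scale


def mean_scale(data: torch.Tensor, observed: torch.Tensor, dim: int = 1, eps: float = 1e-10):
    """Scale by the mean absolute value of the observed positions (loc stays zero)."""
    ts_sum = (data * observed).abs().sum(dim, keepdim=True)
    num_observed = observed.sum(dim, keepdim=True).clamp_min(1.0)
    scale = (ts_sum / num_observed).clamp_min(eps)
    return data / scale, torch.zeros_like(scale), scale


if __name__ == "__main__":
    past_values = torch.randn(4, 32, 3)      # [batch, sequence_length, num_channels]
    observed = torch.ones_like(past_values)  # 1 where a value is observed
    scaled, loc, scale = std_scale(past_values, observed)
    print(scaled.shape, loc.shape, scale.shape)
```

Switching to these scalers lets padded or missing time steps be excluded via `past_observed_mask`, and, as the `# Copied from transformers.models.time_series_transformer...` markers in the diff indicate, aligns PatchTST with the scaler implementations shared by the library's other time series models.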
diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index c044692824d95a..ea4d7382deb187 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -14,7 +14,7 @@ # limitations under the License. """PatchTST model configuration""" -from typing import List, Optional +from typing import List, Optional, Union from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -88,8 +88,9 @@ class PatchTSTConfig(PretrainedConfig): Sharing the projection layer across different channels in the forecast head. seed_number (`int`, *optional*, defaults to None): Use seed number for random masking. - revin (`bool`, *optional*, defaults to True): - Apply reverse instance normalization on each input batch. + scaling (`string` or `bool`, *optional* defaults to `"mean"`): + Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the + scaler is set to "mean". mask_input (`bool`, *optional*, defaults to False): Apply masking during the pretraining. mask_type (`str`, *optional*, defaults to `"random"`): @@ -168,7 +169,7 @@ def __init__( init_std: float = 0.02, shared_projection: bool = True, seed_number: int = None, - revin: Optional[bool] = True, + scaling: Optional[Union[str, bool]] = "mean", # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", @@ -211,7 +212,7 @@ def __init__( self.learn_pe = learn_pe self.use_cls_token = use_cls_token self.init_std = init_std - self.revin = revin + self.scaling = scaling # PatchTST self.patch_length = patch_length diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5bf7e64befab38..86e49ddc88befc 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -817,9 +817,9 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) Bool masked tensor indicating which patches are masked - revin_mean: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length - revin_stdev: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) + scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*) std of the input data (batch_size, sequence_length, num_channels) over the sequence_length """ @@ -827,70 +827,137 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): hidden_states: Optional[Tuple[torch.FloatTensor]] = None patched_input: torch.FloatTensor = None mask: torch.FloatTensor = None - revin_mean: torch.FloatTensor = None - revin_stdev: torch.FloatTensor = None + loc: torch.FloatTensor = None + scale: torch.FloatTensor = None -class RevIN(nn.Module): - def __init__(self, start_dim=1, eps=1e-5, denorm_channels: list = None): - """ - :param start_dim: it is 1 if [bs x seq_len x nvars], it is 3 is [bs x tsg1 x tsg2 x seq_len x - num_input_channels] :denorm_channels if the denorm input shape has less number of channels, mention the - channels in the denorm input here. 
- """ - super(RevIN, self).__init__() - self.stdev = None - self.mean = None - self.start_dim = start_dim - self.denorm_channels = denorm_channels - self.eps = eps - - def set_statistics(self, mean, stdev): - # get statistics - self.mean = mean - self.stdev = stdev - - def forward(self, x, mode: str): - if mode == "norm": - self._get_statistics(x) - x = self._normalize(x) - elif mode == "denorm": - x = self._denormalize(x) - else: - raise NotImplementedError - return x +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST +class PatchTSTStdScaler(nn.Module): + """ + Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it + by subtracting from the mean and dividing by the standard deviation. - def _get_statistics(self, x): - dim2reduce = tuple(range(self.start_dim, x.ndim - 1)) - self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() - self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() + Args: + dim (`int`): + Dimension along which to calculate the mean and standard deviation. + keepdim (`bool`, *optional*, defaults to `False`): + Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + minimum_scale (`float`, *optional*, defaults to 1e-5): + Default scale that is used for elements that are constantly zero along dimension `dim`. + """ - def _normalize(self, x): - x = x - self.mean - x = x / self.stdev - return x + def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + super().__init__() + if not dim > 0: + raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") + self.dim = dim + self.keepdim = keepdim + self.minimum_scale = minimum_scale + + @torch.no_grad() + def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + denominator = weights.sum(self.dim, keepdim=self.keepdim) + denominator = denominator.clamp_min(1.0) + loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + + variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + scale = torch.sqrt(variance + self.minimum_scale) + return (data - loc) / scale, loc, scale + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->PatchTST +class PatchTSTMeanScaler(nn.Module): + """ + Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + accordingly. + + Args: + dim (`int`): + Dimension along which to compute the scale. + keepdim (`bool`, *optional*, defaults to `False`): + Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + default_scale (`float`, *optional*, defaults to `None`): + Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. + minimum_scale (`float`, *optional*, defaults to 1e-10): + Default minimum possible scale that is used for any item. 
+ """ + + def __init__( + self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 + ): + super().__init__() + self.dim = dim + self.keepdim = keepdim + self.minimum_scale = minimum_scale + self.default_scale = default_scale - def _denormalize(self, x): - # denormalize the data - if self.denorm_channels is None: - x = x * self.stdev - x = x + self.mean + @torch.no_grad() + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # shape: (N, [C], T=1) + ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) + num_observed = observed_indicator.sum(self.dim, keepdim=True) + + scale = ts_sum / torch.clamp(num_observed, min=1) + + # If `default_scale` is provided, we use it, otherwise we use the scale + # of the batch. + if self.default_scale is None: + batch_sum = ts_sum.sum(dim=0) + batch_observations = torch.clamp(num_observed.sum(0), min=1) + default_scale = torch.squeeze(batch_sum / batch_observations) else: - x = x * self.stdev[..., self.denorm_channels] - x = x + self.mean[..., self.denorm_channels] + default_scale = self.default_scale * torch.ones_like(scale) - return x + # apply default scale where there are no observations + scale = torch.where(num_observed > 0, scale, default_scale) + + # ensure the scale is at least `self.minimum_scale` + scale = torch.clamp(scale, min=self.minimum_scale) + scaled_data = data / scale + + if not self.keepdim: + scale = scale.squeeze(dim=self.dim) + + return scaled_data, torch.zeros_like(scale), scale + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->PatchTST +class PatchTSTNOPScaler(nn.Module): + """ + Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. + + Args: + dim (`int`): + Dimension along which to compute the scale. + keepdim (`bool`, *optional*, defaults to `False`): + Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. 
+ """ + + def __init__(self, dim: int, keepdim: bool = False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + return data, loc, scale class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) - self.use_revin = config.revin - if self.use_revin: - self.revin = RevIN() + if config.scaling == "mean" or config.scaling is True: + self.scaler = PatchTSTMeanScaler(dim=1, keepdim=True) + elif config.scaling == "std": + self.scaler = PatchTSTStdScaler(dim=1, keepdim=True) else: - self.revin = nn.Identity() + self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) self.patching = Patchify( config.context_length, @@ -920,6 +987,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ): @@ -927,11 +995,14 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - past_values = self.revin(past_values, mode="norm") # x: tensor [bs x seq_len x in_channels] + if past_observed_mask is None: + past_observed_mask = torch.ones_like(past_values) - patched_values = self.patching( - past_values - ) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain + # x: tensor [bs x seq_len x in_channels] + scaled_past_values, loc, scale = self.scaler(past_values, past_observed_mask) + + # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain + patched_values = self.patching(scaled_past_values) if self.mask_input: masked_values, mask = self.masking(patched_values) else: @@ -942,8 +1013,8 @@ def forward( hidden_states=encoder_output.hidden_states, patched_input=patched_values, mask=mask, - revin_mean=self.revin.mean if self.use_revin else None, - revin_stdev=self.revin.stdev if self.use_revin else None, + loc=loc, + scale=scale, ) @@ -1287,11 +1358,6 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = ForecastHead(config) self.loss = nn.MSELoss(reduction="mean") - self.use_revin = config.revin - if self.use_revin: - self.revin = RevIN() - else: - self.revin = nn.Identity() # Initialize weights and apply final processing self.post_init() @@ -1300,18 +1366,19 @@ def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor], + past_observed_mask: Optional[torch.Tensor] = None, + future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model(past_values, output_hidden_states=output_hidden_states) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) y_hat = self.head(model_output.last_hidden_state) - - if self.use_revin: - self.revin.set_statistics(mean=model_output.revin_mean, stdev=model_output.revin_stdev) - y_hat = self.revin(y_hat, mode="denorm") + y_hat = y_hat * model_output.scale + model_output.loc loss_val = None if 
future_values is not None: From 46e89d6b3e0b4caa570b10dc0ce0791fcd9e59b2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 20 Sep 2023 13:23:01 +0200 Subject: [PATCH 053/189] add initial distributional loss for predition --- .../models/patchtst/configuration_patchtst.py | 9 ++++ .../models/patchtst/modeling_patchtst.py | 53 ++++++++++++++++--- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index ea4d7382deb187..3b306268785f74 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -44,6 +44,11 @@ class PatchTSTConfig(PretrainedConfig): multivariate targets. context_length (`int`, defaults to 32): The context length for the encoder. + distribution_output (`string`, *optional*, defaults to `"student_t"`): + The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or "negative_binomial". + loss (`string`, *optional*, defaults to `"mse"`): + The loss function for the model corresponding to the `distribution_output` head. For parametric + distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". patch_length (`int`, *optional*, defaults to 1): Define the patch length of the patchification process. Default to 1. stride (`int`, *optional*, defaults to 1): @@ -144,6 +149,8 @@ def __init__( # time series specific configuration num_input_channels: int = 1, context_length: int = 32, + distribution_output: str = "student_t", + loss: str = "mse", # PatchTST arguments patch_length: int = 1, stride: int = 1, @@ -191,6 +198,8 @@ def __init__( # time series specific configuration self.context_length = context_length self.num_input_channels = num_input_channels # n_vars + self.loss = loss + self.distribution_output = distribution_output # Transformer architecture configuration self.d_model = d_model diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 86e49ddc88befc..c6397ecbc2c57a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -25,6 +25,7 @@ from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel +from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import ModelOutput, add_start_docstrings, logging from .configuration_patchtst import PatchTSTConfig @@ -831,6 +832,14 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): scale: torch.FloatTensor = None +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll +def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor: + """ + Computes the negative log likelihood loss from input distribution with respect to target. 
+ """ + return -input.log_prob(target) + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ @@ -1194,17 +1203,23 @@ class PatchTSTForClassificationOutput(ModelOutput): class PredictionHead(nn.Module): - def __init__(self, config: PatchTSTConfig): + def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.num_output_channels = config.num_output_channels + self.dist_output_size = config.num_output_channels * config.d_model // config.encoder_attention_heads self.use_cls_token = config.use_cls_token self.pooling = config.pooling head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) - self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + if distribution_output is None: + self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + self.args_proj = None + else: + self.linear = nn.Linearr(head_dim, config.prediction_length * self.dist_output_size) + self.args_proj = distribution_output.get_parameter_projection(self.dist_output_size) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x): @@ -1226,9 +1241,13 @@ def forward(self, x): # flatten the input x = self.flatten(x) # x: bs x (nvars * d_model) y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) + if self.args_proj is None: + # reshape the data + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + else: + # reshape and project prarameters of distribution + y = self.args_proj(y.reshape(batch_size, -1, self.dist_output_size)) - # reshape the data - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y @@ -1238,8 +1257,21 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) - self.head = PredictionHead(config) - self.loss = nn.MSELoss(reduction="mean") + if config.loss == "mse": + self.loss = nn.MSELoss(reduction="mean") + self.distribution_output = None + else: + self.loss = nll + if config.distribution_output == "student_t": + self.distribution_output = StudentTOutput(dim=config.num_output_channels) + elif config.distribution_output == "normal": + self.distribution_output = NormalOutput(dim=config.num_output_channels) + elif config.distribution_output == "negative_binomial": + self.distribution_output = NegativeBinomialOutput(dim=config.num_output_channels) + else: + raise ValueError(f"Unknown distribution output {config.distribution_output}") + + self.head = PredictionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1255,10 +1287,15 @@ def forward( ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) - loss_val = None if future_values is not None: - loss_val = self.loss(y_hat, future_values) + if self.distribution_output: + distribution = self.distribution_output.distribution( + y_hat, loc=model_output.loc, scale=model_output.scale + ) + loss_val = self.loss(distribution, future_values) + else: + loss_val = self.loss(y_hat * model_output.scale + model_output.loc, future_values) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) From 
48b8621c0419e9c1009501f01451de5e3e00af47 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 25 Sep 2023 12:06:18 +0200 Subject: [PATCH 054/189] fix typo in docs --- src/transformers/models/patchtst/modeling_patchtst.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c6397ecbc2c57a..6d93d7177242c2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1049,11 +1049,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @dataclass class PatchTSTOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForPrediction`]. Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. + MSE loss or nll loss. prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): @@ -1302,7 +1302,7 @@ def forward( @dataclass class PatchTSTForForecastingOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForForecasting`]. Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): From e5cf09d6270289544618a05e68e76ee780680c1f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 25 Sep 2023 12:12:27 +0200 Subject: [PATCH 055/189] add generate function --- .../models/patchtst/modeling_patchtst.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6d93d7177242c2..9019a2beea0854 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -23,7 +23,7 @@ import torch from torch import nn -from ...modeling_outputs import BaseModelOutputWithNoAttention +from ...modeling_outputs import BaseModelOutputWithNoAttention, SampleTSPredictionOutput from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import ModelOutput, add_start_docstrings, logging @@ -1297,6 +1297,25 @@ def forward( else: loss_val = self.loss(y_hat * model_output.scale + model_output.loc, future_values) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + + @torch.no_grad() + def generate(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> SampleTSPredictionOutput: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + model_output = self.model(past_values, output_hidden_states=output_hidden_states) + y_hat = self.head(model_output.last_hidden_state) + if self.distribution_output: + distribution = self.distribution_output.distribution( + y_hat, loc=model_output.loc, scale=model_output.scale + ) + y_hat = distribution.sample(sample_shape=(self.config.num_parallel_samples,)) + else: + y_hat = y_hat * model_output.scale + model_output.loc + + return SampleTSPredictionOutput(sequences=y_hat) + + @dataclass From 5a7fb303e9fa8c6a7538a2ff904a9c9306620c2b Mon Sep 17 
00:00:00 2001 From: Kashif Rasul Date: Mon, 25 Sep 2023 12:13:13 +0200 Subject: [PATCH 056/189] formatting --- .../models/patchtst/modeling_patchtst.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9019a2beea0854..587626a7e455b1 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1297,25 +1297,23 @@ def forward( else: loss_val = self.loss(y_hat * model_output.scale + model_output.loc, future_values) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) - + @torch.no_grad() - def generate(self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None) -> SampleTSPredictionOutput: + def generate( + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + ) -> SampleTSPredictionOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) if self.distribution_output: - distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale - ) + distribution = self.distribution_output.distribution(y_hat, loc=model_output.loc, scale=model_output.scale) y_hat = distribution.sample(sample_shape=(self.config.num_parallel_samples,)) else: y_hat = y_hat * model_output.scale + model_output.loc - - return SampleTSPredictionOutput(sequences=y_hat) - + return SampleTSPredictionOutput(sequences=y_hat) @dataclass From 18a43f56ba4c5b8faa9bf60dd5d3f4db2c229937 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 25 Sep 2023 13:58:30 +0200 Subject: [PATCH 057/189] add num_parallel_samples --- src/transformers/models/patchtst/configuration_patchtst.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 3b306268785f74..81a438bc2703f6 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -122,7 +122,8 @@ class PatchTSTConfig(PretrainedConfig): Number of output channels. prediction_range (`list`, *optional*, defaults to None): The range of prediction values can be set to enforce the model to produce values within a range. - + num_parallel_samples (`int`, *optional*, defaults to 100): + The number of samples to generate in parallel for probablistic forecast. 
Example: @@ -173,6 +174,7 @@ def __init__( positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, + num_parallel_samples: int = 100, init_std: float = 0.02, shared_projection: bool = True, seed_number: int = None, @@ -200,6 +202,7 @@ def __init__( self.num_input_channels = num_input_channels # n_vars self.loss = loss self.distribution_output = distribution_output + self.num_parallel_samples = num_parallel_samples # Transformer architecture configuration self.d_model = d_model From b54047ba7698cd5de7acbe69d3de7d80735876ba Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 27 Sep 2023 23:06:16 -0400 Subject: [PATCH 058/189] Fix a typo --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 587626a7e455b1..0ce3bbebf72db0 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1218,7 +1218,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) self.args_proj = None else: - self.linear = nn.Linearr(head_dim, config.prediction_length * self.dist_output_size) + self.linear = nn.Linear(head_dim, config.prediction_length * self.dist_output_size) self.args_proj = distribution_output.get_parameter_projection(self.dist_output_size) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() From 406cf00e3eddc984efdac629cbf60701a0daafff Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 1 Oct 2023 21:44:12 -0400 Subject: [PATCH 059/189] copy weighted_average function, edit PredictionHead --- .../models/patchtst/modeling_patchtst.py | 98 ++++++++++--------- 1 file changed, 54 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 0ce3bbebf72db0..c5ebdafb103297 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -23,7 +23,7 @@ import torch from torch import nn -from ...modeling_outputs import BaseModelOutputWithNoAttention, SampleTSPredictionOutput +from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import ModelOutput, add_start_docstrings, logging @@ -840,6 +840,31 @@ def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch. return -input.log_prob(target) +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average +def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: + """ + Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero, + meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. + + Args: + input_tensor (`torch.FloatTensor`): + Input tensor, of which the average must be computed. + weights (`torch.FloatTensor`, *optional*): + Weights tensor, of the same shape as `input_tensor`. + dim (`int`, *optional*): + The dim along which to average `input_tensor`. 
+ + Returns: + `torch.FloatTensor`: The tensor with values averaged along the specified `dim`. + """ + if weights is not None: + weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor)) + sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0) + return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights + else: + return input_tensor.mean(dim=dim) + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ @@ -1049,11 +1074,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @dataclass class PatchTSTOutput(ModelOutput): """ - Output type of [`PatchTSTForPrediction`]. + Output type of [`PatchTSTForPredictiontion`]. Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss or nll loss. + MSE loss. prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): @@ -1206,8 +1231,7 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.num_output_channels = config.num_output_channels - self.dist_output_size = config.num_output_channels * config.d_model // config.encoder_attention_heads + self.num_output_channels = config.num_output_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling @@ -1217,9 +1241,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): if distribution_output is None: self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) self.args_proj = None - else: - self.linear = nn.Linear(head_dim, config.prediction_length * self.dist_output_size) - self.args_proj = distribution_output.get_parameter_projection(self.dist_output_size) + else: + self.args_proj = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x): @@ -1238,15 +1261,18 @@ def forward(self, x): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - # flatten the input - x = self.flatten(x) # x: bs x (nvars * d_model) - y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) + # flatten the input + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) if self.args_proj is None: - # reshape the data - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + y = self.linear(x) # y: bs x (pred_len * num_output_channels) + # reshape the data to [bs x pred_len x num_output_channels] + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] else: - # reshape and project prarameters of distribution - y = self.args_proj(y.reshape(batch_size, -1, self.dist_output_size)) + # project prarameters of distribution + y = self.args_proj(x) + # reshape the data to be a tuple of [bs x pred_len x num_output_channels] + y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) return y @@ -1263,11 +1289,11 @@ def __init__(self, config: 
PatchTSTConfig): else: self.loss = nll if config.distribution_output == "student_t": - self.distribution_output = StudentTOutput(dim=config.num_output_channels) + self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "normal": - self.distribution_output = NormalOutput(dim=config.num_output_channels) + self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput(dim=config.num_output_channels) + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1286,40 +1312,24 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) + y_hat = self.head(model_output.last_hidden_state) + loss_val = None - if future_values is not None: - if self.distribution_output: - distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale - ) + if future_values is not None: + if self.distribution_output: + distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, future_values) - else: - loss_val = self.loss(y_hat * model_output.scale + model_output.loc, future_values) + # take average of the loss + loss_val = weighted_average(loss_val) + else: + loss_val = self.loss(y_hat, future_values) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) - @torch.no_grad() - def generate( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None - ) -> SampleTSPredictionOutput: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) - if self.distribution_output: - distribution = self.distribution_output.distribution(y_hat, loc=model_output.loc, scale=model_output.scale) - y_hat = distribution.sample(sample_shape=(self.config.num_parallel_samples,)) - else: - y_hat = y_hat * model_output.scale + model_output.loc - - return SampleTSPredictionOutput(sequences=y_hat) - @dataclass class PatchTSTForForecastingOutput(ModelOutput): """ - Output type of [`PatchTSTForForecasting`]. + Output type of [`PatchTSTForPredictiontion`]. 
Args: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): From e89477c194f8ff77019e3efd8a2a32cb720e1992 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 1 Oct 2023 21:53:13 -0400 Subject: [PATCH 060/189] edit PredictionHead --- .../models/patchtst/modeling_patchtst.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c5ebdafb103297..04c23a0a67a24a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1238,11 +1238,16 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) + # if distribution_output is None: + # self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + # self.args_proj = None + # else: + # self.args_proj = distribution_output.get_parameter_projection(head_dim) if distribution_output is None: - self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - self.args_proj = None + self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) else: - self.args_proj = distribution_output.get_parameter_projection(head_dim) + self.projection = distribution_output.get_parameter_projection(head_dim) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x): @@ -1263,17 +1268,13 @@ def forward(self, x): # flatten the input x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # y = self.linear(self.dropout(x)) # y: bs x (pred_len * num_output_channels) - if self.args_proj is None: - y = self.linear(x) # y: bs x (pred_len * num_output_channels) - # reshape the data to [bs x pred_len x num_output_channels] + # projection + y = self.projection(x) + # reshape y + if isinstance(y, tuple): # for distribution head + y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) # tuple of [bs x pred_len x num_output_channels] + else: # for linear head y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] - else: - # project prarameters of distribution - y = self.args_proj(x) - # reshape the data to be a tuple of [bs x pred_len x num_output_channels] - y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) - return y From a0815ee1d9ccb3746561c1124b7aeaf89c77c398 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 2 Oct 2023 14:28:06 -0400 Subject: [PATCH 061/189] add distribution head to forecasting --- .../models/patchtst/modeling_patchtst.py | 96 ++++++++++++++----- 1 file changed, 71 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 04c23a0a67a24a..e695a6224fb9ed 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1238,11 +1238,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) - # if distribution_output is None: - # self.linear = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - # self.args_proj = None - # else: - # self.args_proj = 
distribution_output.get_parameter_projection(head_dim) + if distribution_output is None: self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) else: @@ -1324,7 +1320,10 @@ def forward( loss_val = weighted_average(loss_val) else: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) @dataclass @@ -1359,7 +1358,7 @@ class PatchTSTForForecastingOutput(ModelOutput): class ForecastHead(nn.Module): - def __init__(self, config: PatchTSTConfig): + def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.shared_projection = config.shared_projection @@ -1369,16 +1368,32 @@ def __init__(self, config: PatchTSTConfig): head_dim = config.d_model if self.pooling else config.d_model * config.num_patches if not self.shared_projection: - self.linears = nn.ModuleList() + # if each channel has its own head + self.projections = nn.ModuleList() self.dropouts = nn.ModuleList() self.flattens = nn.ModuleList() for i in range(self.num_input_channels): self.flattens.append(nn.Flatten(start_dim=2)) - self.linears.append(nn.Linear(head_dim, config.prediction_length)) + if distribution_output is None: + # use linear head + self.projections.append( + nn.Linear(head_dim, config.prediction_length) + ) + else: + # use distribution head + self.projections.append( + distribution_output.get_parameter_projection(head_dim) + ) self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: + # all the channels share the same head self.flatten = nn.Flatten(start_dim=2) - self.linear = nn.Linear(head_dim, config.prediction_length) + if distribution_output is None: + # use linear head + self.projection = nn.Linear(head_dim, config.prediction_length) + else: + # use distribution head + self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() def forward(self, x: torch.Tensor): @@ -1402,16 +1417,19 @@ def forward(self, x: torch.Tensor): x_out = [] for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] - z = self.linears[i](z) # z: [bs x forecast_len] z = self.dropouts[i](z) + z = self.projections[i](z) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) - x = self.linear(z) # x: [bs x nvars x forecast_len] + x = self.projection(z) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head - x = x.transpose(2, 1) # [bs x forecast_len x nvars] + if isinstance(x, tuple): + x = (z.transpose(2,1) for z in x) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + else: + x = x.transpose(2, 1) # [bs x forecast_len x nvars] return x @@ -1421,8 +1439,22 @@ class PatchTSTForForecasting(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) - self.head = ForecastHead(config) - self.loss = nn.MSELoss(reduction="mean") + + if config.loss == "mse": + 
self.loss = nn.MSELoss(reduction="mean") + self.distribution_output = None + else: + self.loss = nll + if config.distribution_output == "student_t": + self.distribution_output = StudentTOutput(dim=config.prediction_length) + elif config.distribution_output == "normal": + self.distribution_output = NormalOutput(dim=config.prediction_length) + elif config.distribution_output == "negative_binomial": + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length) + else: + raise ValueError(f"Unknown distribution output {config.distribution_output}") + + self.head = ForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1438,19 +1470,30 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states - ) + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states + ) y_hat = self.head(model_output.last_hidden_state) - y_hat = y_hat * model_output.scale + model_output.loc - + loss_val = None if future_values is not None: - loss_val = self.loss(y_hat, future_values) - return PatchTSTForForecastingOutput( - loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states - ) + if self.distribution_output: + distribution = self.distribution_output.distribution(y_hat, + loc=model_output.loc, + scale=model_output.scale) + loss_val = self.loss(distribution, future_values) + # take average of the loss + loss_val = weighted_average(loss_val) + else: + y_hat = y_hat * model_output.scale + model_output.loc + loss_val = self.loss(y_hat, future_values) + + return PatchTSTForForecastingOutput(loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states + ) class RegressionHead(nn.Module): @@ -1514,4 +1557,7 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) From d727ef7e69d98cb54cf9810f04b1a7357be7b729 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 4 Oct 2023 09:38:08 +0200 Subject: [PATCH 062/189] formatting --- .../models/patchtst/modeling_patchtst.py | 88 +++++++++---------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e695a6224fb9ed..e988af3992b586 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -863,7 +863,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights else: return input_tensor.mean(dim=dim) - + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): @@ -1231,7 +1231,7 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.num_output_channels = config.num_output_channels + self.num_output_channels = config.num_output_channels 
self.use_cls_token = config.use_cls_token self.pooling = config.pooling @@ -1240,8 +1240,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flatten = nn.Flatten(start_dim=1) if distribution_output is None: - self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - else: + self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + else: self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() @@ -1262,15 +1262,17 @@ def forward(self, x): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # projection + # flatten the input + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection y = self.projection(x) # reshape y - if isinstance(y, tuple): # for distribution head - y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) # tuple of [bs x pred_len x num_output_channels] - else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + if isinstance(y, tuple): # for distribution head + y = ( + z.reshape(batch_size, -1, self.num_output_channels) for z in y + ) # tuple of [bs x pred_len x num_output_channels] + else: # for linear head + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y @@ -1290,7 +1292,9 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = NegativeBinomialOutput( + dim=config.prediction_length * config.num_output_channels + ) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1309,21 +1313,18 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) + y_hat = self.head(model_output.last_hidden_state) loss_val = None - if future_values is not None: - if self.distribution_output: + if future_values is not None: + if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) - else: + else: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) @dataclass @@ -1376,14 +1377,10 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flattens.append(nn.Flatten(start_dim=2)) if distribution_output is None: # use linear head - self.projections.append( - nn.Linear(head_dim, config.prediction_length) - ) + self.projections.append(nn.Linear(head_dim, config.prediction_length)) else: # use distribution head - self.projections.append( - distribution_output.get_parameter_projection(head_dim) - ) + 
self.projections.append(distribution_output.get_parameter_projection(head_dim)) self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: # all the channels share the same head @@ -1418,16 +1415,20 @@ def forward(self, x: torch.Tensor): for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.dropouts[i](z) - z = self.projections[i](z) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + z = self.projections[i]( + z + ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) - x = self.projection(z) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head + x = self.projection( + z + ) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head if isinstance(x, tuple): - x = (z.transpose(2,1) for z in x) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + x = (z.transpose(2, 1) for z in x) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) else: x = x.transpose(2, 1) # [bs x forecast_len x nvars] @@ -1454,7 +1455,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = ForecastHead(config, self.distribution_output) + self.head = ForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1470,19 +1471,18 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states - ) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) y_hat = self.head(model_output.last_hidden_state) - + loss_val = None if future_values is not None: if self.distribution_output: - distribution = self.distribution_output.distribution(y_hat, - loc=model_output.loc, - scale=model_output.scale) + distribution = self.distribution_output.distribution( + y_hat, loc=model_output.loc, scale=model_output.scale + ) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) @@ -1490,10 +1490,9 @@ def forward( y_hat = y_hat * model_output.scale + model_output.loc loss_val = self.loss(y_hat, future_values) - return PatchTSTForForecastingOutput(loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTForForecastingOutput( + loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states + ) class RegressionHead(nn.Module): @@ -1557,7 +1556,4 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) From e391bd3933753abd52fa700dd6f41a56208f51bd Mon Sep 17 00:00:00 2001 From: 
nnguyen Date: Wed, 4 Oct 2023 10:51:33 -0400 Subject: [PATCH 063/189] Add generate function for forecasting --- .../models/patchtst/modeling_patchtst.py | 168 ++++++++++++------ 1 file changed, 116 insertions(+), 52 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e988af3992b586..f4ebfee875c5ae 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -863,7 +863,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights else: return input_tensor.mean(dim=dim) - + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): @@ -1138,7 +1138,10 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=masked_loss, + prediction_output=x_hat, + hidden_states=model_output.hidden_states + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1165,8 +1168,10 @@ def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states - ) + loss=loss_val, + prediction_logits=y_hat, + hidden_states=model_output.hidden_states + ) class ClassificationHead(nn.Module): @@ -1231,7 +1236,7 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.num_output_channels = config.num_output_channels + self.num_output_channels = config.num_output_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling @@ -1240,8 +1245,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flatten = nn.Flatten(start_dim=1) if distribution_output is None: - self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - else: + self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + else: self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() @@ -1262,17 +1267,15 @@ def forward(self, x): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # projection + # flatten the input + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection y = self.projection(x) # reshape y - if isinstance(y, tuple): # for distribution head - y = ( - z.reshape(batch_size, -1, self.num_output_channels) for z in y - ) # tuple of [bs x pred_len x num_output_channels] - else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + if isinstance(y, tuple): # for distribution head + y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) # tuple of [bs x pred_len x num_output_channels] + else: # for linear head + y = 
y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y @@ -1292,9 +1295,7 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput( - dim=config.prediction_length * config.num_output_channels - ) + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1313,18 +1314,21 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) + y_hat = self.head(model_output.last_hidden_state) loss_val = None - if future_values is not None: - if self.distribution_output: + if future_values is not None: + if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) - else: + else: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) @dataclass @@ -1356,6 +1360,22 @@ class PatchTSTForForecastingOutput(ModelOutput): forecast_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None + loc: torch.FloatTensor = None + scale: torch.FloatTensor = None + + +@dataclass +class SamplePatchTSTForecastOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Args: + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or + `(batch_size, num_samples, prediction_length, number_channels)`): + Sampled values from the chosen distribution. 
+ """ + sequences: torch.FloatTensor = None class ForecastHead(nn.Module): @@ -1377,10 +1397,14 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flattens.append(nn.Flatten(start_dim=2)) if distribution_output is None: # use linear head - self.projections.append(nn.Linear(head_dim, config.prediction_length)) + self.projections.append( + nn.Linear(head_dim, config.prediction_length) + ) else: # use distribution head - self.projections.append(distribution_output.get_parameter_projection(head_dim)) + self.projections.append( + distribution_output.get_parameter_projection(head_dim) + ) self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: # all the channels share the same head @@ -1415,24 +1439,20 @@ def forward(self, x: torch.Tensor): for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.dropouts[i](z) - z = self.projections[i]( - z - ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + z = self.projections[i](z) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) - x = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] + output = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) - x = self.projection( - z - ) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head - - if isinstance(x, tuple): - x = (z.transpose(2, 1) for z in x) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + output = self.projection(z) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head + + if isinstance(output, tuple): + output = tuple(z.transpose(2,1) for z in output) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) else: - x = x.transpose(2, 1) # [bs x forecast_len x nvars] + output = output.transpose(2, 1) # [bs x forecast_len x nvars] - return x + return output class PatchTSTForForecasting(PatchTSTPreTrainedModel): @@ -1455,7 +1475,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = ForecastHead(config, self.distribution_output) + self.head = ForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1463,7 +1483,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor], + future_values: Optional[torch.Tensor] = None, past_observed_mask: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, @@ -1471,28 +1491,69 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states - ) - - y_hat = self.head(model_output.last_hidden_state) + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states + ) + y_hat = self.head(model_output.last_hidden_state) + loss_val = None + if 
future_values is not None: if self.distribution_output: - distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale - ) + distribution = self.distribution_output.distribution(y_hat, + loc=model_output.loc, + scale=model_output.scale) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) + # for testing + # loss_val = nn.MSELoss(reduction='none')(distribution.mean, future_values) + # loss_val = weighted_average(loss_val) else: y_hat = y_hat * model_output.scale + model_output.loc loss_val = self.loss(y_hat, future_values) - return PatchTSTForForecastingOutput( - loss=loss_val, forecast_outputs=y_hat, hidden_states=model_output.hidden_states - ) + return PatchTSTForForecastingOutput(loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states, + loc=model_output.loc, + scale=model_output.scale + ) + + def generate(self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None + ): + """ + Return: + [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of + samples, prediction_length)` or `(batch_size, number of samples, prediction_length, number_channels)` for + multivariate predictions. + """ + # get number of samples + num_parallel_samples = self.config.num_parallel_samples + + # get model output + outputs = self(past_values=past_values, + future_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states + ) + + # get distribution + distribution = self.distribution_output.distribution( + outputs.forecast_outputs, + loc=outputs.loc, + scale=outputs.scale + ) + # get samples + samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x forecast_len x nvars] + # stack tensors + samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] + return samples class RegressionHead(nn.Module): @@ -1556,4 +1617,7 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + return PatchTSTOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) From a50f6c23dcaa3a4a8df6fdc9cc53045601cc3173 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 4 Oct 2023 23:08:15 -0400 Subject: [PATCH 064/189] Add generate function to prediction task --- .../models/patchtst/configuration_patchtst.py | 8 +- .../models/patchtst/modeling_patchtst.py | 418 ++++++++++++------ 2 files changed, 284 insertions(+), 142 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 81a438bc2703f6..e55737adcf4dda 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -38,7 +38,7 @@ class PatchTSTConfig(PretrainedConfig): Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: + Parameters: num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. 
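Put together, the forecasting pieces added in the last few patches are intended to be used roughly as follows. This is a sketch against the in-progress API in the diffs above (the class names, the `loss="nll"` switch that enables the distribution head, and the concrete sizes are taken from or assumed for this branch, not from a released version):

```python
import torch
from transformers import PatchTSTConfig
from transformers.models.patchtst.modeling_patchtst import PatchTSTForForecasting

config = PatchTSTConfig(
    num_input_channels=7,
    context_length=512,
    prediction_length=96,
    loss="nll",                      # anything other than "mse" selects the distribution head
    distribution_output="student_t",
    num_parallel_samples=100,
)
model = PatchTSTForForecasting(config)

past_values = torch.randn(4, config.context_length, config.num_input_channels)
future_values = torch.randn(4, config.prediction_length, config.num_input_channels)

# Training step: the forward pass returns the averaged negative log-likelihood.
outputs = model(past_values=past_values, future_values=future_values)
outputs.loss.backward()

# Inference: draw `num_parallel_samples` trajectories from the predicted distribution.
with torch.no_grad():
    samples = model.generate(past_values=past_values).sequences
# samples: [batch_size x num_parallel_samples x prediction_length x num_input_channels]
point_forecast = samples.median(dim=1).values
```

For the MSE configuration (`loss="mse"`) the `forecast_outputs` returned by the forward pass are already a point forecast, and `generate` does not apply because there is no distribution head to sample from.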
@@ -173,8 +173,7 @@ def __init__( pre_norm: bool = False, positional_encoding: str = "sincos", learn_pe: bool = False, - use_cls_token: bool = False, - num_parallel_samples: int = 100, + use_cls_token: bool = False, init_std: float = 0.02, shared_projection: bool = True, seed_number: int = None, @@ -195,6 +194,8 @@ def __init__( prediction_length: int = 24, num_output_channels: int = 1, prediction_range: List = None, + # distribution head + num_parallel_samples: int = 100, **kwargs, ): # time series specific configuration @@ -254,6 +255,7 @@ def __init__( # Forcasting and prediction self.prediction_length = prediction_length + self.num_parallel_samples = num_parallel_samples # Regression self.num_output_channels = num_output_channels diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f4ebfee875c5ae..46bd8fc8936635 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -266,7 +266,7 @@ def random_masking( ): """random_masking: Mask the input considering the control variables. - Args: + Parameters: xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] mask_ratio (float): Mask ratio. unmasked_channel_indices (list, optional): @@ -297,8 +297,8 @@ def random_masking( mask[:, :, :len_keep] = 0 # sort noise for each sample - ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove - ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + ids_shuffle = torch.Parametersort(noise, dim=-1) # ascend: small is keep, large is remove + ids_restore = torch.Parametersort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] mask = torch.gather(mask, dim=-1, index=ids_restore) mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] @@ -317,7 +317,7 @@ class Patchify(nn.Module): """ A class to patchify the time series sequence into different patches - Args: + Parameters: sequence_length (int, required): input sequence length. patch_length (int, required): patch length. stride (int, required): stride between patches. @@ -350,7 +350,7 @@ def __init__( def forward(self, past_values: torch.Tensor): """ - Args: + Parameters: past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] Returns: @@ -371,7 +371,7 @@ def forward(self, past_values: torch.Tensor): class PatchEmbeddings(nn.Module): """ - Args: + Parameters: A class to patchify the time series sequence into different patches sequence_length (int, required): input sequence length. patch_length (int, required): patch length. stride (int, required): stride between patches. @@ -409,7 +409,7 @@ def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_d def forward(self, past_values: torch.Tensor): """ - Args: + Parameters: past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] Returns: embeddings: output tensor data [bs x num_input_channels x num_patches x emb_dim] @@ -436,7 +436,7 @@ class PatchMasking(nn.Module): """ PatchMasking: Class to random or forcast masking. - Args: + Parameters: mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. mask_ratio (float, optional): Mask ratio. mask_patches (list, optional): List of patch lengths to mask in the end of the data. 
@@ -706,7 +706,7 @@ def forward( self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ - Args: + Parameters: past_values: tensor [bs x nvars x num_patches x patch_length]. output_hidden_states (bool, optional): Indicates if hidden states should be output. @@ -767,7 +767,7 @@ def forward( """ PATCHTST_INPUTS_DOCSTRING = r""" - Args: + Parameters: past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, num_input_channels)`): Past values of the time series, that serve as context in order to predict the future. The sequence size of this tensor must be larger than the `context_length` of the model, since the model will use the larger size @@ -807,7 +807,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): """ Base class for model's outputs, with potential hidden states. - Args: + Parameters: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): @@ -832,6 +832,183 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): scale: torch.FloatTensor = None +@dataclass +class PatchTSTForMaskPretrainingOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_output: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PatchTSTForPredictionOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_output: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PatchTSTOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_output: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PatchTSTForClassificationOutput(ModelOutput): + """ + Output type of [`PatchTSTForClassification`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class SamplePatchTSTPredictionOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Parameters: + sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): + Sampled values from the chosen distribution. + """ + sequences: torch.FloatTensor = None + + +@dataclass +class PatchTSTForForecastingOutput(ModelOutput): + """ + Output type of [`PatchTSTForPredictiontion`]. + + Parameters: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + MSE loss. + + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Forecasting outputs of the time series modeling heads. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + forecast_outputs: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + loc: torch.FloatTensor = None + scale: torch.FloatTensor = None + + +@dataclass +class SamplePatchTSTForecastOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Parameters: + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or + `(batch_size, num_samples, prediction_length, number_channels)`): + Sampled values from the chosen distribution. + """ + sequences: torch.FloatTensor = None + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor: """ @@ -846,7 +1023,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero, meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. - Args: + Parameters: input_tensor (`torch.FloatTensor`): Input tensor, of which the average must be computed. weights (`torch.FloatTensor`, *optional*): @@ -871,7 +1048,7 @@ class PatchTSTStdScaler(nn.Module): Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - Args: + Parameters: dim (`int`): Dimension along which to calculate the mean and standard deviation. 
keepdim (`bool`, *optional*, defaults to `False`): @@ -905,7 +1082,7 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data accordingly. - Args: + Parameters: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -962,7 +1139,7 @@ class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - Args: + Parameters: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -1024,7 +1201,7 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ): + ) -> PatchTSTModelOutputWithNoAttention: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1071,35 +1248,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -@dataclass -class PatchTSTOutput(ModelOutput): - """ - Output type of [`PatchTSTForPredictiontion`]. - - Args: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - - loss: Optional[torch.FloatTensor] = None - prediction_output: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - class PatchTSTForMaskPretraining(PatchTSTPreTrainedModel): # PatchTSTModel + Pretraining Head def __init__(self, config: PatchTSTConfig): @@ -1118,7 +1266,7 @@ def forward( past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTOutput: + ) -> PatchTSTForMaskPretrainingOutput: """ past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels """ @@ -1138,10 +1286,10 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTOutput(loss=masked_loss, - prediction_output=x_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTForMaskPretrainingOutput(loss=masked_loss, + prediction_output=x_hat, + hidden_states=model_output.hidden_states + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1156,7 +1304,12 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, past_values, labels=None, output_hidden_states: Optional[bool] = None): + def forward(self, + past_values: torch.Tensor, + labels: torch.Tensor = None, + output_hidden_states: Optional[bool] = None + ) -> PatchTSTForClassificationOutput: + output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1183,7 +1336,7 @@ def __init__(self, config: PatchTSTConfig): self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_classes) - def forward(self, x): + def forward(self, x: torch.Tensor): """ x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: [bs x n_classes] @@ -1202,36 +1355,6 @@ def forward(self, x): return y -@dataclass -class PatchTSTForClassificationOutput(ModelOutput): - """ - Output type of [`PatchTSTForClassification`]. - - Args: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1251,7 +1374,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - def forward(self, x): + def forward(self, x: torch.Tensor): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token @@ -1308,12 +1431,18 @@ def forward( self, past_values: torch.Tensor, future_values: Optional[torch.Tensor] = None, + past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ): + ) -> PatchTSTForPredictionOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model(past_values, output_hidden_states=output_hidden_states) + # get model output + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states) + + # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape y_hat = self.head(model_output.last_hidden_state) loss_val = None @@ -1325,57 +1454,54 @@ def forward( loss_val = weighted_average(loss_val) else: loss_val = self.loss(y_hat, future_values) - return PatchTSTOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - - -@dataclass -class PatchTSTForForecastingOutput(ModelOutput): - """ - Output type of [`PatchTSTForPredictiontion`]. - - Args: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - - forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Forecasting outputs of the time series modeling heads. - - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. + + return PatchTSTForPredictionOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ + def generate(self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTPredictionOutput: + """ + Generate sequences of sample predictions from a model with a probability distribution head. 
- loss: Optional[torch.FloatTensor] = None - forecast_outputs: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - loc: torch.FloatTensor = None - scale: torch.FloatTensor = None + Parameters: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Past values of the time series that serves as context in order to predict the future. + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: -@dataclass -class SamplePatchTSTForecastOutput(ModelOutput): - """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + Return: + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of + samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for + multivariate predictions. + """ + # get number of samples + num_parallel_samples = self.config.num_parallel_samples - Args: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or - `(batch_size, num_samples, prediction_length, number_channels)`): - Sampled values from the chosen distribution. - """ - sequences: torch.FloatTensor = None + # get model output + outputs = self(past_values=past_values, + future_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=None + ) + + # get distribution + distribution = self.distribution_output.distribution( + outputs.prediction_output + ) + # get samples + samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x pred_len x num_output_channels] + # stack tensors + samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] + return SamplePatchTSTPredictionOutput(sequences=samples) class ForecastHead(nn.Module): @@ -1487,15 +1613,16 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ): + ) -> PatchTSTForForecastingOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + # get model output model_output = self.model(past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states ) - + # get output head y_hat = self.head(model_output.last_hidden_state) loss_val = None @@ -1524,13 +1651,25 @@ def forward( def generate(self, past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None - ): + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTForecastOutput: """ + Generate sequences of sample predictions from a model with a probability distribution head. + + Parameters: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Past values of the time series that serves as context in order to predict the future. 
+ + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + Return: [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, prediction_length)` or `(batch_size, number of samples, prediction_length, number_channels)` for + samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. """ # get number of samples @@ -1540,7 +1679,7 @@ def generate(self, outputs = self(past_values=past_values, future_values=None, past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states + output_hidden_states=None ) # get distribution @@ -1553,7 +1692,7 @@ def generate(self, samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x forecast_len x nvars] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] - return samples + return SamplePatchTSTForecastOutput(sequences=samples) class RegressionHead(nn.Module): @@ -1621,3 +1760,4 @@ def forward( prediction_output=y_hat, hidden_states=model_output.hidden_states ) + From 8daf165c9fe6903a4e7365613f132a131a85da0d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 10:09:12 +0200 Subject: [PATCH 065/189] formatting --- src/transformers/models/auto/modeling_auto.py | 4 +- .../models/patchtst/configuration_patchtst.py | 2 +- .../models/patchtst/modeling_patchtst.py | 241 +++++++++--------- 3 files changed, 124 insertions(+), 123 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b82b189f5bb91a..22feb8a125ee83 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1126,8 +1126,8 @@ [ ("patchtst", "PatchTSTForRegression"), ] -) - +) + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = OrderedDict( [ ("swin2sr", "Swin2SRForImageSuperResolution"), diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e55737adcf4dda..71aef0b436bbd3 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -173,7 +173,7 @@ def __init__( pre_norm: bool = False, positional_encoding: str = "sincos", learn_pe: bool = False, - use_cls_token: bool = False, + use_cls_token: bool = False, init_std: float = 0.02, shared_projection: bool = True, seed_number: int = None, diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 46bd8fc8936635..5e13cc9fda7740 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -959,6 +959,7 @@ class SamplePatchTSTPredictionOutput(ModelOutput): sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): Sampled values from the chosen distribution. """ + sequences: torch.FloatTensor = None @@ -1002,10 +1003,11 @@ class SamplePatchTSTForecastOutput(ModelOutput): distribution. 
Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, number_channels)`): Sampled values from the chosen distribution. """ + sequences: torch.FloatTensor = None @@ -1023,7 +1025,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero, meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. - Parameters: + Args: input_tensor (`torch.FloatTensor`): Input tensor, of which the average must be computed. weights (`torch.FloatTensor`, *optional*): @@ -1040,7 +1042,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights else: return input_tensor.mean(dim=dim) - + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): @@ -1048,7 +1050,7 @@ class PatchTSTStdScaler(nn.Module): Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - Parameters: + Args: dim (`int`): Dimension along which to calculate the mean and standard deviation. keepdim (`bool`, *optional*, defaults to `False`): @@ -1082,7 +1084,7 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data accordingly. - Parameters: + Args: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -1139,7 +1141,7 @@ class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - Parameters: + Args: dim (`int`): Dimension along which to compute the scale. 
keepdim (`bool`, *optional*, defaults to `False`): @@ -1286,10 +1288,9 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) - return PatchTSTForMaskPretrainingOutput(loss=masked_loss, - prediction_output=x_hat, - hidden_states=model_output.hidden_states - ) + return PatchTSTForMaskPretrainingOutput( + loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1304,12 +1305,9 @@ def __init__(self, config: PatchTSTConfig): # Initialize weights and apply final processing self.post_init() - def forward(self, - past_values: torch.Tensor, - labels: torch.Tensor = None, - output_hidden_states: Optional[bool] = None - ) -> PatchTSTForClassificationOutput: - + def forward( + self, past_values: torch.Tensor, labels: torch.Tensor = None, output_hidden_states: Optional[bool] = None + ) -> PatchTSTForClassificationOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1321,10 +1319,8 @@ def forward(self, if labels is not None: loss_val = self.loss(y_hat, labels) return PatchTSTForClassificationOutput( - loss=loss_val, - prediction_logits=y_hat, - hidden_states=model_output.hidden_states - ) + loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states + ) class ClassificationHead(nn.Module): @@ -1359,7 +1355,7 @@ class PredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.num_output_channels = config.num_output_channels + self.num_output_channels = config.num_output_channels self.use_cls_token = config.use_cls_token self.pooling = config.pooling @@ -1368,8 +1364,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flatten = nn.Flatten(start_dim=1) if distribution_output is None: - self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - else: + self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) + else: self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() @@ -1390,15 +1386,17 @@ def forward(self, x: torch.Tensor): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # projection + # flatten the input + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection y = self.projection(x) # reshape y - if isinstance(y, tuple): # for distribution head - y = (z.reshape(batch_size, -1, self.num_output_channels) for z in y) # tuple of [bs x pred_len x num_output_channels] - else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + if isinstance(y, tuple): # for distribution head + y = ( + z.reshape(batch_size, -1, self.num_output_channels) for z in y + ) # tuple of [bs x pred_len x num_output_channels] + else: # for linear head + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] return y @@ -1418,7 +1416,9 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * 
config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = NegativeBinomialOutput( + dim=config.prediction_length * config.num_output_channels + ) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1438,37 +1438,37 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) # get model output - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states) - - # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape - y_hat = self.head(model_output.last_hidden_state) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) + + # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape + y_hat = self.head(model_output.last_hidden_state) loss_val = None - if future_values is not None: - if self.distribution_output: + if future_values is not None: + if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) - else: + else: loss_val = self.loss(y_hat, future_values) - - return PatchTSTForPredictionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - - def generate(self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTPredictionOutput: + + return PatchTSTForPredictionOutput( + loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states + ) + + def generate( + self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTPredictionOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. Parameters: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): Past values of the time series that serves as context in order to predict the future. past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): @@ -1477,30 +1477,31 @@ def generate(self, - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - + Return: [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. 
""" - # get number of samples + # get number of samples num_parallel_samples = self.config.num_parallel_samples - # get model output - outputs = self(past_values=past_values, - future_values=None, - past_observed_mask=past_observed_mask, - output_hidden_states=None - ) - + # get model output + outputs = self( + past_values=past_values, + future_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=None, + ) + # get distribution - distribution = self.distribution_output.distribution( - outputs.prediction_output - ) + distribution = self.distribution_output.distribution(outputs.prediction_output) # get samples - samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x pred_len x num_output_channels] + samples = [ + distribution.sample() for i in range(num_parallel_samples) + ] # samples: list of [bs x pred_len x num_output_channels] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] + samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] return SamplePatchTSTPredictionOutput(sequences=samples) @@ -1523,14 +1524,10 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.flattens.append(nn.Flatten(start_dim=2)) if distribution_output is None: # use linear head - self.projections.append( - nn.Linear(head_dim, config.prediction_length) - ) + self.projections.append(nn.Linear(head_dim, config.prediction_length)) else: # use distribution head - self.projections.append( - distribution_output.get_parameter_projection(head_dim) - ) + self.projections.append(distribution_output.get_parameter_projection(head_dim)) self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()) else: # all the channels share the same head @@ -1565,16 +1562,22 @@ def forward(self, x: torch.Tensor): for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.dropouts[i](z) - z = self.projections[i](z) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + z = self.projections[i]( + z + ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) output = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] else: z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] z = self.dropout(z) - output = self.projection(z) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head - - if isinstance(output, tuple): - output = tuple(z.transpose(2,1) for z in output) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + output = self.projection( + z + ) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head + + if isinstance(output, tuple): + output = tuple( + z.transpose(2, 1) for z in output + ) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) else: output = output.transpose(2, 1) # [bs x forecast_len x nvars] @@ -1601,7 +1604,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = ForecastHead(config, self.distribution_output) + self.head = ForecastHead(config, self.distribution_output) # Initialize weights and apply final processing 
self.post_init() @@ -1618,46 +1621,47 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) # get model output - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states - ) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) # get output head - y_hat = self.head(model_output.last_hidden_state) - + y_hat = self.head(model_output.last_hidden_state) + loss_val = None - + if future_values is not None: if self.distribution_output: - distribution = self.distribution_output.distribution(y_hat, - loc=model_output.loc, - scale=model_output.scale) + distribution = self.distribution_output.distribution( + y_hat, loc=model_output.loc, scale=model_output.scale + ) loss_val = self.loss(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) - # for testing + # for testing # loss_val = nn.MSELoss(reduction='none')(distribution.mean, future_values) # loss_val = weighted_average(loss_val) else: y_hat = y_hat * model_output.scale + model_output.loc loss_val = self.loss(y_hat, future_values) - return PatchTSTForForecastingOutput(loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states, - loc=model_output.loc, - scale=model_output.scale - ) - - def generate(self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTForecastOutput: + return PatchTSTForForecastingOutput( + loss=loss_val, + forecast_outputs=y_hat, + hidden_states=model_output.hidden_states, + loc=model_output.loc, + scale=model_output.scale, + ) + + def generate( + self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTForecastOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. Parameters: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): Past values of the time series that serves as context in order to predict the future. past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): @@ -1666,32 +1670,33 @@ def generate(self, - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - + Return: [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. 
""" - # get number of samples + # get number of samples num_parallel_samples = self.config.num_parallel_samples - # get model output - outputs = self(past_values=past_values, - future_values=None, - past_observed_mask=past_observed_mask, - output_hidden_states=None - ) - + # get model output + outputs = self( + past_values=past_values, + future_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=None, + ) + # get distribution distribution = self.distribution_output.distribution( - outputs.forecast_outputs, - loc=outputs.loc, - scale=outputs.scale - ) + outputs.forecast_outputs, loc=outputs.loc, scale=outputs.scale + ) # get samples - samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x forecast_len x nvars] + samples = [ + distribution.sample() for i in range(num_parallel_samples) + ] # samples: list of [bs x forecast_len x nvars] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] + samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] return SamplePatchTSTForecastOutput(sequences=samples) @@ -1756,8 +1761,4 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) From e2f8fd8d563e3a13a6f9da44b109eca19ebfbac0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 10:17:56 +0200 Subject: [PATCH 066/189] use argsort --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5e13cc9fda7740..a98a101abb411a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -297,8 +297,8 @@ def random_masking( mask[:, :, :len_keep] = 0 # sort noise for each sample - ids_shuffle = torch.Parametersort(noise, dim=-1) # ascend: small is keep, large is remove - ids_restore = torch.Parametersort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] mask = torch.gather(mask, dim=-1, index=ids_restore) mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] From 1c8ec9dfe3c54f97af23ddbb2ea1401e92e6581d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 10:28:52 +0200 Subject: [PATCH 067/189] add past_observed_mask ordering --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- tests/models/patchtst/test_modeling_patchtst.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a98a101abb411a..180eec91af9f0a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1430,8 +1430,8 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, past_observed_mask: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, 
output_hidden_states: Optional[bool] = None, ) -> PatchTSTForPredictionOutput: output_hidden_states = ( @@ -1612,8 +1612,8 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - future_values: Optional[torch.Tensor] = None, past_observed_mask: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ) -> PatchTSTForForecastingOutput: diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 4f3cb2f1f465bc..a7c6c6e2186b19 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -277,6 +277,7 @@ def test_forward_signature(self): expected_arg_names = [ "past_values", + "past_observed_mask", "future_values", ] if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( From 7ebaa61ffc3b39ba8ecaeb130e528d1f09af1dd9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:09:15 +0200 Subject: [PATCH 068/189] fix arguments --- .../models/patchtst/modeling_patchtst.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 180eec91af9f0a..8f08bcf4edee5e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1266,6 +1266,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ) -> PatchTSTForMaskPretrainingOutput: @@ -1306,7 +1307,11 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, labels: torch.Tensor = None, output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + past_observed_mask: Optional[bool] = None, + labels: torch.Tensor = None, + output_hidden_states: Optional[bool] = None, ) -> PatchTSTForClassificationOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1614,7 +1619,6 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - future_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, ) -> PatchTSTForForecastingOutput: output_hidden_states = ( @@ -1750,7 +1754,11 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, labels: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + past_observed_mask: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states From f3dca2530179991584ba77d4ec33ff78de3bcb84 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:09:58 +0200 Subject: [PATCH 069/189] docs --- docs/source/en/model_doc/patchtst.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 
209e50a6b12480..504be80c3e6c9a 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -19,7 +19,6 @@ rendered properly in your Markdown viewer. ## Overview The PatchTST model was proposed in [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. - The abstract from the paper is the following: @@ -27,9 +26,10 @@ The abstract from the paper is the following: Tips: - +The model also adds a time series classification pipeline and time series regression pipeline. This model was contributed by [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), and [kashif](https://huggingface.co/kashif). + The original code can be found [here](https://github.com/yuqinie98/PatchTST). From 5349bf439ee1c889f909b43e453290bd429832ab Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:42:09 +0200 Subject: [PATCH 070/189] add back test_model_outputs_equivalence test --- .../models/patchtst/modeling_patchtst.py | 71 +++++++++++++++---- .../models/patchtst/test_modeling_patchtst.py | 10 +-- 2 files changed, 59 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 8f08bcf4edee5e..ffac3bcf4db227 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -17,7 +17,7 @@ import math import random from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numpy as np import torch @@ -1203,10 +1203,12 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTModelOutputWithNoAttention: + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if past_observed_mask is None: past_observed_mask = torch.ones_like(past_values) @@ -1221,9 +1223,15 @@ def forward( else: masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) + + hidden_states = encoder_output.last_hidden_state + encoder_states = encoder_output.hidden_states + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, patched_values, mask, loc, scale] if v is not None) return PatchTSTModelOutputWithNoAttention( - last_hidden_state=encoder_output.last_hidden_state, - hidden_states=encoder_output.hidden_states, + last_hidden_state=hidden_states, + hidden_states=encoder_states, patched_input=patched_values, mask=mask, loc=loc, @@ -1269,13 +1277,15 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTForMaskPretrainingOutput: + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTForMaskPretrainingOutput]: """ past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels 
""" output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # past_values: [bs x nvars x num_patches x d_model] or # [bs x nvars x (num_patches+1) x d_model] if use cls_token @@ -1289,8 +1299,11 @@ def forward( loss_val = self.loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) + encoder_states = model_output.hidden_states + if not return_dict: + return tuple(v for v in [masked_loss, x_hat, encoder_states] if v is not None) return PatchTSTForMaskPretrainingOutput( - loss=masked_loss, prediction_output=x_hat, hidden_states=model_output.hidden_states + loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states ) @@ -1312,10 +1325,12 @@ def forward( past_observed_mask: Optional[bool] = None, labels: torch.Tensor = None, output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ) -> PatchTSTForClassificationOutput: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output[0]) @@ -1323,8 +1338,12 @@ def forward( loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) + + encoder_states = model_output.hidden_states + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) return PatchTSTForClassificationOutput( - loss=loss_val, prediction_logits=y_hat, hidden_states=model_output.hidden_states + loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states ) @@ -1438,10 +1457,13 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTForPredictionOutput: + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTForPredictionOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # get model output model_output = self.model( past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states @@ -1460,8 +1482,11 @@ def forward( else: loss_val = self.loss(y_hat, future_values) + encoder_states = model_output.hidden_states + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) return PatchTSTForPredictionOutput( - loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states + loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states ) def generate( @@ -1620,10 +1645,13 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ) -> PatchTSTForForecastingOutput: + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTForForecastingOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # get model output model_output = self.model( past_values, 
past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states @@ -1648,12 +1676,18 @@ def forward( y_hat = y_hat * model_output.scale + model_output.loc loss_val = self.loss(y_hat, future_values) + encoder_states = model_output.hidden_states + loc = model_output.loc + scale = model_output.scale + + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states, loc, scale] if v is not None) return PatchTSTForForecastingOutput( loss=loss_val, forecast_outputs=y_hat, - hidden_states=model_output.hidden_states, - loc=model_output.loc, - scale=model_output.scale, + hidden_states=encoder_states, + loc=loc, + scale=scale, ) def generate( @@ -1759,14 +1793,21 @@ def forward( past_observed_mask: Optional[bool] = None, labels: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, - ): + return_dict: Optional[bool] = None, + ) -> Union[Tuple, PatchTSTOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + model_output = self.model(past_values, output_hidden_states=output_hidden_states) y_hat = self.head(model_output.last_hidden_state) loss_val = None if labels is not None: loss_val = self.loss(y_hat, labels) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=model_output.hidden_states) + + encoder_states = model_output.hidden_states + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) + return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index a7c6c6e2186b19..20ef536a9cb796 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -248,17 +248,13 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # - # # Ignore since we have no tokens embeddings + # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass - def test_model_outputs_equivalence(self): - pass - - def test_determinism(self): - pass + # def test_model_outputs_equivalence(self): + # pass def test_model_main_input_name(self): model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) From eb7f547f3c8a6763c73548e760a91e80c723d771 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:43:24 +0200 Subject: [PATCH 071/189] formatting --- src/transformers/models/patchtst/modeling_patchtst.py | 10 +++------- tests/models/patchtst/test_modeling_patchtst.py | 1 - 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ffac3bcf4db227..6f50c32d272070 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1342,9 +1342,7 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForClassificationOutput( - loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states - ) + return PatchTSTForClassificationOutput(loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states) class 
ClassificationHead(nn.Module): @@ -1485,9 +1483,7 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForPredictionOutput( - loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states - ) + return PatchTSTForPredictionOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) def generate( self, @@ -1809,5 +1805,5 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 20ef536a9cb796..d9756b3c55b76a 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -248,7 +248,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # Ignore since we have no tokens embeddings def test_resize_tokens_embeddings(self): pass From a1cf42cbab4b95e7b5ce7690e76d6aa94ecececb Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 12:50:48 +0200 Subject: [PATCH 072/189] cleanup --- tests/models/patchtst/test_modeling_patchtst.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index d9756b3c55b76a..7f453f707948ef 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -248,13 +248,10 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # Ignore since we have no tokens embeddings + @unittest.skip(reason="we have no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # def test_model_outputs_equivalence(self): - # pass - def test_model_main_input_name(self): model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) # The main input is the name of the argument after `self` From 6392f999610bbe8cc8aa9e8a9cac606e98463d0f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 13:02:22 +0200 Subject: [PATCH 073/189] formatting --- .../models/patchtst/configuration_patchtst.py | 6 ++++-- .../models/patchtst/modeling_patchtst.py | 12 ++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 71aef0b436bbd3..2ba1c808358cf7 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -45,10 +45,12 @@ class PatchTSTConfig(PretrainedConfig): context_length (`int`, defaults to 32): The context length for the encoder. distribution_output (`string`, *optional*, defaults to `"student_t"`): - The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or "negative_binomial". + The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or + "negative_binomial". 
loss (`string`, *optional*, defaults to `"mse"`): The loss function for the model corresponding to the `distribution_output` head. For parametric - distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". + distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared + error "mse". patch_length (`int`, *optional*, defaults to 1): Define the patch length of the patchification process. Default to 1. stride (`int`, *optional*, defaults to 1): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6f50c32d272070..ef0eefb9e1b300 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1505,9 +1505,9 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for - multivariate predictions. + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, + number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, + num_input_channels)` for multivariate predictions. """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -1706,9 +1706,9 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: - [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for - multivariate predictions. + [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number + of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, + num_input_channels)` for multivariate predictions. """ # get number of samples num_parallel_samples = self.config.num_parallel_samples From 8a91544f1ae6de73a368c6b7eeb9e37f75fd2797 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 16:01:48 +0200 Subject: [PATCH 074/189] use ACT2CLS --- src/transformers/models/patchtst/modeling_patchtst.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ef0eefb9e1b300..4a5608cd02fd01 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -23,6 +23,7 @@ import torch from torch import nn +from ...activations import ACT2CLS from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput @@ -195,14 +196,6 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value -def get_activation_fn(activation): - if callable(activation): - return activation() - elif activation.lower() == "relu": - return nn.ReLU() - elif activation.lower() == "gelu": - return nn.GELU() - raise ValueError(f'{activation} is not available. 
You can use "relu", "gelu", or a callable') class Transpose(nn.Module): @@ -562,7 +555,7 @@ def __init__(self, config: PatchTSTConfig): # Position-wise Feed-Forward self.ff = nn.Sequential( nn.Linear(config.d_model, config.encoder_ffn_dim, bias=config.bias), - get_activation_fn(config.activation_function), + ACT2CLS[config.activation_function](), nn.Dropout(config.ff_dropout) if config.ff_dropout > 0 else nn.Identity(), nn.Linear(config.encoder_ffn_dim, config.d_model, bias=config.bias), ) From dfbea052770425c6cdd59ec0e85b10ea424e9eac Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 16:15:07 +0200 Subject: [PATCH 075/189] formatting --- src/transformers/models/patchtst/modeling_patchtst.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4a5608cd02fd01..9b438601506a2d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -196,8 +196,6 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value - - class Transpose(nn.Module): def __init__(self, *dims, contiguous=False): super().__init__() From 1a0c55ee945855a4fb493f08bd7bfd72b6c3f311 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 16:33:07 +0200 Subject: [PATCH 076/189] fix add_start_docstrings decorator --- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9b438601506a2d..b49feb2ed23326 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -790,10 +790,6 @@ def forward( @dataclass -@add_start_docstrings( - "The bare PatchTST Model outputting raw hidden-states without any specific head.", - PATCHTST_START_DOCSTRING, -) class PatchTSTModelOutputWithNoAttention(ModelOutput): """ Base class for model's outputs, with potential hidden states. @@ -1152,6 +1148,10 @@ def forward( return data, loc, scale +@add_start_docstrings( + "The bare PatchTST Model outputting raw hidden-states without any specific head.", + PATCHTST_START_DOCSTRING, +) class PatchTSTModel(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) From 0a4e58bedeb296c5df05a9a7b913b9e541fecb58 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 5 Oct 2023 13:55:08 -0400 Subject: [PATCH 077/189] add distribution head and generate function to regression task add distribution head and generate function to regression task. Also made add PatchTSTForForecastingOutput, PatchTSTForRegressionOutput. --- .../models/patchtst/modeling_patchtst.py | 228 ++++++++++++++---- 1 file changed, 177 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b49feb2ed23326..a3a5cf2b57993c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -822,7 +822,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): @dataclass class PatchTSTForMaskPretrainingOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForMaskPretraining`]. 
Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): @@ -878,9 +878,9 @@ class PatchTSTForPredictionOutput(ModelOutput): @dataclass -class PatchTSTOutput(ModelOutput): +class PatchTSTForRegressionOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForRegression`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): @@ -907,16 +907,17 @@ class PatchTSTOutput(ModelOutput): @dataclass -class PatchTSTForClassificationOutput(ModelOutput): +class PatchTSTForForecastingOutput(ModelOutput): """ - Output type of [`PatchTSTForClassification`]. + Output type of [`PatchTSTForForecasting`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + MSE loss. + + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Forecasting outputs of the time series modeling heads. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -931,37 +932,25 @@ class PatchTSTForClassificationOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None + forecast_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None + loc: torch.FloatTensor = None + scale: torch.FloatTensor = None -@dataclass -class SamplePatchTSTPredictionOutput(ModelOutput): - """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. - - Parameters: - sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): - Sampled values from the chosen distribution. - """ - - sequences: torch.FloatTensor = None - @dataclass -class PatchTSTForForecastingOutput(ModelOutput): +class PatchTSTForClassificationOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForClassification`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - - forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Forecasting outputs of the time series modeling heads. - + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
@@ -976,11 +965,23 @@ class PatchTSTForForecastingOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - forecast_outputs: torch.FloatTensor = None + prediction_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - loc: torch.FloatTensor = None - scale: torch.FloatTensor = None + + +@dataclass +class SamplePatchTSTPredictionOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Parameters: + sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): + Sampled values from the chosen distribution. + """ + + sequences: torch.FloatTensor = None @dataclass @@ -990,14 +991,31 @@ class SamplePatchTSTForecastOutput(ModelOutput): distribution. Parameters: +<<<<<<< Updated upstream sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, number_channels)`): +======= + sequences `(batch_size, num_samples, prediction_length, number_channels)`): +>>>>>>> Stashed changes Sampled values from the chosen distribution. """ sequences: torch.FloatTensor = None +@dataclass +class SamplePatchTSTRegressionOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Parameters: + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_output_channels)` + Sampled values from the chosen distribution. + """ + sequences: torch.FloatTensor = None + + # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor: """ @@ -1375,13 +1393,13 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) + self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() if distribution_output is None: self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) else: self.projection = distribution_output.get_parameter_projection(head_dim) - - self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() + def forward(self, x: torch.Tensor): """ @@ -1454,12 +1472,21 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output +<<<<<<< Updated upstream model_output = self.model( past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states ) # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape y_hat = self.head(model_output.last_hidden_state) +======= + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states) + + # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape + y_hat = self.head(model_output.last_hidden_state) +>>>>>>> Stashed changes loss_val = None if future_values is not None: @@ -1496,9 +1523,14 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
Return: +<<<<<<< Updated upstream [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. +======= + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of + samples, prediction_length, num_output_channels)` +>>>>>>> Stashed changes """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -1726,38 +1758,43 @@ def generate( class RegressionHead(nn.Module): - def __init__(self, config: PatchTSTConfig): + def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.y_range = config.prediction_range self.use_cls_token = config.use_cls_token self.pooling = config.pooling - # self.is_flatten = is_flatten + self.distribution_output = distribution_output + + head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - head_dim = config.num_input_channels * config.d_model - # if is_flatten: head_dim *= num_patch - self.linear = nn.Linear(head_dim, config.num_output_channels) + + if distribution_output is None: + self.projection = nn.Linear(head_dim, config.num_output_channels) + else: + self.projection = distribution_output.get_parameter_projection(head_dim) - def forward(self, past_values): + def forward(self, x): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token output: [bs x output_dim] - """ + """ if self.use_cls_token: - past_values = past_values[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] elif self.pooling == "mean": - past_values = past_values.mean(dim=2) # x: [bs x nvars x d_model] + x = x.mean(dim=2) # x: [bs x nvars x d_model] elif self.pooling == "max": - past_values = past_values.max(dim=2) # x: [bs x nvars x d_model] + x = x.max(dim=2) # x: [bs x nvars x d_model] else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input - past_values = self.flatten(past_values) # x: bs x nvars * d_model - y = self.linear(self.dropout(past_values)) # y: bs x output_dim - - if self.y_range: + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection + y = self.projection(x) # y: bs x output_dim or a tuple of this shape for distribution head + # + if (self.distribution_output is None) & self.y_range: # linear head y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] return y @@ -1768,13 +1805,29 @@ class PatchTSTForRegression(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) - self.head = RegressionHead(config) - self.loss = nn.MSELoss(reduction="mean") + + self.model = PatchTSTModel(config) + if config.loss == "mse": + self.loss = nn.MSELoss(reduction="mean") + self.distribution_output = None + else: + self.loss = nll + if config.distribution_output == "student_t": + self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) + elif config.distribution_output == "normal": + self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) + elif config.distribution_output == 
"negative_binomial": + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) + else: + raise ValueError(f"Unknown distribution output {config.distribution_output}") + + self.head = RegressionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() def forward( +<<<<<<< Updated upstream self, past_values: torch.Tensor, past_observed_mask: Optional[bool] = None, @@ -1798,3 +1851,76 @@ def forward( if not return_dict: return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) +======= + self, past_values: torch.Tensor, + labels: Optional[torch.Tensor], + past_observed_mask: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None + ) -> PatchTSTForRegressionOutput: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states) + # get output head. y_hat is of shape [bs x num_output_channels] or tuple of this shape + y_hat = self.head(model_output.last_hidden_state) + + loss_val = None + if labels is not None: + if self.distribution_output: + distribution = self.distribution_output.distribution(y_hat) + loss_val = self.loss(distribution, labels) + # take average of the loss + loss_val = weighted_average(loss_val) + else: + loss_val = self.loss(y_hat, labels) + + return PatchTSTForRegressionOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) + + + def generate(self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTPredictionOutput: + """ + Generate sequences of sample predictions from a model with a probability distribution head. + + Parameters: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Past values of the time series that serves as context in order to predict the future. + + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + Return: + [`SamplePatchTSTRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of + samples, num_output_channels)`. 
+ """ + # get number of samples + num_parallel_samples = self.config.num_parallel_samples + + # get model output + outputs = self(past_values=past_values, + labels=None, + past_observed_mask=past_observed_mask, + output_hidden_states=None + ) + + # get distribution + distribution = self.distribution_output.distribution( + outputs.prediction_output + ) + # get samples + samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x num_output_channels] + # stack tensors + samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] + return SamplePatchTSTRegressionOutput(sequences=samples) +>>>>>>> Stashed changes From 72a6e1e5b29205ab6094c33aedb83e5fcb38079f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 5 Oct 2023 14:12:30 -0400 Subject: [PATCH 078/189] add distribution head and generate function to regression task add distribution head and generate function to regression task. Also made add PatchTSTForForecastingOutput, PatchTSTForRegressionOutput. --- .../models/patchtst/modeling_patchtst.py | 68 +++++-------------- 1 file changed, 16 insertions(+), 52 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a3a5cf2b57993c..b9dc656ef8d827 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1335,13 +1335,15 @@ def forward( labels: torch.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> PatchTSTForClassificationOutput: + ) -> Union[tuple, PatchTSTForClassificationOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - model_output = self.model(past_values, output_hidden_states=output_hidden_states) + model_output = self.model(past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states) y_hat = self.head(model_output[0]) loss_val = None @@ -1472,21 +1474,12 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output -<<<<<<< Updated upstream - model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states - ) - - # get output head. y_hat is of shape [bs x pred_len x num_output_channels] of tuple of this shape - y_hat = self.head(model_output.last_hidden_state) -======= model_output = self.model(past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states) # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) ->>>>>>> Stashed changes loss_val = None if future_values is not None: @@ -1523,14 +1516,8 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: -<<<<<<< Updated upstream - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, - number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, - num_input_channels)` for multivariate predictions. 
-======= [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, num_output_channels)` ->>>>>>> Stashed changes """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -1547,7 +1534,7 @@ def generate( distribution = self.distribution_output.distribution(outputs.prediction_output) # get samples samples = [ - distribution.sample() for i in range(num_parallel_samples) + distribution.sample() for _ in range(num_parallel_samples) ] # samples: list of [bs x pred_len x num_output_channels] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] @@ -1750,7 +1737,7 @@ def generate( ) # get samples samples = [ - distribution.sample() for i in range(num_parallel_samples) + distribution.sample() for _ in range(num_parallel_samples) ] # samples: list of [bs x forecast_len x nvars] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] @@ -1827,36 +1814,13 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( -<<<<<<< Updated upstream self, past_values: torch.Tensor, - past_observed_mask: Optional[bool] = None, - labels: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, PatchTSTOutput]: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - model_output = self.model(past_values, output_hidden_states=output_hidden_states) - y_hat = self.head(model_output.last_hidden_state) - - loss_val = None - if labels is not None: - loss_val = self.loss(y_hat, labels) - - encoder_states = model_output.hidden_states - if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) -======= - self, past_values: torch.Tensor, labels: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None - ) -> PatchTSTForRegressionOutput: + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, PatchTSTForRegressionOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1876,16 +1840,17 @@ def forward( else: loss_val = self.loss(y_hat, labels) + if not return_dict: + return tuple(v for v in [loss_val, y_hat, encoder_states, loc, scale] if v is not None) return PatchTSTForRegressionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - + prediction_output=y_hat, + hidden_states=model_output.hidden_states + ) def generate(self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTPredictionOutput: + ) -> SamplePatchTSTRegressionOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. 
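# --- Editor's illustrative sketch (not part of this patch) -------------------
# The hunk above wires PatchTSTForRegression.generate() to a probability
# distribution head. The minimal, self-contained snippet below only mirrors the
# sampling pattern it uses (draw num_parallel_samples, then stack on a new
# dim=1); the StudentT parameters and tensor sizes are made-up values for
# illustration and are not taken from the model or its config.
import torch
from torch.distributions import StudentT

num_parallel_samples, batch_size, num_output_channels = 100, 8, 2
distribution = StudentT(
    df=torch.full((batch_size, num_output_channels), 3.0),
    loc=torch.zeros(batch_size, num_output_channels),
    scale=torch.ones(batch_size, num_output_channels),
)
samples = [distribution.sample() for _ in range(num_parallel_samples)]  # each: [bs x num_output_channels]
samples = torch.stack(samples, dim=1)  # [bs x num_samples x num_output_channels]
# samples.shape -> torch.Size([8, 100, 2]), the layout returned as
# SamplePatchTSTRegressionOutput.sequences in the diff above.
# -----------------------------------------------------------------------------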
@@ -1919,8 +1884,7 @@ def generate(self, outputs.prediction_output ) # get samples - samples = [distribution.sample() for i in range(num_parallel_samples)] # samples: list of [bs x num_output_channels] + samples = [distribution.sample() for _ in range(num_parallel_samples)] # samples: list of [bs x num_output_channels] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] return SamplePatchTSTRegressionOutput(sequences=samples) ->>>>>>> Stashed changes From 9908c6a10b4ca4a469bf5234fe89c39dd41125a6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 5 Oct 2023 20:32:31 +0200 Subject: [PATCH 079/189] fix typos --- .../models/patchtst/modeling_patchtst.py | 141 +++++++++--------- 1 file changed, 70 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b9dc656ef8d827..3f5102c0183160 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -939,7 +939,6 @@ class PatchTSTForForecastingOutput(ModelOutput): scale: torch.FloatTensor = None - @dataclass class PatchTSTForClassificationOutput(ModelOutput): """ @@ -987,17 +986,17 @@ class SamplePatchTSTPredictionOutput(ModelOutput): @dataclass class SamplePatchTSTForecastOutput(ModelOutput): """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. - Parameters: -<<<<<<< Updated upstream - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or - `(batch_size, num_samples, prediction_length, number_channels)`): -======= - sequences `(batch_size, num_samples, prediction_length, number_channels)`): ->>>>>>> Stashed changes - Sampled values from the chosen distribution. + Parameters: + <<<<<<< Updated upstream + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, + num_samples, prediction_length, number_channels)`): + ======= + sequences `(batch_size, num_samples, prediction_length, number_channels)`): + >>>>>>> Stashed changes + Sampled values from the chosen distribution. """ sequences: torch.FloatTensor = None @@ -1010,10 +1009,11 @@ class SamplePatchTSTRegressionOutput(ModelOutput): distribution. Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_output_channels)` + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_output_channels)` Sampled values from the chosen distribution. 
""" - sequences: torch.FloatTensor = None + + sequences: torch.FloatTensor = None # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll @@ -1341,9 +1341,9 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) y_hat = self.head(model_output[0]) loss_val = None @@ -1401,7 +1401,6 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) else: self.projection = distribution_output.get_parameter_projection(head_dim) - def forward(self, x: torch.Tensor): """ @@ -1474,12 +1473,12 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states) - - # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape - y_hat = self.head(model_output.last_hidden_state) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) + + # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape + y_hat = self.head(model_output.last_hidden_state) loss_val = None if future_values is not None: @@ -1516,8 +1515,8 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, prediction_length, num_output_channels)` + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, + number of samples, prediction_length, num_output_channels)` """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -1751,23 +1750,23 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.use_cls_token = config.use_cls_token self.pooling = config.pooling self.distribution_output = distribution_output - - head_dim = config.num_input_channels * config.d_model + + head_dim = config.num_input_channels * config.d_model self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - + if distribution_output is None: self.projection = nn.Linear(head_dim, config.num_output_channels) - else: - self.projection = distribution_output.get_parameter_projection(head_dim) + else: + self.projection = distribution_output.get_parameter_projection(head_dim) def forward(self, x): """ x: [bs x nvars x num_patch x d_model] or [bs x nvars x (num_patch+1) x d_model] if use cls_token output: [bs x output_dim] - """ + """ if self.use_cls_token: x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] elif self.pooling == "mean": @@ -1777,11 +1776,11 @@ def forward(self, x): else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) - # projection + x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + # projection y = self.projection(x) # y: bs 
x output_dim or a tuple of this shape for distribution head - # - if (self.distribution_output is None) & self.y_range: # linear head + # + if (self.distribution_output is None) & self.y_range: # linear head y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] return y @@ -1804,7 +1803,9 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = NegativeBinomialOutput( + dim=config.prediction_length * config.num_output_channels + ) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1816,7 +1817,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - labels: Optional[torch.Tensor], + labels: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -1824,38 +1825,38 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - model_output = self.model(past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states) - # get output head. y_hat is of shape [bs x num_output_channels] or tuple of this shape + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) + # get output head. y_hat is of shape [bs x num_output_channels] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) loss_val = None - if labels is not None: - if self.distribution_output: + if labels is not None: + if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) loss_val = self.loss(distribution, labels) # take average of the loss loss_val = weighted_average(loss_val) - else: + else: loss_val = self.loss(y_hat, labels) + encoder_states = model_output.hidden_states + if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states, loc, scale] if v is not None) - return PatchTSTForRegressionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states - ) - - def generate(self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTRegressionOutput: + return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) + return PatchTSTForRegressionOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) + + def generate( + self, + past_values: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + ) -> SamplePatchTSTRegressionOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. Parameters: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): Past values of the time series that serves as context in order to predict the future. 
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): @@ -1864,27 +1865,25 @@ def generate(self, - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - + Return: - [`SamplePatchTSTRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of - samples, num_output_channels)`. + [`SamplePatchTSTRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, + number of samples, num_output_channels)`. """ - # get number of samples + # get number of samples num_parallel_samples = self.config.num_parallel_samples - # get model output - outputs = self(past_values=past_values, - labels=None, - past_observed_mask=past_observed_mask, - output_hidden_states=None - ) - + # get model output + outputs = self( + past_values=past_values, labels=None, past_observed_mask=past_observed_mask, output_hidden_states=None + ) + # get distribution - distribution = self.distribution_output.distribution( - outputs.prediction_output - ) + distribution = self.distribution_output.distribution(outputs.prediction_output) # get samples - samples = [distribution.sample() for _ in range(num_parallel_samples)] # samples: list of [bs x num_output_channels] + samples = [ + distribution.sample() for _ in range(num_parallel_samples) + ] # samples: list of [bs x num_output_channels] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] + samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] return SamplePatchTSTRegressionOutput(sequences=samples) From 91a4c46eb03bdc2af6d41ee395bffdfcbfac71e0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 15:31:21 +0200 Subject: [PATCH 080/189] add forecast_masking --- .../models/patchtst/modeling_patchtst.py | 76 ++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3f5102c0183160..cb9360499ed863 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -300,6 +300,73 @@ def random_masking( return xb_mask, mask[..., 0] +def forecast_masking( + xb: torch.Tensor, + patch_lengths: list, + mix_ratio: list = None, + unmasked_channel_indices: list = None, + mask_value: int = 0, +): + """forecast_masking Mask last K patches where K is from the patch_lengths list. + For every batch, distribute the patch lengths based on mix_ratio Ignore masks for column indices mentioned in + cv_channel_indices + + Args: + xb (Tensor): + Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] + patch_lengths (list): List of patch lengths to mask in the end of the data. + mix_ratio (list, optional): List of weights to use for each patch length. For Ex. + if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to + None. + unmasked_channel_indices (list, optional): + Control Variable channel indices. These channels will not be masked. Defaults to None. + mask_value (int, optional): Value to use for masking. Defaults to 0. 
+ + Returns: + Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x + tsg2 x c x n] + """ + if mix_ratio is None: + mix_ratio = [1 for t in patch_lengths] + + bs, nvars, L, D = xb.shape + mask = torch.zeros(bs, nvars, L, device=xb.device) + + t_list = [] + total_length = 0 + total_ratio = sum(mix_ratio) + + for i, j in zip(patch_lengths, mix_ratio): + if i <= 0 or i >= L: + raise Exception("masked_patch_len should be greater than 0 and less than total patches.") + temp_len = int(bs * j / total_ratio) + t_list.append([i, j, temp_len]) + total_length += temp_len + + t_list = sorted(t_list, key=lambda x: x[2]) + + if total_length < bs: + t_list[0][2] = t_list[0][2] + (bs - total_length) + elif total_length > bs: + t_list[-1][2] = t_list[-1][2] + (total_length - bs) + + b1 = 0 + for p, r, l in t_list: + b2 = b1 + l + mask[b1:b2, :, -p:] = 1 + b1 = b2 + + perm = torch.randperm(mask.shape[0]) + mask = mask[perm] + + mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + if unmasked_channel_indices is not None: + mask[:, unmasked_channel_indices, :, :] = 0 + + xb_mask = xb.masked_fill(mask.bool(), mask_value) + return xb_mask, mask[..., 0] + + def compute_num_patches(sequence_length, patch_length, stride): return (max(sequence_length, patch_length) - patch_length) // stride + 1 @@ -490,7 +557,14 @@ def forward(self, x: torch.Tensor): mask_value=self.mask_value, seed_number=self.seed_number, ) - + elif self.mask_type == "forecast": + x_mask, mask = forecast_masking( + xb=x, + patch_lengths=self.mask_patches, + mix_ratio=self.mask_patch_ratios, + unmasked_channel_indices=self.unmasked_channel_indices, + mask_value=self.mask_value, + ) else: raise Exception("Invalid mask type") From 17c60a7aa69c5bf82d6ca842bba3b3b49e96bebb Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 16:36:48 +0200 Subject: [PATCH 081/189] fixed tests --- src/transformers/models/patchtst/modeling_patchtst.py | 8 +++++--- tests/models/patchtst/test_modeling_patchtst.py | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index cb9360499ed863..49c73ef20b639e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -707,7 +707,7 @@ class PatchTSTPreTrainedModel(PreTrainedModel): config_class = PatchTSTConfig base_model_prefix = "model" main_input_name = "past_values" - supports_gradient_checkpointing = True + supports_gradient_checkpointing = False def _init_weights(self, module): """Initialize weights""" @@ -1405,8 +1405,8 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - past_observed_mask: Optional[bool] = None, labels: torch.Tensor = None, + past_observed_mask: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: @@ -1854,7 +1854,7 @@ def forward(self, x): # projection y = self.projection(x) # y: bs x output_dim or a tuple of this shape for distribution head # - if (self.distribution_output is None) & self.y_range: # linear head + if (self.distribution_output is None) & (self.y_range is not None): # linear head y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] return y @@ -1899,6 +1899,8 @@ def forward( output_hidden_states = ( output_hidden_states if 
output_hidden_states is not None else self.config.output_hidden_states ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + model_output = self.model( past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states ) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 7f453f707948ef..69c8dad8e44fea 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -276,7 +276,9 @@ def test_forward_signature(self): MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING ): expected_arg_names.remove("future_values") + expected_arg_names.remove("past_observed_mask") expected_arg_names.append("labels") + expected_arg_names.append("past_observed_mask") expected_arg_names.extend( [ "output_hidden_states", From a61ac773cb5fd7127edba9de5c52dd9190a2b0f1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 18:12:48 +0200 Subject: [PATCH 082/189] use set_seed --- docs/source/en/index.md | 1 + .../models/patchtst/configuration_patchtst.py | 2 +- .../models/patchtst/modeling_patchtst.py | 19 ++++++++----------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index e20389a2ab49cb..fe841a1b43607a 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -209,6 +209,7 @@ Flax), PyTorch, and/or TensorFlow. | [OpenLlama](model_doc/open-llama) | ✅ | ❌ | ❌ | | [OPT](model_doc/opt) | ✅ | ✅ | ✅ | | [OWL-ViT](model_doc/owlvit) | ✅ | ❌ | ❌ | +| [PatchTST](model_doc/patchtst) | ✅ | ❌ | ❌ | | [Pegasus](model_doc/pegasus) | ✅ | ✅ | ✅ | | [PEGASUS-X](model_doc/pegasus_x) | ✅ | ❌ | ❌ | | [Perceiver](model_doc/perceiver) | ✅ | ❌ | ❌ | diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 2ba1c808358cf7..e6f140165f624b 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -178,7 +178,7 @@ def __init__( use_cls_token: bool = False, init_std: float = 0.02, shared_projection: bool = True, - seed_number: int = None, + seed_number: Optional[int] = None, scaling: Optional[Union[str, bool]] = "mean", # mask pretraining mask_input: Optional[bool] = None, diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 49c73ef20b639e..60ad66da70c7eb 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -15,11 +15,9 @@ """ PyTorch PatchTST model.""" import math -import random from dataclasses import dataclass from typing import Optional, Tuple, Union -import numpy as np import torch from torch import nn @@ -27,6 +25,7 @@ from ...modeling_outputs import BaseModelOutputWithNoAttention from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput +from ...trainer_utils import set_seed from ...utils import ModelOutput, add_start_docstrings, logging from .configuration_patchtst import PatchTSTConfig @@ -239,14 +238,6 @@ def positional_encoding(pe, learn_pe, q_len, d_model): return nn.Parameter(w_pos, requires_grad=learn_pe) -def set_seed(x=42): - random.seed(x) - np.random.seed(x) - torch.manual_seed(x) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(x) - - def random_masking( xb: 
torch.Tensor, mask_ratio: float, @@ -306,6 +297,7 @@ def forecast_masking( mix_ratio: list = None, unmasked_channel_indices: list = None, mask_value: int = 0, + seed_number: Optional[int] = None, ): """forecast_masking Mask last K patches where K is from the patch_lengths list. For every batch, distribute the patch lengths based on mix_ratio Ignore masks for column indices mentioned in @@ -321,11 +313,15 @@ def forecast_masking( unmasked_channel_indices (list, optional): Control Variable channel indices. These channels will not be masked. Defaults to None. mask_value (int, optional): Value to use for masking. Defaults to 0. + seed_number (int, optional): Value to set for the random seed. Returns: Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x tsg2 x c x n] """ + if seed_number: + set_seed(seed_number) + if mix_ratio is None: mix_ratio = [1 for t in patch_lengths] @@ -564,6 +560,7 @@ def forward(self, x: torch.Tensor): mix_ratio=self.mask_patch_ratios, unmasked_channel_indices=self.unmasked_channel_indices, mask_value=self.mask_value, + seed_number=self.seed_number, ) else: raise Exception("Invalid mask type") @@ -1900,7 +1897,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + model_output = self.model( past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states ) From de7fb9e3dfc14b83b6f8814df78fe7998f80b9a1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 18:38:02 +0200 Subject: [PATCH 083/189] fix doc test --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 60ad66da70c7eb..abeb092baa5b4d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1061,12 +1061,8 @@ class SamplePatchTSTForecastOutput(ModelOutput): distribution. Parameters: - <<<<<<< Updated upstream sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, number_channels)`): - ======= - sequences `(batch_size, num_samples, prediction_length, number_channels)`): - >>>>>>> Stashed changes Sampled values from the chosen distribution. """ From 0fd0ce701894cf2316e1e7bda8ed288d1e6dd732 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 6 Oct 2023 18:58:18 +0200 Subject: [PATCH 084/189] formatting --- .../models/patchtst/modeling_patchtst.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index abeb092baa5b4d..8b01612ae3b54d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1057,13 +1057,13 @@ class SamplePatchTSTPredictionOutput(ModelOutput): @dataclass class SamplePatchTSTForecastOutput(ModelOutput): """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. 
- Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, - num_samples, prediction_length, number_channels)`): - Sampled values from the chosen distribution. + Parameters: + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, + num_samples, prediction_length, number_channels)`): + Sampled values from the chosen distribution. """ sequences: torch.FloatTensor = None From cb52b6f87f150ac1520f2b12da5544cad43bc12d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sat, 7 Oct 2023 11:16:22 +0200 Subject: [PATCH 085/189] Update docs/source/en/model_doc/patchtst.md Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/model_doc/patchtst.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 504be80c3e6c9a..12542a8dc5206b 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -26,7 +26,7 @@ The abstract from the paper is the following: Tips: -The model also adds a time series classification pipeline and time series regression pipeline. +The model can also be used for time series classification and time series regression. See the respective [`PatchTSTForClassification`] and [`PatchTSTForRegression`] classes. This model was contributed by [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), and [kashif](https://huggingface.co/kashif). From 3daec960700190206e3dcae03d70a724dea6ad6e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Sun, 8 Oct 2023 13:31:53 +0200 Subject: [PATCH 086/189] better var names --- .../models/patchtst/modeling_patchtst.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 8b01612ae3b54d..a888a1a58ccbb4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -265,17 +265,17 @@ def random_masking( if seed_number: set_seed(seed_number) - bs, nvars, L, D = xb.shape + batch_size, nvars, seq_len, feat = xb.shape - len_keep = int(L * (1 - mask_ratio)) + len_keep = int(seq_len * (1 - mask_ratio)) if channel_consistent_masking: - noise = torch.rand(bs, 1, L, device=xb.device) # noise in [0, 1], bs x 1 x L - noise = noise.repeat(1, nvars, 1) # bs x nvars x L + noise = torch.rand(batch_size, 1, seq_len, device=xb.device) # noise in [0, 1], bs x 1 x L + noise = noise.repeat(1, nvars, 1) # bs x nvars x time else: - noise = torch.rand(bs, nvars, L, device=xb.device) # noise in [0, 1], bs x nvars x L + noise = torch.rand(batch_size, nvars, seq_len, device=xb.device) # noise in [0, 1], bs x nvars x L - mask = torch.ones(bs, nvars, L, device=xb.device) # mask: [bs x nvars x num_patch] + mask = torch.ones(batch_size, nvars, seq_len, device=xb.device) # mask: [bs x nvars x num_patch] mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -283,7 +283,7 @@ def random_masking( ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] mask = torch.gather(mask, dim=-1, index=ids_restore) - mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patches x patch_length] + mask = 
mask.unsqueeze(-1).repeat(1, 1, 1, feat) # mask: [bs x nvars x num_patches x patch_length] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -325,29 +325,29 @@ def forecast_masking( if mix_ratio is None: mix_ratio = [1 for t in patch_lengths] - bs, nvars, L, D = xb.shape - mask = torch.zeros(bs, nvars, L, device=xb.device) + batch_size, nvars, seq_len, feat = xb.shape + mask = torch.zeros(batch_size, nvars, seq_len, device=xb.device) t_list = [] total_length = 0 total_ratio = sum(mix_ratio) for i, j in zip(patch_lengths, mix_ratio): - if i <= 0 or i >= L: + if i <= 0 or i >= seq_len: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") - temp_len = int(bs * j / total_ratio) + temp_len = int(batch_size * j / total_ratio) t_list.append([i, j, temp_len]) total_length += temp_len t_list = sorted(t_list, key=lambda x: x[2]) - if total_length < bs: - t_list[0][2] = t_list[0][2] + (bs - total_length) - elif total_length > bs: - t_list[-1][2] = t_list[-1][2] + (total_length - bs) + if total_length < batch_size: + t_list[0][2] = t_list[0][2] + (batch_size - total_length) + elif total_length > batch_size: + t_list[-1][2] = t_list[-1][2] + (total_length - batch_size) b1 = 0 - for p, r, l in t_list: + for p, _, l in t_list: b2 = b1 + l mask[b1:b2, :, -p:] = 1 b1 = b2 @@ -355,7 +355,7 @@ def forecast_masking( perm = torch.randperm(mask.shape[0]) mask = mask[perm] - mask = mask.unsqueeze(-1).repeat(1, 1, 1, D) # mask: [bs x nvars x num_patch x patch_len] + mask = mask.unsqueeze(-1).repeat(1, 1, 1, feat) # mask: [bs x nvars x num_patch x patch_len] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 From c82022ded984a86e1c6b00415987eb763cc3d06d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 09:51:28 +0200 Subject: [PATCH 087/189] rename PatchTSTTranspose --- .../models/patchtst/modeling_patchtst.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a888a1a58ccbb4..d086276b03ef0f 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 TSFM team. All rights reserved. +# Copyright 2023 IBM & Hugging Face. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
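# --- Editor's illustrative sketch (not part of this patch) -------------------
# The following hunks rename Transpose to PatchTSTTranspose. This standalone
# snippet only demonstrates the pattern that wrapper enables inside the encoder:
# nn.BatchNorm1d normalizes over dim 1, so sequence-first activations of shape
# [batch, seq_len, d_model] are transposed around it, as in the "batch" norm
# branches further down in this diff. The class name, d_model and tensor sizes
# here are placeholders chosen for illustration only.
import torch
from torch import nn


class TransposeSketch(nn.Module):
    # minimal stand-in for the PatchTSTTranspose wrapper defined in this file
    def __init__(self, *dims):
        super().__init__()
        self.dims = dims

    def forward(self, inputs):
        return inputs.transpose(*self.dims)


d_model = 16
norm = nn.Sequential(TransposeSketch(1, 2), nn.BatchNorm1d(d_model), TransposeSketch(1, 2))
hidden = torch.randn(4, 10, d_model)  # [batch, seq_len, d_model]
print(norm(hidden).shape)  # torch.Size([4, 10, 16])
# -----------------------------------------------------------------------------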
@@ -195,34 +195,35 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value -class Transpose(nn.Module): +class PatchTSTTranspose(nn.Module): def __init__(self, *dims, contiguous=False): super().__init__() self.dims, self.contiguous = dims, contiguous - def forward(self, x): + def forward(self, inputs): if self.contiguous: - return x.transpose(*self.dims).contiguous() + return inputs.transpose(*self.dims).contiguous() else: - return x.transpose(*self.dims) + return inputs.transpose(*self.dims) -def positional_encoding(pe, learn_pe, q_len, d_model): +def positional_encoding(position_embedding_type, learned, q_len, d_model): # Positional encoding - if pe is None: - w_pos = torch.empty((q_len, d_model)) # pe = None and learn_pe = False can be used to measure impact of pe + if position_embedding_type is None: + # position_embedding_type = None and learned = False can be used to measure impact of positional encoding + w_pos = torch.empty((q_len, d_model)) nn.init.uniform_(w_pos, -0.02, 0.02) - learn_pe = False - elif pe == "zeros": + learned = False + elif position_embedding_type == "zeros": w_pos = torch.empty((q_len, d_model)) nn.init.uniform_(w_pos, -0.02, 0.02) - elif pe == "normal": + elif position_embedding_type == "normal": w_pos = torch.zeros((q_len, 1)) torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) - elif pe == "uniform": + elif position_embedding_type == "uniform": w_pos = torch.zeros((q_len, 1)) nn.init.uniform_(w_pos, a=0.0, b=0.1) - elif pe == "sincos": + elif position_embedding_type == "sincos": pos_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) @@ -233,9 +234,9 @@ def positional_encoding(pe, learn_pe, q_len, d_model): w_pos = pos_enc else: raise ValueError( - f"{pe} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." + f"{position_embedding_type} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." 
) - return nn.Parameter(w_pos, requires_grad=learn_pe) + return nn.Parameter(w_pos, requires_grad=learned) def random_masking( @@ -609,7 +610,9 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of the sublayer 1 self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer1 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + self.norm_sublayer1 = nn.Sequential( + PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) + ) else: self.norm_sublayer1 = nn.LayerNorm(config.d_model) @@ -617,7 +620,9 @@ def __init__(self, config: PatchTSTConfig): if self.channel_attention: self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + self.norm_sublayer2 = nn.Sequential( + PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) + ) else: self.norm_sublayer2 = nn.LayerNorm(config.d_model) @@ -632,7 +637,9 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of sublayer 3 self.dropout_path3 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer3 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.d_model), Transpose(1, 2)) + self.norm_sublayer3 = nn.Sequential( + PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) + ) else: self.norm_sublayer3 = nn.LayerNorm(config.d_model) From 687e3c84f927456a156af518485d288dcb117dfc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:11:27 +0200 Subject: [PATCH 088/189] fix argument names and docs string --- .../models/patchtst/modeling_patchtst.py | 81 ++++++++++--------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d086276b03ef0f..10393875750527 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -240,7 +240,7 @@ def positional_encoding(position_embedding_type, learned, q_len, d_model): def random_masking( - xb: torch.Tensor, + inputs: torch.Tensor, mask_ratio: float, unmasked_channel_indices: list = None, channel_consistent_masking: bool = False, @@ -249,34 +249,40 @@ def random_masking( ): """random_masking: Mask the input considering the control variables. - Parameters: - xb (Tensor): Input to mask [ bs x nvars x num_patches x patch_length] - mask_ratio (float): Mask ratio. - unmasked_channel_indices (list, optional): + Args: + inputs (`torch.Tensor` of shape `(batch_size, nvars, seq_len, feat)`): + The input tensor to mask. + mask_ratio (`float`): + Mask ratio. + unmasked_channel_indices (list, *optional*): indices of unmasked channels. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): + channel_consistent_masking (bool, *optional* defaults to False): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to True. - mask_value (int, optional): Value to use for masking. Defaults to 0. - seed_number (int, optional): Value to set for the random seed. + across channels. Defaults to False. 
+ mask_value (int, *optional* defaults to 0): + Value to use for masking. + seed_number (int, *optional*): + Value to set for the random seed. Returns: - Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] + `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x + n] """ if seed_number: set_seed(seed_number) - batch_size, nvars, seq_len, feat = xb.shape + batch_size, nvars, seq_len, feat = inputs.shape + device = inputs.device len_keep = int(seq_len * (1 - mask_ratio)) if channel_consistent_masking: - noise = torch.rand(batch_size, 1, seq_len, device=xb.device) # noise in [0, 1], bs x 1 x L + noise = torch.rand(batch_size, 1, seq_len, device=device) # noise in [0, 1], bs x 1 x L noise = noise.repeat(1, nvars, 1) # bs x nvars x time else: - noise = torch.rand(batch_size, nvars, seq_len, device=xb.device) # noise in [0, 1], bs x nvars x L + noise = torch.rand(batch_size, nvars, seq_len, device=device) # noise in [0, 1], bs x nvars x L - mask = torch.ones(batch_size, nvars, seq_len, device=xb.device) # mask: [bs x nvars x num_patch] + mask = torch.ones(batch_size, nvars, seq_len, device=device) # mask: [bs x nvars x num_patch] mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -288,37 +294,40 @@ def random_masking( if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 - xb_mask = xb.masked_fill(mask.bool(), mask_value) - return xb_mask, mask[..., 0] + inputs_mask = inputs.masked_fill(mask.bool(), mask_value) + return inputs_mask, mask[..., 0] def forecast_masking( - xb: torch.Tensor, + inputs: torch.Tensor, patch_lengths: list, mix_ratio: list = None, unmasked_channel_indices: list = None, mask_value: int = 0, seed_number: Optional[int] = None, ): - """forecast_masking Mask last K patches where K is from the patch_lengths list. - For every batch, distribute the patch lengths based on mix_ratio Ignore masks for column indices mentioned in - cv_channel_indices + """Forecast masking that masks the last K patches where K is from the patch_lengths list. + For every batch, distribute the patch lengths based on mix_ratio and ignore masks for column indices mentioned in + unmasked_channel_indices. Args: - xb (Tensor): + inputs (`torch.Tensor`): Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] - patch_lengths (list): List of patch lengths to mask in the end of the data. - mix_ratio (list, optional): List of weights to use for each patch length. For Ex. - if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to - None. - unmasked_channel_indices (list, optional): + patch_lengths (list): + List of patch lengths to mask in the end of the data. + mix_ratio (list, *optional* defaults to None): + List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + then equal weights to both patch lengths. Defaults to None. + unmasked_channel_indices (list, *optional* defaults to None): Control Variable channel indices. These channels will not be masked. Defaults to None. - mask_value (int, optional): Value to use for masking. Defaults to 0. - seed_number (int, optional): Value to set for the random seed. + mask_value (int, *optional* defaults to 0): + Value to use for masking. Defaults to 0. + seed_number (int, *optional*): + Value to set for the random seed. 
Returns: - Tensor: xb_mask, masked input, same shape as input Tensor: Mask tensor of shape [bs x c x n] or [bs x tsg1 x - tsg2 x c x n] + `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape [bs x c + x n] or [bs x tsg1 x tsg2 x c x n] """ if seed_number: set_seed(seed_number) @@ -326,8 +335,8 @@ def forecast_masking( if mix_ratio is None: mix_ratio = [1 for t in patch_lengths] - batch_size, nvars, seq_len, feat = xb.shape - mask = torch.zeros(batch_size, nvars, seq_len, device=xb.device) + batch_size, nvars, seq_len, feat = inputs.shape + mask = torch.zeros(batch_size, nvars, seq_len, device=inputs.device) t_list = [] total_length = 0 @@ -360,8 +369,8 @@ def forecast_masking( if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 - xb_mask = xb.masked_fill(mask.bool(), mask_value) - return xb_mask, mask[..., 0] + inputs_mask = inputs.masked_fill(mask.bool(), mask_value) + return inputs_mask, mask[..., 0] def compute_num_patches(sequence_length, patch_length, stride): @@ -547,7 +556,7 @@ def forward(self, x: torch.Tensor): if self.mask_type == "random": x_mask, mask = random_masking( - xb=x, + inputs=x, mask_ratio=self.mask_ratio, unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, @@ -556,7 +565,7 @@ def forward(self, x: torch.Tensor): ) elif self.mask_type == "forecast": x_mask, mask = forecast_masking( - xb=x, + inputs=x, patch_lengths=self.mask_patches, mix_ratio=self.mask_patch_ratios, unmasked_channel_indices=self.unmasked_channel_indices, From 5469748c5d2238ced75ef98c0a7998f1f78c80fe Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:26:21 +0200 Subject: [PATCH 089/189] remove compute_num_patches and unused class --- .../models/patchtst/modeling_patchtst.py | 75 +------------------ 1 file changed, 4 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 10393875750527..7aff6af096b962 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -373,11 +373,7 @@ def forecast_masking( return inputs_mask, mask[..., 0] -def compute_num_patches(sequence_length, patch_length, stride): - return (max(sequence_length, patch_length) - patch_length) // stride + 1 - - -class Patchify(nn.Module): +class PatchTSTPatchify(nn.Module): """ A class to patchify the time series sequence into different patches @@ -408,8 +404,8 @@ def __init__( self.stride = stride # get the number of patches - self.num_patches = compute_num_patches(sequence_length, patch_length, stride) - new_sequence_length = patch_length + stride * (self.num_patches - 1) + num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1 + new_sequence_length = patch_length + stride * (num_patches - 1) self.s_begin = sequence_length - new_sequence_length def forward(self, past_values: torch.Tensor): @@ -433,69 +429,6 @@ def forward(self, past_values: torch.Tensor): return x -class PatchEmbeddings(nn.Module): - """ - Parameters: - A class to patchify the time series sequence into different patches - sequence_length (int, required): input sequence length. patch_length (int, required): patch length. stride - (int, required): stride between patches. 
- - Returns: - embeddings: output tensor data [bs x num_input_channels x num_patches x embed_dim] - """ - - def __init__(self, sequence_length: int, patch_length: int, stride: int, embed_dim: int): - super().__init__() - - assert ( - sequence_length > patch_length - ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" - - # assert ((max(sequence_length, patch_length) - patch_length) % stride == 0), f"sequence length minus patch length has to be divisible to the stride" - - self.sequence_length = sequence_length - self.patch_length = patch_length - self.stride = stride - self.embed_dim = embed_dim - - # get the number of patches - self.num_patches = compute_num_patches(sequence_length, patch_length, stride) - new_sequence_length = patch_length + stride * (self.num_patches - 1) - self.s_begin = sequence_length - new_sequence_length - - # Embedding - self.projection = nn.Conv1d( - in_channels=1, - out_channels=embed_dim, - kernel_size=patch_length, - stride=stride, - ) - - def forward(self, past_values: torch.Tensor): - """ - Parameters: - past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] - Returns: - embeddings: output tensor data [bs x num_input_channels x num_patches x emb_dim] - """ - bs, sequence_length, num_input_channels = past_values.shape - assert ( - sequence_length == self.sequence_length - ), f"Input sequence length ({sequence_length}) doesn't match the configuration sequence length ({self.sequence_length})." - - x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] - # convert past_values to shape [bs*num_input_channels x 1 x sequence_length ] - x = x.transpose(1, 2).reshape(bs * num_input_channels, 1, -1).contiguous() - # projection - embeddings = self.projection(x) # embeddings: [bs*num_input_channels x emb_dim x num_patches] - # reshape - embeddings = ( - embeddings.transpose(1, 2).view(bs, num_input_channels, -1, self.embed_dim).contiguous() - ) # embeddings: [bs x num_input_channels x num_patches x emb_dim] - # embeddings = embeddings.flatten(2).transpose(1, 2) - return embeddings - - class PatchMasking(nn.Module): """ PatchMasking: Class to random or forcast masking. 
@@ -1264,7 +1197,7 @@ def __init__(self, config: PatchTSTConfig): else: self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) - self.patching = Patchify( + self.patching = PatchTSTPatchify( config.context_length, patch_length=config.patch_length, stride=config.stride, From d5c83591ba846812f64fe9d38f3c8315078d4d8d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:31:00 +0200 Subject: [PATCH 090/189] remove assert --- .../models/patchtst/modeling_patchtst.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7aff6af096b962..13d3acd9ed50a6 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -395,9 +395,10 @@ def __init__( ): super().__init__() - assert ( - sequence_length > patch_length - ), f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" + if sequence_length <= patch_length: + raise ValueError( + f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" + ) self.sequence_length = sequence_length self.patch_length = patch_length @@ -417,9 +418,10 @@ def forward(self, past_values: torch.Tensor): x: output tensor data [bs x num_input_channels x num_patches x patch_length] """ sequence_length = past_values.shape[-2] - assert ( - sequence_length == self.sequence_length - ), f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + if sequence_length != self.sequence_length: + raise ValueError( + f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." + ) x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] x = x.unfold( From a25d433103a33d0f01b64e7486cb0a52f333d672 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:35:19 +0200 Subject: [PATCH 091/189] renamed to PatchTSTMasking --- src/transformers/models/patchtst/modeling_patchtst.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 13d3acd9ed50a6..7c10dac7069b53 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -431,11 +431,11 @@ def forward(self, past_values: torch.Tensor): return x -class PatchMasking(nn.Module): +class PatchTSTMasking(nn.Module): """ - PatchMasking: Class to random or forcast masking. + PatchTSTMasking: Class for random or forcast masking on inputs. - Parameters: + Args: mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. mask_ratio (float, optional): Mask ratio. mask_patches (list, optional): List of patch lengths to mask in the end of the data. @@ -448,6 +448,9 @@ class PatchMasking(nn.Module): across channels. Defaults to True. mask_value (int, optional): Value to use for masking. Defaults to 0. seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. 
+ + Returns: + """ def __init__( @@ -1207,7 +1210,7 @@ def __init__(self, config: PatchTSTConfig): self.mask_input = config.mask_input if self.mask_input: - self.masking = PatchMasking( + self.masking = PatchTSTMasking( mask_type=config.mask_type, mask_ratio=config.mask_ratio, mask_patches=config.mask_patches, From db96ed830fb9986d77d840bccb2fa14121cc6166 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:37:26 +0200 Subject: [PATCH 092/189] use num_labels for classification --- .../models/patchtst/configuration_patchtst.py | 2 +- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- tests/models/patchtst/test_modeling_patchtst.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e6f140165f624b..337caf70cd0895 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -253,7 +253,7 @@ def __init__( self.shared_projection = shared_projection # Classification - self.num_classes = num_classes + self.num_labels = num_labels # Forcasting and prediction self.prediction_length = prediction_length diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7c10dac7069b53..b54f50346bb79c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1384,7 +1384,7 @@ def __init__(self, config: PatchTSTConfig): self.pooling = config.pooling self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_classes) + self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_labels) def forward(self, x: torch.Tensor): """ diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 69c8dad8e44fea..c669ef6f44959e 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -71,7 +71,7 @@ def __init__( lags_sequence=[1, 2, 3, 4, 5], distil=False, seed_number=42, - num_classes=2, + num_labels=2, num_output_channels=2, ): self.parent = parent @@ -93,7 +93,7 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.seed_number = seed_number - self.num_classes = num_classes + self.num_labels = num_labels self.num_output_channels = num_output_channels self.distil = distil self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 @@ -113,7 +113,7 @@ def get_config(self): context_length=self.context_length, activation_function=self.hidden_act, seed_number=self.seed_number, - num_classes=self.num_classes, + num_labels=self.num_labels, num_output_channels=self.num_output_channels, ) @@ -191,7 +191,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): # if classification model: if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): rng = random.Random(self.model_tester.seed_number) - labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_classes, rng=rng) + labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_labels, rng=rng) inputs_dict["labels"] = labels inputs_dict.pop("future_values") 
elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): From b6d3b4e526e80c6f00d1c6a14718f880c4ddd8a4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 10:38:28 +0200 Subject: [PATCH 093/189] use num_labels --- src/transformers/models/patchtst/configuration_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 337caf70cd0895..07524457d29678 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -112,7 +112,7 @@ class PatchTSTConfig(PretrainedConfig): Mask value to set. pooling (`str`, *optional*, defaults to `"mean"`): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. - num_classes (`int`, *optional*, defaults to 1): + num_labels (`int`, *optional*, defaults to 1): Number of classes is defined for classification task. head_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for head. @@ -191,7 +191,7 @@ def __init__( mask_value=0, # head pooling: str = "mean", - num_classes: int = 1, + num_labels: int = 1, head_dropout: float = 0.0, prediction_length: int = 24, num_output_channels: int = 1, From ca648ef01b7beb3dd69e466f5115807cbf0d0df5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 11:15:34 +0200 Subject: [PATCH 094/189] use default num_labels from super class --- src/transformers/models/patchtst/configuration_patchtst.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 07524457d29678..f3764b9e41e3c3 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -112,8 +112,6 @@ class PatchTSTConfig(PretrainedConfig): Mask value to set. pooling (`str`, *optional*, defaults to `"mean"`): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. - num_labels (`int`, *optional*, defaults to 1): - Number of classes is defined for classification task. head_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for head. 
prediction_length (`int`): @@ -191,7 +189,6 @@ def __init__( mask_value=0, # head pooling: str = "mean", - num_labels: int = 1, head_dropout: float = 0.0, prediction_length: int = 24, num_output_channels: int = 1, @@ -252,9 +249,6 @@ def __init__( # Forecast head self.shared_projection = shared_projection - # Classification - self.num_labels = num_labels - # Forcasting and prediction self.prediction_length = prediction_length self.num_parallel_samples = num_parallel_samples From e56de1134de39c18a9505ea3679ac563f4fe6ba5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 11:21:06 +0200 Subject: [PATCH 095/189] move model_type after docstring --- src/transformers/models/patchtst/configuration_patchtst.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index f3764b9e41e3c3..d2144cd65eca44 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -29,7 +29,6 @@ class PatchTSTConfig(PretrainedConfig): - model_type = "patchtst" r""" This is the configuration class to store the configuration of an [`PatchTSTModel`]. It is used to instantiate an PatchTST model according to the specified arguments, defining the model architecture. @@ -139,6 +138,8 @@ class PatchTSTConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + + model_type = "patchtst" attribute_map = { "hidden_size": "d_model", "num_attention_heads": "encoder_attention_heads", From fcfa103b5211a2d8005e6f2e8c6f7824b8e34fc5 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 11:29:11 +0200 Subject: [PATCH 096/189] renamed PatchTSTForMaskPretraining --- docs/source/en/model_doc/patchtst.md | 4 +-- src/transformers/__init__.py | 4 +-- src/transformers/models/patchtst/__init__.py | 4 +-- .../models/patchtst/configuration_patchtst.py | 33 ++++++++++--------- .../models/patchtst/modeling_patchtst.py | 10 +++--- src/transformers/utils/dummy_pt_objects.py | 2 +- .../models/patchtst/test_modeling_patchtst.py | 8 ++--- utils/check_repo.py | 2 +- 8 files changed, 34 insertions(+), 33 deletions(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 12542a8dc5206b..88094385c1500d 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -62,9 +62,9 @@ The original code can be found [here](https://github.com/yuqinie98/PatchTST). 
- forward -## PatchTSTForMaskPretraining +## PatchTSTForPretraining -[[autodoc]] PatchTSTForMaskPretraining +[[autodoc]] PatchTSTForPretraining - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index de83c9c45c978f..d94188d42f5e8a 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2439,7 +2439,7 @@ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTForClassification", "PatchTSTForForecasting", - "PatchTSTForMaskPretraining", + "PatchTSTForPretraining", "PatchTSTForPrediction", "PatchTSTForRegression", "PatchTSTModel", @@ -6269,7 +6269,7 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForMaskPretraining, + PatchTSTForPretraining, PatchTSTForPrediction, PatchTSTForRegression, PatchTSTModel, diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 8979bed2341ab2..8ca9b1f88eb8c4 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -36,7 +36,7 @@ "PatchTSTPreTrainedModel", "PatchTSTForPrediction", "PatchTSTForForecasting", - "PatchTSTForMaskPretraining", + "PatchTSTForPretraining", "PatchTSTForRegression", "PatchTSTForClassification", ] @@ -55,7 +55,7 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForMaskPretraining, + PatchTSTForPretraining, PatchTSTForPrediction, PatchTSTForRegression, PatchTSTModel, diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index d2144cd65eca44..f8ee3f75a9530a 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -41,7 +41,7 @@ class PatchTSTConfig(PretrainedConfig): num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. - context_length (`int`, defaults to 32): + context_length (`int`, defaults to 32, *optional*, defaults to 32): The context length for the encoder. distribution_output (`string`, *optional*, defaults to `"student_t"`): The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or @@ -54,15 +54,15 @@ class PatchTSTConfig(PretrainedConfig): Define the patch length of the patchification process. Default to 1. stride (`int`, *optional*, defaults to 1): define the stride of the patchification process. Default to 1. - encoder_layers (`int`, *optional*, defaults to 2): + encoder_layers (`int`, *optional*, defaults to 3): Number of encoder layers. d_model (`int`, *optional*, defaults to 64): Dimensionality of the transformer layers. encoder_attention_heads (`int`, *optional*, defaults to 4): Number of attention heads for each attention layer in the Transformer encoder. - shared_embedding (`bool`, *optional*, defaults to True): + shared_embedding (`bool`, *optional*, defaults to `True`): Sharing the input embedding across all channels. - channel_attention (`bool`, *optional*, defaults to False): + channel_attention (`bool`, *optional*, defaults to `False`): Activate channel attention block in the Transformer to allow channels to attend each other. encoder_ffn_dim (`int`, *optional*, defaults to 256): Dimension of the "intermediate" (often named feed-forward) layer in encoder. 
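For orientation, the patching hyperparameters documented in this config (context length, patch length, stride) fix how many tokens the encoder sees per channel. A minimal sketch of that relationship, assuming the argument names listed in the docstring above; the values are made up for illustration and the formula mirrors the helper used in the test file earlier in this series:

```python
from transformers import PatchTSTConfig

# Example values only; any real checkpoint defines its own hyperparameters.
config = PatchTSTConfig(
    num_input_channels=7,
    context_length=32,
    patch_length=8,
    stride=8,
    d_model=64,
    encoder_layers=3,
    encoder_attention_heads=4,
)

# One patch every `stride` steps over the context window,
# the same computation as in the test helper.
num_patches = (max(config.context_length, config.patch_length) - config.patch_length) // config.stride + 1
print(num_patches)  # 4 with these example values
```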
@@ -78,23 +78,24 @@ class PatchTSTConfig(PretrainedConfig): The dropout path in the residual block. ff_dropout (`float`, *optional*, defaults to 0.0): The dropout probability used between the two layers of the feed-forward networks. - bias (`bool`, *optional*, defaults to True): + bias (`bool`, *optional*, defaults to `True`): Consider bias in the feed-forward networks. activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. + pre_norm (`bool`, *optional*, defaults to `False`): positional_encoding (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. - learn_pe (`bool`, *optional*, defaults to False): + learn_pe (`bool`, *optional*, defaults to `False`): Whether the positional encoding is updated during training. - use_cls_token (`bool`, *optional*, defaults to False): + use_cls_token (`bool`, *optional*, defaults to `False`): Whether cls token is used. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated normal weight initialization distribution. - shared_projection (`bool`, *optional*, defaults to True): + shared_projection (`bool`, *optional*, defaults to `True`): Sharing the projection layer across different channels in the forecast head. - seed_number (`int`, *optional*, defaults to None): + seed_number (`int`, *optional*): Use seed number for random masking. - scaling (`string` or `bool`, *optional* defaults to `"mean"`): + scaling (`string` or `bool`, *optional*, defaults to `"mean"`): Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the scaler is set to "mean". mask_input (`bool`, *optional*, defaults to False): @@ -103,9 +104,11 @@ class PatchTSTConfig(PretrainedConfig): Masking type. Only `"random"` is currently supported. mask_ratio (`float`, *optional*, defaults to 0.5): Masking ratio is applied to mask the input data during pretraining. - channel_consistent_masking (`bool`, *optional*, defaults to False): + mask_patches (`List`, *optional*, defaults to `[2, 3]`): + mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): + channel_consistent_masking (`bool`, *optional*, defaults to `False`): If channel consistent masking is True, all the channels will have the same masking. - unmasked_channel_indices (`list`, *optional*, defaults to None): + unmasked_channel_indices (`list`, *optional*): Channels are not masked during pretraining. mask_value (`int`, *optional*, defaults to 0): Mask value to set. @@ -113,13 +116,11 @@ class PatchTSTConfig(PretrainedConfig): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. head_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for head. - prediction_length (`int`): - The prediction length for the encoder. In other words, the prediction horizon of the model. - prediction_length (`int`): + prediction_length (`int`, *optional*, defaults to 24): The prediction length for the encoder. In other words, the prediction horizon of the model. num_output_channels (`int`, *optional*, defaults to 1): Number of output channels. - prediction_range (`list`, *optional*, defaults to None): + prediction_range (`list`, *optional*): The range of prediction values can be set to enforce the model to produce values within a range. 
num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for probablistic forecast. diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b54f50346bb79c..e7ecad05adf5c7 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -845,9 +845,9 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): @dataclass -class PatchTSTForMaskPretrainingOutput(ModelOutput): +class PatchTSTForPretrainingOutput(ModelOutput): """ - Output type of [`PatchTSTForMaskPretraining`]. + Output type of [`PatchTSTForPretraining`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): @@ -1288,7 +1288,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class PatchTSTForMaskPretraining(PatchTSTPreTrainedModel): +class PatchTSTForPretraining(PatchTSTPreTrainedModel): # PatchTSTModel + Pretraining Head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1308,7 +1308,7 @@ def forward( future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, PatchTSTForMaskPretrainingOutput]: + ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels """ @@ -1332,7 +1332,7 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: return tuple(v for v in [masked_loss, x_hat, encoder_states] if v is not None) - return PatchTSTForMaskPretrainingOutput( + return PatchTSTForPretrainingOutput( loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 45522f0c8da893..d60b81511deda0 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5921,7 +5921,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PatchTSTForMaskPretraining(metaclass=DummyObject): +class PatchTSTForPretraining(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index c669ef6f44959e..14a47cd8ad523b 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -41,7 +41,7 @@ PatchTSTConfig, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForMaskPretraining, + PatchTSTForPretraining, PatchTSTForPrediction, PatchTSTForRegression, PatchTSTModel, @@ -149,7 +149,7 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase PatchTSTModel, PatchTSTForPrediction, PatchTSTForForecasting, - PatchTSTForMaskPretraining, + PatchTSTForPretraining, PatchTSTForClassification, PatchTSTForRegression, ) @@ -157,7 +157,7 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase else () ) all_generative_model_classes = ( - (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForMaskPretraining) if is_torch_available() else () + (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () ) pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} test_pruning = False @@ -305,7 
+305,7 @@ def prepare_batch(repo_id="ibm/etth1-forecast-test", file="train-batch.pt"): class PatchTSTModelIntegrationTests(unittest.TestCase): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. def test_pretrain_head(self): - model = PatchTSTForMaskPretraining.from_pretrained("ibm/patchtst-etth1-pretrain").to(torch_device) + model = PatchTSTForPretraining.from_pretrained("ibm/patchtst-etth1-pretrain").to(torch_device) batch = prepare_batch() torch.manual_seed(0) diff --git a/utils/check_repo.py b/utils/check_repo.py index 33cd397a627710..5358f4854b410e 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -179,7 +179,7 @@ "InformerForPrediction", "AutoformerForPrediction", "PatchTSTForForecasting", - "PatchTSTForMaskPretraining", + "PatchTSTForPretraining", "PatchTSTForPrediction", "JukeboxVQVAE", "JukeboxPrior", From cd0133f77b7590caa4f3a593210291a80dc27f69 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 13:36:08 +0200 Subject: [PATCH 097/189] bs -> batch_size --- .../models/patchtst/modeling_patchtst.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e7ecad05adf5c7..5a48e47ff6f63b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -594,14 +594,14 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [bs x nvars x sequence_length x d_model] Return: - Tensor [bs x nvars x sequence_length x d_model] + src: tensor [batch_size x nvars x sequence_length x d_model] Return: + Tensor [batch_size x nvars x sequence_length x d_model] """ - bs, num_input_channels, sequence_length, d_model = src.shape + batch_size, num_input_channels, sequence_length, d_model = src.shape # First sublayer: attention across time src = src.view( - bs * num_input_channels, sequence_length, d_model + batch_size * num_input_channels, sequence_length, d_model ) # src: [(bs*nvars) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection @@ -613,7 +613,7 @@ def forward(self, src: torch.Tensor): src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src)[0]) ) # src: [(bs*nvars) x sequence_length x d_model] - src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = src.reshape(batch_size, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] @@ -632,13 +632,13 @@ def forward(self, src: torch.Tensor): src + self.dropout_path2(self.self_attn(src)[0]) ) # src: [(bs*sequence_length) x nvars x d_model] src = ( - src.reshape(bs, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() + src.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() ) # src: [bs x nvars x sequence_length x d_model] # Third sublayer: mixing across hidden src = src.view( - bs * num_input_channels, sequence_length, d_model - ) # src: [(bs*nvars) x sequence_length x d_model] + batch_size * num_input_channels, sequence_length, d_model + ) # src: [(batch_size*nvars) x sequence_length x d_model] if 
self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( @@ -649,7 +649,7 @@ def forward(self, src: torch.Tensor): src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout - src = src.reshape(bs, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = src.reshape(batch_size, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] return src From bc2bf31f76567f3319541bb64d3580290688d004 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 15:09:55 +0200 Subject: [PATCH 098/189] more review fixes --- .../models/patchtst/modeling_patchtst.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5a48e47ff6f63b..d3da375028b995 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -545,9 +545,8 @@ def __init__(self, config: PatchTSTConfig): super().__init__() self.channel_attention = config.channel_attention - # Multi-Head attention - # self.self_attn = PatchTSTAttention(config) + # Multi-Head attention self.self_attn = PatchTSTAttention( embed_dim=config.d_model, num_heads=config.encoder_attention_heads, @@ -673,11 +672,11 @@ def _init_weights(self, module): module.bias.data.zero_() def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (ChannelAttentionPatchTSTEncoder)): + if isinstance(module, (PatchTSTEncoder)): module.gradient_checkpointing = value -class ChannelAttentionPatchTSTEncoder(PatchTSTPreTrainedModel): +class PatchTSTEncoder(PatchTSTPreTrainedModel): def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -730,8 +729,7 @@ def forward( tensor [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token """ - # bs, num_patches, num_input_channels, patch_length = x.shape - bs, num_input_channels, num_patches, patch_length = past_values.shape + _, num_input_channels, _, _ = past_values.shape output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1222,7 +1220,7 @@ def __init__(self, config: PatchTSTConfig): ) else: self.masking = nn.Identity() - self.encoder = ChannelAttentionPatchTSTEncoder(config) + self.encoder = PatchTSTEncoder(config) # Initialize weights and apply final processing self.post_init() From b8a8231781490d1fa66799288447dabe47a85ab1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 15:15:46 +0200 Subject: [PATCH 099/189] use hidden_state --- src/transformers/__init__.py | 4 +-- src/transformers/models/patchtst/__init__.py | 2 +- .../models/patchtst/modeling_patchtst.py | 26 ++++++++++--------- .../models/patchtst/test_modeling_patchtst.py | 2 +- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d94188d42f5e8a..e43c7ef59175cb 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2439,8 +2439,8 @@ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTForClassification", "PatchTSTForForecasting", - "PatchTSTForPretraining", "PatchTSTForPrediction", + "PatchTSTForPretraining", "PatchTSTForRegression", "PatchTSTModel", "PatchTSTPreTrainedModel", 
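The reshapes renamed in the hunk above fold either the channel dimension or the time dimension into the batch dimension, so the same attention module can attend across time and then across channels. A shape-only sketch of that pattern, using random tensors and example sizes rather than trained weights:

```python
import torch

batch_size, num_channels, sequence_length, d_model = 2, 7, 12, 64
src = torch.randn(batch_size, num_channels, sequence_length, d_model)

# Attention across time: fold channels into the batch dimension.
src_time = src.view(batch_size * num_channels, sequence_length, d_model)

# Attention across channels at each time step: fold time into the batch dimension.
src_channel = src.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_channels, d_model)

print(src_time.shape)     # torch.Size([14, 12, 64])
print(src_channel.shape)  # torch.Size([24, 7, 64])
```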
@@ -6269,8 +6269,8 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForPretraining, PatchTSTForPrediction, + PatchTSTForPretraining, PatchTSTForRegression, PatchTSTModel, PatchTSTPreTrainedModel, diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index 8ca9b1f88eb8c4..e2ac594688d90e 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -55,8 +55,8 @@ PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForPretraining, PatchTSTForPrediction, + PatchTSTForPretraining, PatchTSTForRegression, PatchTSTModel, PatchTSTPreTrainedModel, diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d3da375028b995..74d92b4ec88081 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -524,20 +524,20 @@ def __init__(self, config: PatchTSTConfig): self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, src: torch.Tensor, output_hidden_states: Optional[bool] = None): + def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - src: tensor [bs x nvars x sequence_length x d_model] Return: + hidden_state: tensor [bs x nvars x sequence_length x d_model] Return: Tensor [bs x nvars x sequence_length x d_model] """ all_hidden_states = [] for mod in self.layers: - src = mod(src) + hidden_state = mod(hidden_state) if output_hidden_states: - all_hidden_states.append(src) + all_hidden_states.append(hidden_state) if output_hidden_states is None: - return src, None - return src, all_hidden_states + return hidden_state, None + return hidden_state, all_hidden_states class ChannelAttentionTSTEncoderLayer(nn.Module): @@ -612,13 +612,15 @@ def forward(self, src: torch.Tensor): src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src)[0]) ) # src: [(bs*nvars) x sequence_length x d_model] - src = src.reshape(batch_size, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = src.reshape( + batch_size, num_input_channels, sequence_length, d_model + ) # [bs x nvars x sequence_length x d_model] # second sublayer: attention across variable at any given time # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] if self.channel_attention: src = ( - src.transpose(2, 1).contiguous().view(bs * sequence_length, num_input_channels, d_model) + src.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) ) # [(bs*sequence_length) x nvars x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection @@ -648,7 +650,9 @@ def forward(self, src: torch.Tensor): src = self.norm_sublayer3( src + self.dropout_path3(self.ff(src)) ) # Add: residual connection with residual dropout - src = src.reshape(batch_size, num_input_channels, sequence_length, d_model) # [bs x nvars x sequence_length x d_model] + src = src.reshape( + batch_size, num_input_channels, sequence_length, d_model + ) # [bs x nvars x sequence_length x d_model] return src @@ -1330,9 +1334,7 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: return tuple(v for v in [masked_loss, x_hat, encoder_states] if 
v is not None) - return PatchTSTForPretrainingOutput( - loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states - ) + return PatchTSTForPretrainingOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states) class PatchTSTForClassification(PatchTSTPreTrainedModel): diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 14a47cd8ad523b..d25cc525326ab5 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -41,8 +41,8 @@ PatchTSTConfig, PatchTSTForClassification, PatchTSTForForecasting, - PatchTSTForPretraining, PatchTSTForPrediction, + PatchTSTForPretraining, PatchTSTForRegression, PatchTSTModel, ) From 8c3ab7f44585420f9f09b473b2fedb385ba3c98a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 9 Oct 2023 15:25:11 +0200 Subject: [PATCH 100/189] rename encoder layer and block class --- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 74d92b4ec88081..faba4597a0f01a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -518,11 +518,11 @@ def forward(self, x: torch.Tensor): return x_mask, mask -class ChannelAttentionTSTEncoder(nn.Module): +class PatchTSTEncoderBlock(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.layers = nn.ModuleList([ChannelAttentionTSTEncoderLayer(config) for i in range(config.encoder_layers)]) + self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ @@ -540,7 +540,7 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo return hidden_state, all_hidden_states -class ChannelAttentionTSTEncoderLayer(nn.Module): +class PatchTSTEncoderLayer(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() @@ -716,7 +716,7 @@ def __init__(self, config: PatchTSTConfig): ) # Encoder - self.encoder = ChannelAttentionTSTEncoder(config) + self.encoder = PatchTSTEncoderBlock(config) # Initialize weights and apply final processing self.post_init() From 2553965ad3f406282b844f12172e9c853513b682 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 9 Oct 2023 13:55:45 -0400 Subject: [PATCH 101/189] remove commented seed_number --- src/transformers/models/patchtst/modeling_patchtst.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index faba4597a0f01a..128f6e772f6739 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -464,8 +464,6 @@ def __init__( mask_value=0, seed_number: Optional[int] = None, ): - # if seed_number: - # set_seed(seed_number) self.mask_ratio = mask_ratio self.channel_consistent_masking = channel_consistent_masking self.mask_type = mask_type From 85538b163b7c32892f09d35036a0d7c87b16bd97 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 9 Oct 2023 15:11:12 -0400 Subject: [PATCH 102/189] edit docstring --- .../models/patchtst/modeling_patchtst.py | 66 ++++++++++++------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git 
a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 128f6e772f6739..87ad4673d9b81d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -310,7 +310,7 @@ def forecast_masking( For every batch, distribute the patch lengths based on mix_ratio and ignore masks for column indices mentioned in unmasked_channel_indices. - Args: + Parameters: inputs (`torch.Tensor`): Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] patch_lengths (list): @@ -383,7 +383,7 @@ class PatchTSTPatchify(nn.Module): stride (int, required): stride between patches. Returns: - z: output tensor data [bs x num_input_channels x num_patches x patch_length] + `torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)` """ def __init__( @@ -412,10 +412,10 @@ def __init__( def forward(self, past_values: torch.Tensor): """ Parameters: - past_values (torch.Tensor, required): Input of shape [bs x sequence_length x num_input_channels] + past_values (`torch.Tensor` of shape `(batch_size, sequence_length, nvars)`, *required*): Returns: - x: output tensor data [bs x num_input_channels x num_patches x patch_length] + `torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)` """ sequence_length = past_values.shape[-2] if sequence_length != self.sequence_length: @@ -433,9 +433,9 @@ def forward(self, past_values: torch.Tensor): class PatchTSTMasking(nn.Module): """ - PatchTSTMasking: Class for random or forcast masking on inputs. + Class for random or forcast masking on inputs. - Args: + Parameters: mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. mask_ratio (float, optional): Mask ratio. mask_patches (list, optional): List of patch lengths to mask in the end of the data. @@ -450,6 +450,10 @@ class PatchTSTMasking(nn.Module): seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. 
Returns: + x_mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`) + Masked patched input + mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches)`) + Bool tensor indicating True on masked points """ @@ -479,15 +483,16 @@ def __init__( def forward(self, x: torch.Tensor): """ - Input: - x: patched input - 4D: [bs x num_input_channels x num_patches x patch_length] - - Output: - x_mask: Masked patched input - 4D: [bs x num_input_channels x num_patches x patch_length] - mask: bool tensor indicating True on masked points - 4D: [bs x num_input_channels x num_patch] + Parameters: + x (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`, *required*): + Patched input + + Return: + x_mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`) + Masked patched input + mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches)`) + Bool tensor indicating True on masked points + """ if self.mask_type == "random": @@ -517,6 +522,9 @@ def forward(self, x: torch.Tensor): class PatchTSTEncoderBlock(nn.Module): + """ + PatchTST encoder block + """ def __init__(self, config: PatchTSTConfig): super().__init__() @@ -524,8 +532,14 @@ def __init__(self, config: PatchTSTConfig): def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ - hidden_state: tensor [bs x nvars x sequence_length x d_model] Return: - Tensor [bs x nvars x sequence_length x d_model] + Parameters: + hidden_state (`torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)`, *required*): + Past values of the time series + output_hidden_states (`bool`, *optional*): + output hidden state option + Return: + `torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)` + """ all_hidden_states = [] @@ -539,6 +553,9 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo class PatchTSTEncoderLayer(nn.Module): + """ + PatchTST encoder layer + """ def __init__(self, config: PatchTSTConfig): super().__init__() @@ -591,8 +608,12 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ - src: tensor [batch_size x nvars x sequence_length x d_model] Return: - Tensor [batch_size x nvars x sequence_length x d_model] + Parameters: + src (`torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)`, *required*): + Past values of the time series + Return: + `torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)` + """ batch_size, num_input_channels, sequence_length, d_model = src.shape @@ -724,12 +745,13 @@ def forward( ) -> BaseModelOutputWithNoAttention: """ Parameters: - past_values: tensor [bs x nvars x num_patches x patch_length]. + past_values (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`, *required*): + Past values of the time series output_hidden_states (bool, optional): Indicates if hidden states should be output. 
return: - tensor [bs x nvars x num_patches x d_model] - or [bs x nvars x (num_patches+1) x d_model] if use cls_token + `torch.Tensor` of shape `(batch_size, nvars, num_patches, d_model)` + or `(batch_size, nvars, num_patches+1, d_model)` if cls_token is used """ _, num_input_channels, _, _ = past_values.shape From c36370deaa3d008c6f6254d113244ab8d75bd38c Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 9 Oct 2023 17:48:00 -0400 Subject: [PATCH 103/189] Add docstring --- .../models/patchtst/modeling_patchtst.py | 232 ++++++++++-------- 1 file changed, 130 insertions(+), 102 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 87ad4673d9b81d..3ebf831273f73c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -42,7 +42,14 @@ # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->PatchTST class PatchTSTAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" + """ + Multi-headed attention from 'Attention Is All You Need' paper + + Parameters: + hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): + Input to the multi-head attention block + + """ def __init__( self, @@ -71,8 +78,8 @@ def __init__( self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, sequence_length: int, bsz: int): + return tensor.view(bsz, sequence_length, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, @@ -196,11 +203,23 @@ def forward( class PatchTSTTranspose(nn.Module): + """ + Transpose the tensor to the dimension defined in **dims** + Parameters: + dims (`list`): list of dimensions to be transposed + contiguous (`bool`): if True, the transposed tensor is contiguous + """ def __init__(self, *dims, contiguous=False): super().__init__() self.dims, self.contiguous = dims, contiguous - def forward(self, inputs): + def forward(self, inputs: torch.Tensor): + """ + Parameters: + inputs (`torch.Tensor`): input to be transposed + Returns: + `torch.Tensor`: transposed tensor + """ if self.contiguous: return inputs.transpose(*self.dims).contiguous() else: @@ -244,13 +263,13 @@ def random_masking( mask_ratio: float, unmasked_channel_indices: list = None, channel_consistent_masking: bool = False, - mask_value=0, + mask_value: int = 0, seed_number: Optional[int] = None, ): """random_masking: Mask the input considering the control variables. Args: - inputs (`torch.Tensor` of shape `(batch_size, nvars, seq_len, feat)`): + inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`): The input tensor to mask. mask_ratio (`float`): Mask ratio. 
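The `random_masking` hunks above and below rank per-patch noise to decide which patches get hidden during pretraining. A small self-contained sketch of that masking logic, with example shapes and a hypothetical mask ratio:

```python
import torch

batch_size, num_channels, num_patches, patch_length = 2, 3, 5, 4
patches = torch.randn(batch_size, num_channels, num_patches, patch_length)

mask_ratio = 0.4
len_keep = int(num_patches * (1 - mask_ratio))             # patches kept visible per series

noise = torch.rand(batch_size, num_channels, num_patches)  # one score per patch
ids_shuffle = torch.argsort(noise, dim=-1)
ids_restore = torch.argsort(ids_shuffle, dim=-1)

mask = torch.ones(batch_size, num_channels, num_patches)
mask[:, :, :len_keep] = 0                                   # 0 = keep, 1 = mask
mask = torch.gather(mask, dim=-1, index=ids_restore)        # scatter the pattern back in random order
mask = mask.unsqueeze(-1).repeat(1, 1, 1, patch_length)

masked_patches = patches.masked_fill(mask.bool(), 0.0)      # mask_value of 0, as in the config default
```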
@@ -271,26 +290,26 @@ def random_masking( if seed_number: set_seed(seed_number) - batch_size, nvars, seq_len, feat = inputs.shape + batch_size, num_channels, sequence_length, num_features = inputs.shape device = inputs.device - len_keep = int(seq_len * (1 - mask_ratio)) + len_keep = int(sequence_length * (1 - mask_ratio)) if channel_consistent_masking: - noise = torch.rand(batch_size, 1, seq_len, device=device) # noise in [0, 1], bs x 1 x L - noise = noise.repeat(1, nvars, 1) # bs x nvars x time + noise = torch.rand(batch_size, 1, sequence_length, device=device) # noise in [0, 1], bs x 1 x L + noise = noise.repeat(1, num_channels, 1) # bs x num_channels x time else: - noise = torch.rand(batch_size, nvars, seq_len, device=device) # noise in [0, 1], bs x nvars x L + noise = torch.rand(batch_size, num_channels, sequence_length, device=device) # noise in [0, 1], bs x num_channels x L - mask = torch.ones(batch_size, nvars, seq_len, device=device) # mask: [bs x nvars x num_patch] + mask = torch.ones(batch_size, num_channels, sequence_length, device=device) # mask: [bs x num_channels x num_patch] mask[:, :, :len_keep] = 0 # sort noise for each sample ids_shuffle = torch.argsort(noise, dim=-1) # ascend: small is keep, large is remove - ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x nvars x L] + ids_restore = torch.argsort(ids_shuffle, dim=-1) # ids_restore: [bs x num_channels x L] mask = torch.gather(mask, dim=-1, index=ids_restore) - mask = mask.unsqueeze(-1).repeat(1, 1, 1, feat) # mask: [bs x nvars x num_patches x patch_length] + mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features) # mask: [bs x num_channels x num_patches x patch_length] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -312,7 +331,7 @@ def forecast_masking( Parameters: inputs (`torch.Tensor`): - Input to mask [ bs x nvars x num_patch x patch_len] or [ bs x tsg1 x tag2 x nvars x num_patch x patch_len] + Input to mask [ bs x num_channels x num_patch x patch_len] or [ bs x tsg1 x tag2 x num_channels x num_patch x patch_len] patch_lengths (list): List of patch lengths to mask in the end of the data. mix_ratio (list, *optional* defaults to None): @@ -335,15 +354,15 @@ def forecast_masking( if mix_ratio is None: mix_ratio = [1 for t in patch_lengths] - batch_size, nvars, seq_len, feat = inputs.shape - mask = torch.zeros(batch_size, nvars, seq_len, device=inputs.device) + batch_size, num_channels, sequence_length, num_features = inputs.shape + mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device) t_list = [] total_length = 0 total_ratio = sum(mix_ratio) for i, j in zip(patch_lengths, mix_ratio): - if i <= 0 or i >= seq_len: + if i <= 0 or i >= sequence_length: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") temp_len = int(batch_size * j / total_ratio) t_list.append([i, j, temp_len]) @@ -365,7 +384,7 @@ def forecast_masking( perm = torch.randperm(mask.shape[0]) mask = mask[perm] - mask = mask.unsqueeze(-1).repeat(1, 1, 1, feat) # mask: [bs x nvars x num_patch x patch_len] + mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features) # mask: [bs x num_channels x num_patch x patch_len] if unmasked_channel_indices is not None: mask[:, unmasked_channel_indices, :, :] = 0 @@ -383,7 +402,7 @@ class PatchTSTPatchify(nn.Module): stride (int, required): stride between patches. 
Returns: - `torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)` + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` """ def __init__( @@ -412,10 +431,10 @@ def __init__( def forward(self, past_values: torch.Tensor): """ Parameters: - past_values (`torch.Tensor` of shape `(batch_size, sequence_length, nvars)`, *required*): + past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*): Returns: - `torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)` + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` """ sequence_length = past_values.shape[-2] if sequence_length != self.sequence_length: @@ -423,11 +442,11 @@ def forward(self, past_values: torch.Tensor): f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." ) - x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x nvars] + x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x num_channels] x = x.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # x: [bs x num_patches x num_input_channels x patch_length] - x = x.transpose(-2, -3).contiguous() # xb: [bs x num_input_channels x num_patches x patch_length] + x = x.transpose(-2, -3).contiguous() # x: [bs x num_input_channels x num_patches x patch_length] return x @@ -450,9 +469,9 @@ class PatchTSTMasking(nn.Module): seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. Returns: - x_mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`) + x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) Masked patched input - mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches)`) + mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`) Bool tensor indicating True on masked points """ @@ -484,13 +503,13 @@ def __init__( def forward(self, x: torch.Tensor): """ Parameters: - x (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`, *required*): + x (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Patched input Return: - x_mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`) + x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) Masked patched input - mask (`torch.Tensor` of shape `(batch_size, nvars, num_patches)`) + mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`) Bool tensor indicating True on masked points """ @@ -533,12 +552,12 @@ def __init__(self, config: PatchTSTConfig): def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ Parameters: - hidden_state (`torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)`, *required*): + hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): Past values of the time series output_hidden_states (`bool`, *optional*): output hidden state option Return: - `torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)` + `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ all_hidden_states = [] @@ -609,10 +628,10 @@ def __init__(self, config: PatchTSTConfig): def forward(self, src: torch.Tensor): """ Parameters: - src (`torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)`, *required*): + src (`torch.Tensor` of shape 
`(batch_size, num_channels, sequence_length, d_model)`, *required*): Past values of the time series Return: - `torch.Tensor` of shape `(batch_size, nvars, sequence_length, d_model)` + `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ batch_size, num_input_channels, sequence_length, d_model = src.shape @@ -620,7 +639,7 @@ def forward(self, src: torch.Tensor): # First sublayer: attention across time src = src.view( batch_size * num_input_channels, sequence_length, d_model - ) # src: [(bs*nvars) x sequence_length x d_model] + ) # src: [(bs*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( @@ -630,17 +649,18 @@ def forward(self, src: torch.Tensor): ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT src = self.norm_sublayer1( src + self.dropout_path1(self.self_attn(src)[0]) - ) # src: [(bs*nvars) x sequence_length x d_model] + ) # src: [(bs*num_channels) x sequence_length x d_model] src = src.reshape( batch_size, num_input_channels, sequence_length, d_model - ) # [bs x nvars x sequence_length x d_model] + ) # [bs x num_channels x sequence_length x d_model] # second sublayer: attention across variable at any given time - # [bs x nvars x sequence_length x d_model] -> [bs x sequence_length x nvars x d_model] -> [(bs*sequence_length) x nvars x d_model] + # [bs x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] + # -> [(bs*sequence_length) x num_channels x d_model] if self.channel_attention: src = ( src.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) - ) # [(bs*sequence_length) x nvars x d_model] + ) # [(bs*sequence_length) x num_channels x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( @@ -650,15 +670,15 @@ def forward(self, src: torch.Tensor): ## Multi-Head attention and Add residual connection and Norm src = self.norm_sublayer2( src + self.dropout_path2(self.self_attn(src)[0]) - ) # src: [(bs*sequence_length) x nvars x d_model] + ) # src: [(bs*sequence_length) x num_channels x d_model] src = ( src.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() - ) # src: [bs x nvars x sequence_length x d_model] + ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden src = src.view( batch_size * num_input_channels, sequence_length, d_model - ) # src: [(batch_size*nvars) x sequence_length x d_model] + ) # src: [(batch_size*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection src = src + self.dropout_path3( @@ -671,7 +691,7 @@ def forward(self, src: torch.Tensor): ) # Add: residual connection with residual dropout src = src.reshape( batch_size, num_input_channels, sequence_length, d_model - ) # [bs x nvars x sequence_length x d_model] + ) # [bs x num_channels x sequence_length x d_model] return src @@ -745,13 +765,13 @@ def forward( ) -> BaseModelOutputWithNoAttention: """ Parameters: - past_values (`torch.Tensor` of shape `(batch_size, nvars, num_patches, patch_length)`, *required*): + past_values (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Past values of the time series output_hidden_states (bool, optional): Indicates if hidden states should be output. 
return: - `torch.Tensor` of shape `(batch_size, nvars, num_patches, d_model)` - or `(batch_size, nvars, num_patches+1, d_model)` if cls_token is used + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` + or `(batch_size, num_channels, num_patches+1, d_model)` if cls_token is used """ _, num_input_channels, _, _ = past_values.shape @@ -766,23 +786,23 @@ def forward( x_out.append(z) past_values = torch.stack(x_out, dim=1) else: - past_values = self.w_p(past_values) # x: [bs x nvars x num_patches x d_model] + past_values = self.w_p(past_values) # x: [bs x num_channels x num_patches x d_model] if self.use_cls_token: - # x: [bs x nvars x num_patches x d_model] + # x: [bs x num_channels x num_patches x d_model] past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x nvars x (num_patches+1) x d_model] + past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x num_channels x (num_patches+1) x d_model] else: - past_values = self.positional_dropout(past_values + self.w_pos) # x: [bs x nvars x num_patches x d_model] + past_values = self.positional_dropout(past_values + self.w_pos) # x: [bs x num_channels x num_patches x d_model] # Encoder past_values, hidden_states = self.encoder( past_values, output_hidden_states - ) # x: [bs x nvars x num_patches x d_model] - # or [bs x nvars x (num_patches+1) x d_model] if use cls_token + ) # x: [bs x num_channels x num_patches x d_model] + # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token # return past_values, hidden_states return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) @@ -1098,7 +1118,7 @@ class PatchTSTStdScaler(nn.Module): Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - Args: + Parameters: dim (`int`): Dimension along which to calculate the mean and standard deviation. keepdim (`bool`, *optional*, defaults to `False`): @@ -1132,7 +1152,7 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data accordingly. - Args: + Parameters: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -1189,7 +1209,7 @@ class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - Args: + Parameters: dim (`int`): Dimension along which to compute the scale. 
keepdim (`bool`, *optional*, defaults to `False`): @@ -1265,7 +1285,7 @@ def forward( if past_observed_mask is None: past_observed_mask = torch.ones_like(past_values) - # x: tensor [bs x seq_len x in_channels] + # x: tensor [bs x sequence_length x num_input_channels] scaled_past_values, loc, scale = self.scaler(past_values, past_observed_mask) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain @@ -1300,11 +1320,11 @@ def __init__(self, config): def forward(self, x: torch.Tensor) -> torch.Tensor: """ - x: tensor [bs x nvars x num_patches x d_model] - or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: tensor [bs x nvars x num_patches x patch_length] + x: tensor [bs x num_channels x num_patches x d_model] + or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + output: tensor [bs x num_channels x num_patches x patch_length] """ - x = self.linear(self.dropout(x)) # [bs x nvars x num_patches x patch_length] + x = self.linear(self.dropout(x)) # [bs x num_channels x num_patches x patch_length] if self.use_cls_token: x = x[:, :, 1:, :] # remove the first cls token return x @@ -1339,12 +1359,12 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # past_values: [bs x nvars x num_patches x d_model] or - # [bs x nvars x (num_patches+1) x d_model] if use cls_token + # past_values: [bs x num_channels x num_patches x d_model] or + # [bs x num_channels x (num_patches+1) x d_model] if use cls_token model_output = self.model(past_values, output_hidden_states=output_hidden_states) - # model_output[0]: [bs x nvars x num_patches x patch_length] or - # [bs x nvars x (num_patches+1) x patch_length] if use cls_token + # model_output[0]: [bs x num_channels x num_patches x patch_length] or + # [bs x num_channels x (num_patches+1) x patch_length] if use cls_token x_hat = self.head(model_output[0]) # calculate masked_loss @@ -1406,21 +1426,22 @@ def __init__(self, config: PatchTSTConfig): self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_labels) - def forward(self, x: torch.Tensor): + def forward(self, embedding: torch.Tensor): """ - x: [bs x nvars x num_patches x d_model] or [bs x nvars x (num_patches+1) x d_model] if use cls_token output: + embedding: [bs x num_channels x num_patches x d_model] + or [bs x num_channels x (num_patches+1) x d_model] if use cls_token output: [bs x n_classes] """ if self.use_cls_token: - x = x[:, :, 0, :] # use the first output token, x: bs x nvars x d_model + x = embedding[:, :, 0, :] # use the first output token, x: bs x num_channels x d_model elif self.pooling == "mean": - x = x.mean(dim=2) # x: [bs x nvars x d_model] + x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] elif self.pooling == "max": - x = x.max(dim=2) # x: [bs x nvars x d_model] + x = embedding.max(dim=2) # x: [bs x num_channels x d_model] else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") - x = self.flatten(x) # x: bs x nvars * d_model + x = self.flatten(x) # x: bs x num_channels * d_model y = self.linear(self.dropout(x)) # y: bs x n_classes return y @@ -1443,37 +1464,40 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): else: self.projection = distribution_output.get_parameter_projection(head_dim) - def forward(self, x: torch.Tensor): + def forward(self, embedding: torch.Tensor): """ - x: [bs x nvars x num_patch x 
d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + embedding: [bs x num_channels x num_patch x d_model] + or [bs x num_channels x (num_patch+1) x d_model] if use cls_token output: [bs x pred_len x num_output_channels] """ - batch_size = x.shape[0] + batch_size = embedding.shape[0] if self.use_cls_token: - x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] elif self.pooling == "mean": - x = x.mean(dim=2) # x: [bs x nvars x d_model] + x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] elif self.pooling == "max": - x = x.max(dim=2) # x: [bs x nvars x d_model] + x = embedding.max(dim=2) # x: [bs x num_channels x d_model] else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + x = self.dropout(self.flatten(x)) # x: bs x (num_channels * d_model) # projection y = self.projection(x) # reshape y if isinstance(y, tuple): # for distribution head y = ( z.reshape(batch_size, -1, self.num_output_channels) for z in y - ) # tuple of [bs x pred_len x num_output_channels] + ) # tuple of [bs x prediction_len x num_output_channels] else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x pred_len x num_output_channels] + y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x prediction_len x num_output_channels] return y class PatchTSTForPrediction(PatchTSTPreTrainedModel): + """ + + """ # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1544,7 +1568,7 @@ def generate( """ Generate sequences of sample predictions from a model with a probability distribution head. - Parameters: + Args: past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): Past values of the time series that serves as context in order to predict the future. 
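The heads touched in these hunks all reduce the per-patch encoder output to one vector per channel (cls token, mean, or max pooling) before flattening and projecting. A shape-only sketch of the mean-pooling variant; the sizes and the output dimension of 2 are example values, not model defaults:

```python
import torch
import torch.nn as nn

batch_size, num_channels, num_patches, d_model = 2, 3, 5, 16
embedding = torch.randn(batch_size, num_channels, num_patches, d_model)

pooled = embedding.mean(dim=2)                             # [bs x num_channels x d_model]
flattened = nn.Flatten(start_dim=1)(pooled)                # [bs x (num_channels * d_model)]
logits = nn.Linear(num_channels * d_model, 2)(flattened)   # [bs x num_labels]
print(logits.shape)  # torch.Size([2, 2])
```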
@@ -1616,22 +1640,22 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.projection = distribution_output.get_parameter_projection(head_dim) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - def forward(self, x: torch.Tensor): + def forward(self, embedding: torch.Tensor): """ - x: [bs x nvars x num_patches x d_model] - or [bs x nvars x (num_patches+1) x d_model] if use cls_token - output: [bs x forecast_len x nvars] + embedding: [bs x num_channels x num_patches x d_model] + or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + output: [bs x forecast_len x num_channels] """ if self.use_cls_token: - y = x[:, :, 0, :] # y: [bs x nvars x d_model] + y = embedding[:, :, 0, :] # y: [bs x num_channels x d_model] else: if self.pooling == "mean": - y = x.mean(dim=2) # y: [bs x nvars x d_model] + y = embedding.mean(dim=2) # y: [bs x num_channels x d_model] elif self.pooling == "max": - y = x.max(dim=2) # y: [bs x nvars x d_model] + y = embedding.max(dim=2) # y: [bs x num_channels x d_model] else: - y = x # y: [bs x nvars x num_patches x d_model] + y = embedding # y: [bs x num_channels x num_patches x d_model] if not self.shared_projection: x_out = [] @@ -1642,25 +1666,29 @@ def forward(self, x: torch.Tensor): z ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head x_out.append(z) - output = torch.stack(x_out, dim=1) # x: [bs x nvars x forecast_len] + output = torch.stack(x_out, dim=1) # x: [bs x num_channels x forecast_len] else: - z = self.flatten(y) # z: [bs x nvars x (d_model * num_patches)] or [bs x nvars x d_model)] + z = self.flatten(y) # z: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model)] z = self.dropout(z) output = self.projection( z - ) # x: [bs x nvars x forecast_len] or tuple ([bs x nvars x forecast_len], [bs x nvars x forecast_len]) if using distribution head + ) # output: [bs x num_channels x forecast_len] + # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head if isinstance(output, tuple): output = tuple( z.transpose(2, 1) for z in output - ) # ([bs x forecast_len x nvars], [bs x forecast_len x nvars]) + ) # ([bs x forecast_len x num_channels], [bs x forecast_len x num_channels]) else: - output = output.transpose(2, 1) # [bs x forecast_len x nvars] + output = output.transpose(2, 1) # [bs x forecast_len x num_channels] return output class PatchTSTForForecasting(PatchTSTPreTrainedModel): + """ + PatchTST for forecasting + """ # PatchTST model + Forecasting head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1778,9 +1806,9 @@ def generate( # get samples samples = [ distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x forecast_len x nvars] + ] # samples: list of [bs x forecast_len x num_channels] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x nvars] + samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x num_channels] return SamplePatchTSTForecastOutput(sequences=samples) @@ -1802,22 +1830,22 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): else: self.projection = distribution_output.get_parameter_projection(head_dim) - def forward(self, x): + def forward(self, embedding: torch.Tensor): """ - x: [bs x nvars x num_patch x d_model] - or [bs x nvars x (num_patch+1) x d_model] if use cls_token + embedding: [bs x 
num_channels x num_patch x d_model] + or [bs x num_channels x (num_patch+1) x d_model] if use cls_token output: [bs x output_dim] """ if self.use_cls_token: - x = x[:, :, 0, :] # use the first output token, x: [bs x nvars x d_model] + x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] elif self.pooling == "mean": - x = x.mean(dim=2) # x: [bs x nvars x d_model] + x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] elif self.pooling == "max": - x = x.max(dim=2) # x: [bs x nvars x d_model] + x = embedding.max(dim=2) # x: [bs x num_channels x d_model] else: raise Exception(f"pooling operator {self.pooling} is not implemented yet") # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (nvars * d_model) + x = self.dropout(self.flatten(x)) # x: bs x (num_channels * d_model) # projection y = self.projection(x) # y: bs x output_dim or a tuple of this shape for distribution head # From fe3f4d49da315f0c31f6d1b6be4c9359ab999978 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 11:35:35 +0200 Subject: [PATCH 104/189] formatting --- .../models/patchtst/modeling_patchtst.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3ebf831273f73c..55ff3446afe00b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -204,11 +204,12 @@ def forward( class PatchTSTTranspose(nn.Module): """ - Transpose the tensor to the dimension defined in **dims** Parameters: + Transpose the tensor to the dimension defined in **dims** dims (`list`): list of dimensions to be transposed contiguous (`bool`): if True, the transposed tensor is contiguous """ + def __init__(self, *dims, contiguous=False): super().__init__() self.dims, self.contiguous = dims, contiguous @@ -299,9 +300,13 @@ def random_masking( noise = torch.rand(batch_size, 1, sequence_length, device=device) # noise in [0, 1], bs x 1 x L noise = noise.repeat(1, num_channels, 1) # bs x num_channels x time else: - noise = torch.rand(batch_size, num_channels, sequence_length, device=device) # noise in [0, 1], bs x num_channels x L + noise = torch.rand( + batch_size, num_channels, sequence_length, device=device + ) # noise in [0, 1], bs x num_channels x L - mask = torch.ones(batch_size, num_channels, sequence_length, device=device) # mask: [bs x num_channels x num_patch] + mask = torch.ones( + batch_size, num_channels, sequence_length, device=device + ) # mask: [bs x num_channels x num_patch] mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -331,7 +336,8 @@ def forecast_masking( Parameters: inputs (`torch.Tensor`): - Input to mask [ bs x num_channels x num_patch x patch_len] or [ bs x tsg1 x tag2 x num_channels x num_patch x patch_len] + Input to mask [ bs x num_channels x num_patch x patch_len] or [ bs x tsg1 x tag2 x num_channels x num_patch + x patch_len] patch_lengths (list): List of patch lengths to mask in the end of the data. 
mix_ratio (list, *optional* defaults to None): @@ -505,10 +511,10 @@ def forward(self, x: torch.Tensor): Parameters: x (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Patched input - + Return: x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) - Masked patched input + Masked patched input mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`) Bool tensor indicating True on masked points @@ -544,6 +550,7 @@ class PatchTSTEncoderBlock(nn.Module): """ PatchTST encoder block """ + def __init__(self, config: PatchTSTConfig): super().__init__() @@ -575,6 +582,7 @@ class PatchTSTEncoderLayer(nn.Module): """ PatchTST encoder layer """ + def __init__(self, config: PatchTSTConfig): super().__init__() @@ -770,8 +778,8 @@ def forward( output_hidden_states (bool, optional): Indicates if hidden states should be output. return: - `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` - or `(batch_size, num_channels, num_patches+1, d_model)` if cls_token is used + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` or `(batch_size, num_channels, + num_patches+1, d_model)` if cls_token is used """ _, num_input_channels, _, _ = past_values.shape @@ -794,9 +802,13 @@ def forward( # append cls token cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - past_values = torch.cat((cls_tokens, past_values), dim=1) # x: [bs x num_channels x (num_patches+1) x d_model] + past_values = torch.cat( + (cls_tokens, past_values), dim=1 + ) # x: [bs x num_channels x (num_patches+1) x d_model] else: - past_values = self.positional_dropout(past_values + self.w_pos) # x: [bs x num_channels x num_patches x d_model] + past_values = self.positional_dropout( + past_values + self.w_pos + ) # x: [bs x num_channels x num_patches x d_model] # Encoder past_values, hidden_states = self.encoder( @@ -1495,9 +1507,8 @@ def forward(self, embedding: torch.Tensor): class PatchTSTForPrediction(PatchTSTPreTrainedModel): - """ + """ """ - """ # PatchTST model + prediction head def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1670,10 +1681,8 @@ def forward(self, embedding: torch.Tensor): else: z = self.flatten(y) # z: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model)] z = self.dropout(z) - output = self.projection( - z - ) # output: [bs x num_channels x forecast_len] - # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head + output = self.projection(z) # output: [bs x num_channels x forecast_len] + # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head if isinstance(output, tuple): output = tuple( @@ -1689,6 +1698,7 @@ class PatchTSTForForecasting(PatchTSTPreTrainedModel): """ PatchTST for forecasting """ + # PatchTST model + Forecasting head def __init__(self, config: PatchTSTConfig): super().__init__(config) From 11feb7c80d5211c974df856ae389b415174c26f1 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 12:22:34 +0200 Subject: [PATCH 105/189] use past_observed_mask --- src/transformers/models/patchtst/modeling_patchtst.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py 
b/src/transformers/models/patchtst/modeling_patchtst.py index 55ff3446afe00b..26753be95f93d2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -206,8 +206,8 @@ class PatchTSTTranspose(nn.Module): """ Parameters: Transpose the tensor to the dimension defined in **dims** - dims (`list`): list of dimensions to be transposed - contiguous (`bool`): if True, the transposed tensor is contiguous + dims (`list`): list of dimensions to be transposed contiguous (`bool`): if True, the transposed tensor is + contiguous """ def __init__(self, *dims, contiguous=False): @@ -1373,7 +1373,9 @@ def forward( # past_values: [bs x num_channels x num_patches x d_model] or # [bs x num_channels x (num_patches+1) x d_model] if use cls_token - model_output = self.model(past_values, output_hidden_states=output_hidden_states) + model_output = self.model( + past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + ) # model_output[0]: [bs x num_channels x num_patches x patch_length] or # [bs x num_channels x (num_patches+1) x patch_length] if use cls_token From 3af8567bb1a1c0fd199bdc8b8b9cbddf2bff4c1e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 14:30:54 +0200 Subject: [PATCH 106/189] doc suggestion --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 26753be95f93d2..48e732bacbd5a5 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -458,7 +458,7 @@ def forward(self, past_values: torch.Tensor): class PatchTSTMasking(nn.Module): """ - Class for random or forcast masking on inputs. + Class to perform random or forecast masking. Parameters: mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. 
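The `past_observed_mask` now threaded through the model follows the convention used throughout these docstrings: 1 for observed values, 0 for missing ones, with the missing points themselves replaced by zeros. A minimal sketch of building such a mask from a series containing NaNs, using plain PyTorch and toy shapes:

```python
import torch

# toy context window: [batch_size x sequence_length x num_input_channels]
past_values = torch.randn(2, 32, 3)
past_values[0, 5, 1] = float("nan")   # pretend two points are missing
past_values[1, 20, 0] = float("nan")

# 1.0 where a value was observed, 0.0 where it was missing
past_observed_mask = (~torch.isnan(past_values)).to(past_values.dtype)

# the missing points themselves are replaced by zeros before reaching the model
past_values = torch.nan_to_num(past_values, nan=0.0)

print(past_observed_mask.shape, past_observed_mask[0, 5, 1].item())  # torch.Size([2, 32, 3]) 0.0
```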
From ed5b26be04b2e0535e752c93e4ba3b97952390eb Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 14:47:57 +0200 Subject: [PATCH 107/189] make fix-copies --- .../models/patchtst/modeling_patchtst.py | 19 ++++++------------- src/transformers/utils/dummy_pt_objects.py | 4 ++-- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 48e732bacbd5a5..625f131187deb3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -42,14 +42,7 @@ # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->PatchTST class PatchTSTAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper - - Parameters: - hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): - Input to the multi-head attention block - - """ + """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( self, @@ -78,8 +71,8 @@ def __init__( self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def _shape(self, tensor: torch.Tensor, sequence_length: int, bsz: int): - return tensor.view(bsz, sequence_length, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, @@ -1130,7 +1123,7 @@ class PatchTSTStdScaler(nn.Module): Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - Parameters: + Args: dim (`int`): Dimension along which to calculate the mean and standard deviation. keepdim (`bool`, *optional*, defaults to `False`): @@ -1164,7 +1157,7 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data accordingly. - Parameters: + Args: dim (`int`): Dimension along which to compute the scale. keepdim (`bool`, *optional*, defaults to `False`): @@ -1221,7 +1214,7 @@ class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - Parameters: + Args: dim (`int`): Dimension along which to compute the scale. 
keepdim (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index d60b81511deda0..3dbb01528a3b99 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5921,14 +5921,14 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PatchTSTForPretraining(metaclass=DummyObject): +class PatchTSTForPrediction(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PatchTSTForPrediction(metaclass=DummyObject): +class PatchTSTForPretraining(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): From ddffd717a54a8f415eca0a574ee2dadee507ffde Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 10 Oct 2023 15:37:26 +0200 Subject: [PATCH 108/189] use Args: --- .../models/patchtst/configuration_patchtst.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index f8ee3f75a9530a..89872754623825 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -37,7 +37,7 @@ class PatchTSTConfig(PretrainedConfig): Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Parameters: + Args: num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. @@ -82,7 +82,8 @@ class PatchTSTConfig(PretrainedConfig): Consider bias in the feed-forward networks. activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. - pre_norm (`bool`, *optional*, defaults to `False`): + pre_norm (`bool`, *optional*, defaults to `False`): + TODO positional_encoding (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. learn_pe (`bool`, *optional*, defaults to `False`): @@ -104,8 +105,10 @@ class PatchTSTConfig(PretrainedConfig): Masking type. Only `"random"` is currently supported. mask_ratio (`float`, *optional*, defaults to 0.5): Masking ratio is applied to mask the input data during pretraining. - mask_patches (`List`, *optional*, defaults to `[2, 3]`): - mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): + mask_patches (`List`, *optional*, defaults to `[2, 3]`): + TODO + mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): + TODO channel_consistent_masking (`bool`, *optional*, defaults to `False`): If channel consistent masking is True, all the channels will have the same masking. unmasked_channel_indices (`list`, *optional*): @@ -125,7 +128,7 @@ class PatchTSTConfig(PretrainedConfig): num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for probablistic forecast. 
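The context length, patch length and stride referenced throughout these docstrings determine how many patches the encoder sees. The sketch below works through the standard sliding-window count and checks it against `torch.Tensor.unfold`; exact configuration field names and any padding or truncation handling are left aside.

```python
import torch

context_length, patch_length, stride = 32, 8, 8   # toy settings

# sliding-window count: floor((context_length - patch_length) / stride) + 1
num_patches = (context_length - patch_length) // stride + 1
print(num_patches)  # 4

# the same count falls out of unfolding a [batch x time x channels] tensor
past_values = torch.randn(2, context_length, 3)
patches = past_values.unfold(dimension=-2, size=patch_length, step=stride)
print(patches.shape)  # torch.Size([2, 4, 3, 8]) -> [batch x num_patches x channels x patch_length]
```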
- Example: + Example: ```python >>> from transformers import PatchTSTConfig, PatchTSTModel @@ -139,7 +142,6 @@ class PatchTSTConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "patchtst" attribute_map = { "hidden_size": "d_model", From ccdd0130ded6517e6338934274e37fb4d3c957be Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 10 Oct 2023 18:39:06 -0400 Subject: [PATCH 109/189] add docstring --- .../models/patchtst/modeling_patchtst.py | 88 +++++++++++-------- 1 file changed, 49 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 625f131187deb3..1b2ffab6523ab4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -329,29 +329,29 @@ def forecast_masking( Parameters: inputs (`torch.Tensor`): - Input to mask [ bs x num_channels x num_patch x patch_len] or [ bs x tsg1 x tag2 x num_channels x num_patch - x patch_len] - patch_lengths (list): + Input of shape `(bs, num_channels, num_patch, patch_len)` + or `(bs, tsg1, tag2, num_channels, num_patch, patch_len)` + patch_lengths (`list`): List of patch lengths to mask in the end of the data. - mix_ratio (list, *optional* defaults to None): + mix_ratio (`list`, *optional*): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, *optional* defaults to None): + unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked. Defaults to None. - mask_value (int, *optional* defaults to 0): + mask_value (`int`, *optional* defaults to 0): Value to use for masking. Defaults to 0. - seed_number (int, *optional*): + seed_number (`int`, *optional*): Value to set for the random seed. Returns: - `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape [bs x c - x n] or [bs x tsg1 x tsg2 x c x n] + `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs, num_channels + , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)` """ if seed_number: set_seed(seed_number) if mix_ratio is None: - mix_ratio = [1 for t in patch_lengths] + mix_ratio = [1 for _ in patch_lengths] batch_size, num_channels, sequence_length, num_features = inputs.shape mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device) @@ -396,9 +396,9 @@ class PatchTSTPatchify(nn.Module): A class to patchify the time series sequence into different patches Parameters: - sequence_length (int, required): input sequence length. - patch_length (int, required): patch length. - stride (int, required): stride between patches. + sequence_length (`int`, *required*): input sequence length. + patch_length (`int`, *required*): patch length. + stride (`int`, *required*): stride between patches. 
Returns: `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` @@ -409,7 +409,6 @@ def __init__( sequence_length: int, patch_length: int, stride: int, - padding: bool = False, # TODO: use this to set whether we want to pad zeros to the sequence ): super().__init__() @@ -431,6 +430,7 @@ def forward(self, past_values: torch.Tensor): """ Parameters: past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*): + Input to be patchified Returns: `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` @@ -441,12 +441,12 @@ def forward(self, past_values: torch.Tensor): f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." ) - x = past_values[:, self.s_begin :, :] # x: [bs x new_sequence_length x num_channels] - x = x.unfold( + output = past_values[:, self.s_begin :, :] # output: [bs x new_sequence_length x num_channels] + output = output.unfold( dimension=-2, size=self.patch_length, step=self.stride - ) # x: [bs x num_patches x num_input_channels x patch_length] - x = x.transpose(-2, -3).contiguous() # x: [bs x num_input_channels x num_patches x patch_length] - return x + ) # output: [bs x num_patches x num_input_channels x patch_length] + output = output.transpose(-2, -3).contiguous() # output: [bs x num_input_channels x num_patches x patch_length] + return output class PatchTSTMasking(nn.Module): @@ -454,18 +454,18 @@ class PatchTSTMasking(nn.Module): Class to perform random or forecast masking. Parameters: - mask_type (str, optional): Masking type. Allowed values are random, forecast. Defaults to random. - mask_ratio (float, optional): Mask ratio. - mask_patches (list, optional): List of patch lengths to mask in the end of the data. - mask_patch_ratios (list, optional): List of weights to use for each patch length. For Ex. + mask_type (`str`, *optional*): Masking type. Allowed values are random, forecast. Defaults to random. + mask_ratio (`float`, *optional*): Mask ratio. + mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. + mask_patch_ratios (`list`, *optional*): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (list, optional): - Control Variable channel indices. These channels will not be masked. Defaults to None. - channel_consistent_masking (bool, optional): + unmasked_channel_indices (`list`, *optional*): + Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. + channel_consistent_masking (`bool`, *optional*): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary across channels. Defaults to True. - mask_value (int, optional): Value to use for masking. Defaults to 0. - seed_number (int, optional): Random seed, when None seed is not set. Defaults to None. + mask_value (`int`, *optional*): Value to use for masking. Defaults to 0. + seed_number (`int`, *optional*): Random seed, when None seed is not set. Defaults to None. 
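Random masking as described for `PatchTSTMasking` keeps a fraction of patches per channel and overwrites the rest with a mask value. Below is a simplified sketch in the spirit of the `random_masking` helper, ranking random noise per channel; channel-consistent masking, unmasked channel indices and seeding are omitted, and the actual helper may differ in detail.

```python
import torch

# patched input: [batch_size x num_channels x num_patches x patch_length]
patch_input = torch.randn(2, 3, 10, 8)
mask_ratio, mask_value = 0.4, 0.0

bs, num_channels, num_patches, _ = patch_input.shape
len_keep = int(num_patches * (1 - mask_ratio))

# rank random noise per channel: the lowest-noise patches are kept, the rest are masked
noise = torch.rand(bs, num_channels, num_patches)
ids_shuffle = torch.argsort(noise, dim=-1)
ids_restore = torch.argsort(ids_shuffle, dim=-1)

mask = torch.ones(bs, num_channels, num_patches)
mask[:, :, :len_keep] = 0
mask = torch.gather(mask, dim=-1, index=ids_restore)   # 1 = masked, 0 = kept

masked_input = patch_input.masked_fill(mask.unsqueeze(-1).bool(), mask_value)
print(mask.mean().item())  # 0.4 of the patches are masked
```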
Returns: x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) @@ -478,12 +478,12 @@ class PatchTSTMasking(nn.Module): def __init__( self, mask_type: str = "random", - mask_ratio=0.5, + mask_ratio: float = 0.5, mask_patches: list = [2, 3], mask_patch_ratios: list = [1, 1], channel_consistent_masking: bool = False, unmasked_channel_indices: list = None, - mask_value=0, + mask_value: int = 0, seed_number: Optional[int] = None, ): self.mask_ratio = mask_ratio @@ -503,7 +503,7 @@ def forward(self, x: torch.Tensor): """ Parameters: x (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): - Patched input + Patch input Return: x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) @@ -557,7 +557,9 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo output_hidden_states (`bool`, *optional*): output hidden state option Return: - `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` + hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`) + + all_hidden_states (*optional*, returned when `output_hidden_states` is set to True, tuple of `torch.Tensor` of shapes `(batch_size, num_channels, sequence_length, d_model)`) """ all_hidden_states = [] @@ -1317,22 +1319,30 @@ def forward( class MaskPretrainHead(nn.Module): + """ + Pretraining head for mask modelling + """ def __init__(self, config): super().__init__() self.dropout = nn.Dropout(config.dropout) self.linear = nn.Linear(config.d_model, config.patch_length) self.use_cls_token = config.use_cls_token - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, embedding: torch.Tensor) -> torch.Tensor: """ - x: tensor [bs x num_channels x num_patches x d_model] - or [bs x num_channels x (num_patches+1) x d_model] if use cls_token - output: tensor [bs x num_channels x num_patches x patch_length] + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or + `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True + """ - x = self.linear(self.dropout(x)) # [bs x num_channels x num_patches x patch_length] + embedding = self.linear(self.dropout(embedding)) # [bs x num_channels x num_patches x patch_length] if self.use_cls_token: - x = x[:, :, 1:, :] # remove the first cls token - return x + embedding = embedding[:, :, 1:, :] # remove the first cls token + return embedding class PatchTSTForPretraining(PatchTSTPreTrainedModel): From c993a50f1f2dc0de7575720a70920b20061e233f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 10 Oct 2023 23:11:33 -0400 Subject: [PATCH 110/189] add docstring --- .../models/patchtst/modeling_patchtst.py | 161 +++++++++++++++--- 1 file changed, 136 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 1b2ffab6523ab4..f9b644ad6dd148 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1332,8 +1332,8 @@ def forward(self, embedding: torch.Tensor) -> torch.Tensor: """ Parameters: embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` - or `(bs, num_channels, num_patches+1, d_model)` 
if `cls_token` is set to True - Embedding from the model + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model Returns: `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True @@ -1346,7 +1346,9 @@ def forward(self, embedding: torch.Tensor) -> torch.Tensor: class PatchTSTForPretraining(PatchTSTPreTrainedModel): - # PatchTSTModel + Pretraining Head + """ + Mask pretrain model: PatchTST model + pretrain head + """ def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1362,12 +1364,25 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ - past_values (x): tensor [bs x sequence_length x num_input_channels ] future_values (y): labels + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. + + Returns: + `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1395,7 +1410,9 @@ def forward( class PatchTSTForClassification(PatchTSTPreTrainedModel): - # PatchTST model + classification head + """ + PatchTST model for classification. The model contains PatchTST model + classification head + """ def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1414,6 +1431,24 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + labels (`torch.Tensor`, *optional*): labels associates with the `past_values` + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
+ + Returns: + `PatchTSTForClassificationOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1435,6 +1470,9 @@ def forward( class ClassificationHead(nn.Module): + """ + Classification head + """ def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token @@ -1445,9 +1483,13 @@ def __init__(self, config: PatchTSTConfig): def forward(self, embedding: torch.Tensor): """ - embedding: [bs x num_channels x num_patches x d_model] - or [bs x num_channels x (num_patches+1) x d_model] if use cls_token output: - [bs x n_classes] + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, num_labels)` + """ if self.use_cls_token: x = embedding[:, :, 0, :] # use the first output token, x: bs x num_channels x d_model @@ -1483,9 +1525,13 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): def forward(self, embedding: torch.Tensor): """ - embedding: [bs x num_channels x num_patch x d_model] - or [bs x num_channels x (num_patch+1) x d_model] if use cls_token - output: [bs x pred_len x num_output_channels] + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, pred_len, num_output_channels)` + """ batch_size = embedding.shape[0] if self.use_cls_token: @@ -1512,9 +1558,9 @@ def forward(self, embedding: torch.Tensor): class PatchTSTForPrediction(PatchTSTPreTrainedModel): - """ """ - - # PatchTST model + prediction head + """ + PatchTST model for prediction. The model contains PatchTST model + prediction head + """ def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1548,6 +1594,25 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPredictionOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): + future target values associates with the `past_values` + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
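All of these heads share the skeleton documented above: pool the encoder output over the patch dimension (cls token, mean or max), flatten channels and features, then project. A standalone sketch of that pipeline with mean pooling and toy dimensions follows; the dropout placement and layer composition are assumptions rather than the exact module layout.

```python
import torch
import torch.nn as nn

# encoder output: [batch_size x num_channels x num_patches x d_model]
embedding = torch.randn(2, 3, 10, 16)
bs, num_channels, num_patches, d_model = embedding.shape
prediction_length, num_output_channels = 24, 3

pooled = embedding.mean(dim=2)                       # [bs x num_channels x d_model]

head = nn.Sequential(
    nn.Flatten(start_dim=1),                         # [bs x (num_channels * d_model)]
    nn.Dropout(0.1),
    nn.Linear(num_channels * d_model, prediction_length * num_output_channels),
)
output = head(pooled).reshape(bs, -1, num_output_channels)
print(output.shape)  # torch.Size([2, 24, 3]) -> [bs x prediction_length x num_output_channels]
```

A classification or regression head only changes the size of the final projection (number of labels or output dimensions) and skips the reshape to a prediction horizon.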
+ + Returns: + `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1658,11 +1723,14 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): def forward(self, embedding: torch.Tensor): """ - embedding: [bs x num_channels x num_patches x d_model] - or [bs x num_channels x (num_patches+1) x d_model] if use cls_token - output: [bs x forecast_len x num_channels] - """ + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, forecast_len, num_channels)` + """ if self.use_cls_token: y = embedding[:, :, 0, :] # y: [bs x num_channels x d_model] else: @@ -1701,10 +1769,8 @@ def forward(self, embedding: torch.Tensor): class PatchTSTForForecasting(PatchTSTPreTrainedModel): """ - PatchTST for forecasting + PatchTST for forecasting. The model contains PatchTST model + Forecasting head """ - - # PatchTST model + Forecasting head def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) @@ -1736,6 +1802,25 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForForecastingOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): + future target values associates with the `past_values` + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
+ + Returns: + `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1828,6 +1913,9 @@ def generate( class RegressionHead(nn.Module): + """ + Regression head + """ def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.y_range = config.prediction_range @@ -1847,9 +1935,13 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): def forward(self, embedding: torch.Tensor): """ - embedding: [bs x num_channels x num_patch x d_model] - or [bs x num_channels x (num_patch+1) x d_model] if use cls_token - output: [bs x output_dim] + Parameters: + embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` + or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): + Embedding from the model + Returns: + `torch.Tensor` of shape `(bs, output_dim)` + """ if self.use_cls_token: x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] @@ -1906,6 +1998,25 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForRegressionOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): + target labels associates with the `past_values` + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
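With `loss="nll"` and a distribution head such as `"student_t"`, training minimizes the negative log-likelihood of the targets under the emitted distribution, in contrast to the `"mse"` point-estimate loss. A toy sketch of that loss computation; the parameter shapes and values are placeholders for what the head would actually emit.

```python
import torch
from torch.distributions import StudentT

# toy targets: [batch_size x num_targets]
targets = torch.randn(2, 3)

# pretend the distribution head emitted these Student-T parameters for the targets
df = torch.full_like(targets, 3.0)
loc = torch.zeros_like(targets)
scale = torch.ones_like(targets)
distribution = StudentT(df=df, loc=loc, scale=scale)

# "nll" loss: average negative log-likelihood of the observed targets
nll = -distribution.log_prob(targets).mean()
print(nll.item())
```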
+ + Returns: + `PatchTSTForRegressionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + + """ output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) From ddc7521d81949f8b26f3e3af9b56cd7ae602a59e Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 10 Oct 2023 23:59:15 -0400 Subject: [PATCH 111/189] change some variable names and add PatchTST before some class names --- .../models/patchtst/modeling_patchtst.py | 79 ++++++++++--------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f9b644ad6dd148..089837326d2fd0 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -224,32 +224,31 @@ def positional_encoding(position_embedding_type, learned, q_len, d_model): # Positional encoding if position_embedding_type is None: # position_embedding_type = None and learned = False can be used to measure impact of positional encoding - w_pos = torch.empty((q_len, d_model)) - nn.init.uniform_(w_pos, -0.02, 0.02) + position_enc = torch.empty((q_len, d_model)) + nn.init.uniform_(position_enc, -0.02, 0.02) learned = False elif position_embedding_type == "zeros": - w_pos = torch.empty((q_len, d_model)) - nn.init.uniform_(w_pos, -0.02, 0.02) + position_enc = torch.empty((q_len, d_model)) + nn.init.uniform_(position_enc, -0.02, 0.02) elif position_embedding_type == "normal": - w_pos = torch.zeros((q_len, 1)) - torch.nn.init.normal_(w_pos, mean=0.0, std=0.1) + position_enc = torch.zeros((q_len, 1)) + torch.nn.init.normal_(position_enc, mean=0.0, std=0.1) elif position_embedding_type == "uniform": - w_pos = torch.zeros((q_len, 1)) - nn.init.uniform_(w_pos, a=0.0, b=0.1) + position_enc = torch.zeros((q_len, 1)) + nn.init.uniform_(position_enc, a=0.0, b=0.1) elif position_embedding_type == "sincos": - pos_enc = torch.zeros(q_len, d_model) + position_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) - pos_enc[:, 0::2] = torch.sin(position * div_term) - pos_enc[:, 1::2] = torch.cos(position * div_term) - pos_enc = pos_enc - pos_enc.mean() - pos_enc = pos_enc / (pos_enc.std() * 10) - w_pos = pos_enc + position_enc[:, 0::2] = torch.sin(position * div_term) + position_enc[:, 1::2] = torch.cos(position * div_term) + position_enc = position_enc - position_enc.mean() + position_enc = position_enc / (position_enc.std() * 10) else: raise ValueError( f"{position_embedding_type} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." 
) - return nn.Parameter(w_pos, requires_grad=learned) + return nn.Parameter(position_enc, requires_grad=learned) def random_masking( @@ -628,19 +627,19 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, src: torch.Tensor): + def forward(self, hidden_state: torch.Tensor): """ Parameters: - src (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): + hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): Past values of the time series Return: `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ - batch_size, num_input_channels, sequence_length, d_model = src.shape + batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape # First sublayer: attention across time - src = src.view( + src = hidden_state.view( batch_size * num_input_channels, sequence_length, d_model ) # src: [(bs*num_channels) x sequence_length x d_model] if self.pre_norm: @@ -723,6 +722,9 @@ def _set_gradient_checkpointing(self, module, value=False): class PatchTSTEncoder(PatchTSTPreTrainedModel): + """ + PatchTST Encoder + """ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -735,20 +737,20 @@ def __init__(self, config: PatchTSTConfig): # Input encoding: projection of feature vectors onto a d-dim vector space if not config.shared_embedding: - self.w_p = nn.ModuleList() + self.input_embedding = nn.ModuleList() for _ in range(self.num_input_channels): - self.w_p.append(nn.Linear(config.patch_length, config.d_model)) + self.input_embedding.append(nn.Linear(config.patch_length, config.d_model)) else: - self.w_p = nn.Linear(config.patch_length, config.d_model) + self.input_embedding = nn.Linear(config.patch_length, config.d_model) # Positional encoding if config.use_cls_token: self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.w_pos = positional_encoding( + self.position_enc = positional_encoding( config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model ) else: - self.w_pos = positional_encoding( + self.position_enc = positional_encoding( config.positional_encoding, config.learn_pe, config.num_patches, config.d_model ) @@ -773,8 +775,7 @@ def forward( output_hidden_states (bool, optional): Indicates if hidden states should be output. 
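The `"sincos"` branch of the positional encoding helper above builds the classic sinusoidal table, then mean-centers it and scales it down before it is added to the patch embeddings. A self-contained sketch of the same recipe with toy sizes:

```python
import math

import torch

num_patches, d_model = 12, 16   # toy sizes

position_enc = torch.zeros(num_patches, d_model)
position = torch.arange(0, num_patches).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
position_enc[:, 0::2] = torch.sin(position * div_term)
position_enc[:, 1::2] = torch.cos(position * div_term)
position_enc = (position_enc - position_enc.mean()) / (position_enc.std() * 10)

# added to the patch embeddings, broadcasting over batch and channel dimensions
patch_embeddings = torch.randn(2, 3, num_patches, d_model)   # [bs x channels x patches x d_model]
hidden_state = patch_embeddings + position_enc
print(hidden_state.shape)  # torch.Size([2, 3, 12, 16])
```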
return: - `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` or `(batch_size, num_channels, - num_patches+1, d_model)` if cls_token is used + `BaseModelOutputWithNoAttention` """ _, num_input_channels, _, _ = past_values.shape @@ -785,24 +786,24 @@ def forward( if not self.shared_embedding: x_out = [] for i in range(num_input_channels): - z = self.w_p[i](past_values[:, i, :, :]) + z = self.input_embedding[i](past_values[:, i, :, :]) x_out.append(z) past_values = torch.stack(x_out, dim=1) else: - past_values = self.w_p(past_values) # x: [bs x num_channels x num_patches x d_model] + past_values = self.input_embedding(past_values) # x: [bs x num_channels x num_patches x d_model] if self.use_cls_token: # x: [bs x num_channels x num_patches x d_model] - past_values = self.positional_dropout(past_values + self.w_pos[1:, :]) + past_values = self.positional_dropout(past_values + self.position_enc[1:, :]) # append cls token - cls_token = self.cls_token + self.w_pos[:1, :] # cls_token: [1 x 1 x 1 x d_model] + cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples past_values = torch.cat( (cls_tokens, past_values), dim=1 ) # x: [bs x num_channels x (num_patches+1) x d_model] else: past_values = self.positional_dropout( - past_values + self.w_pos + past_values + self.position_enc ) # x: [bs x num_channels x num_patches x d_model] # Encoder @@ -1417,7 +1418,7 @@ def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) - self.head = ClassificationHead(config) + self.head = PatchTSTClassificationHead(config) self.loss = nn.CrossEntropyLoss() # Initialize weights and apply final processing @@ -1469,7 +1470,7 @@ def forward( return PatchTSTForClassificationOutput(loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states) -class ClassificationHead(nn.Module): +class PatchTSTClassificationHead(nn.Module): """ Classification head """ @@ -1505,7 +1506,7 @@ def forward(self, embedding: torch.Tensor): return y -class PredictionHead(nn.Module): +class PatchTSTPredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1581,7 +1582,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = PredictionHead(config, self.distribution_output) + self.head = PatchTSTPredictionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1686,7 +1687,7 @@ def generate( return SamplePatchTSTPredictionOutput(sequences=samples) -class ForecastHead(nn.Module): +class PatchTSTForecastHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1789,7 +1790,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = ForecastHead(config, self.distribution_output) + self.head = PatchTSTForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1912,7 +1913,7 @@ def generate( return SamplePatchTSTForecastOutput(sequences=samples) -class RegressionHead(nn.Module): +class PatchTSTRegressionHead(nn.Module): """ Regression head """ @@ -1985,7 +1986,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown 
distribution output {config.distribution_output}") - self.head = RegressionHead(config, self.distribution_output) + self.head = PatchTSTRegressionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() From 0d7d92d5bd2f819b2ec27e6a0be2455814bae8b0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 11 Oct 2023 09:28:57 +0200 Subject: [PATCH 112/189] formatting --- .../models/patchtst/modeling_patchtst.py | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 089837326d2fd0..3b36c0a4850160 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -328,8 +328,8 @@ def forecast_masking( Parameters: inputs (`torch.Tensor`): - Input of shape `(bs, num_channels, num_patch, patch_len)` - or `(bs, tsg1, tag2, num_channels, num_patch, patch_len)` + Input of shape `(bs, num_channels, num_patch, patch_len)` or `(bs, tsg1, tag2, num_channels, num_patch, + patch_len)` patch_lengths (`list`): List of patch lengths to mask in the end of the data. mix_ratio (`list`, *optional*): @@ -343,8 +343,8 @@ def forecast_masking( Value to set for the random seed. Returns: - `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs, num_channels - , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)` + `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs, + num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)` """ if seed_number: set_seed(seed_number) @@ -444,7 +444,9 @@ def forward(self, past_values: torch.Tensor): output = output.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # output: [bs x num_patches x num_input_channels x patch_length] - output = output.transpose(-2, -3).contiguous() # output: [bs x num_input_channels x num_patches x patch_length] + output = output.transpose( + -2, -3 + ).contiguous() # output: [bs x num_input_channels x num_patches x patch_length] return output @@ -558,7 +560,8 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo Return: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`) - all_hidden_states (*optional*, returned when `output_hidden_states` is set to True, tuple of `torch.Tensor` of shapes `(batch_size, num_channels, sequence_length, d_model)`) + all_hidden_states (*optional*, returned when `output_hidden_states` is set to True, tuple of `torch.Tensor` + of shapes `(batch_size, num_channels, sequence_length, d_model)`) """ all_hidden_states = [] @@ -725,6 +728,7 @@ class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -1323,6 +1327,7 @@ class MaskPretrainHead(nn.Module): """ Pretraining head for mask modelling """ + def __init__(self, config): super().__init__() self.dropout = nn.Dropout(config.dropout) @@ -1350,6 +1355,7 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel): """ Mask pretrain model: PatchTST model + pretrain head """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1382,7 +1388,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: - `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( @@ -1414,6 +1421,7 @@ class PatchTSTForClassification(PatchTSTPreTrainedModel): """ PatchTST model for classification. The model contains PatchTST model + classification head """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1447,7 +1455,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: - `PatchTSTForClassificationOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForClassificationOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( @@ -1474,6 +1483,7 @@ class PatchTSTClassificationHead(nn.Module): """ Classification head """ + def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token @@ -1562,6 +1572,7 @@ class PatchTSTForPrediction(PatchTSTPreTrainedModel): """ PatchTST model for prediction. The model contains PatchTST model + prediction head """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) @@ -1611,7 +1622,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: - `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( @@ -1772,6 +1784,7 @@ class PatchTSTForForecasting(PatchTSTPreTrainedModel): """ PatchTST for forecasting. The model contains PatchTST model + Forecasting head """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) self.model = PatchTSTModel(config) @@ -1819,7 +1832,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: - `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( @@ -1917,6 +1931,7 @@ class PatchTSTRegressionHead(nn.Module): """ Regression head """ + def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.y_range = config.prediction_range @@ -2015,7 +2030,8 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: - `PatchTSTForRegressionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) + `PatchTSTForRegressionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) """ output_hidden_states = ( From 23819947610e5c3b479985babc4da73da70cf8b9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 11 Oct 2023 09:47:02 +0200 Subject: [PATCH 113/189] fix argument types --- .../models/patchtst/configuration_patchtst.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 89872754623825..25839adde23c7c 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -41,12 +41,12 @@ class PatchTSTConfig(PretrainedConfig): num_input_channels (`int`, *optional*, defaults to 1): The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of multivariate targets. - context_length (`int`, defaults to 32, *optional*, defaults to 32): + context_length (`int`, *optional*, defaults to 32): The context length for the encoder. - distribution_output (`string`, *optional*, defaults to `"student_t"`): + distribution_output (`str`, *optional*, defaults to `"student_t"`): The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or "negative_binomial". - loss (`string`, *optional*, defaults to `"mse"`): + loss (`str`, *optional*, defaults to `"mse"`): The loss function for the model corresponding to the `distribution_output` head. For parametric distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". @@ -94,9 +94,9 @@ class PatchTSTConfig(PretrainedConfig): The standard deviation of the truncated normal weight initialization distribution. shared_projection (`bool`, *optional*, defaults to `True`): Sharing the projection layer across different channels in the forecast head. - seed_number (`int`, *optional*): + seed_number (`Optional`, *optional*): Use seed number for random masking. - scaling (`string` or `bool`, *optional*, defaults to `"mean"`): + scaling (`Union`, *optional*, defaults to `"mean"`): Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the scaler is set to "mean". mask_input (`bool`, *optional*, defaults to False): @@ -128,7 +128,6 @@ class PatchTSTConfig(PretrainedConfig): num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for probablistic forecast. 
- Example: ```python >>> from transformers import PatchTSTConfig, PatchTSTModel From e79f0fd4f6e8acae249975cfca602017448aaf01 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 11 Oct 2023 12:07:38 +0200 Subject: [PATCH 114/189] fix tests --- .../models/patchtst/configuration_patchtst.py | 2 +- tests/models/patchtst/test_modeling_patchtst.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 25839adde23c7c..27f762459cd52b 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -46,7 +46,7 @@ class PatchTSTConfig(PretrainedConfig): distribution_output (`str`, *optional*, defaults to `"student_t"`): The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or "negative_binomial". - loss (`str`, *optional*, defaults to `"mse"`): + loss (`str`, *optional*, defaults to `"mse"`): The loss function for the model corresponding to the `distribution_output` head. For parametric distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index d25cc525326ab5..92bc76c375e0bd 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -188,8 +188,11 @@ def test_config(self): def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - # if classification model: - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): + # if PatchTSTForPretraining + if model_class == PatchTSTForPretraining: + inputs_dict.pop("future_values") + # else if classification model: + elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): rng = random.Random(self.model_tester.seed_number) labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_labels, rng=rng) inputs_dict["labels"] = labels @@ -272,7 +275,9 @@ def test_forward_signature(self): "past_observed_mask", "future_values", ] - if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( + if model_class == PatchTSTForPretraining: + expected_arg_names.remove("future_values") + elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING ): expected_arg_names.remove("future_values") @@ -282,6 +287,7 @@ def test_forward_signature(self): expected_arg_names.extend( [ "output_hidden_states", + "return_dict", ] ) From b61bec0ce64b3456dd723765bb1f77f8261cb358 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 11 Oct 2023 12:27:04 -0400 Subject: [PATCH 115/189] change x variable to patch_input --- .../models/patchtst/modeling_patchtst.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3b36c0a4850160..426090bd4a73de 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -500,14 +500,14 @@ def __init__( super().__init__() - def forward(self, x: 
torch.Tensor): + def forward(self, patch_input: torch.Tensor): """ Parameters: - x (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): + patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Patch input Return: - x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) + masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) Masked patched input mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`) Bool tensor indicating True on masked points @@ -515,8 +515,8 @@ def forward(self, x: torch.Tensor): """ if self.mask_type == "random": - x_mask, mask = random_masking( - inputs=x, + masked_input, mask = random_masking( + inputs=patch_input, mask_ratio=self.mask_ratio, unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, @@ -524,8 +524,8 @@ def forward(self, x: torch.Tensor): seed_number=self.seed_number, ) elif self.mask_type == "forecast": - x_mask, mask = forecast_masking( - inputs=x, + masked_input, mask = forecast_masking( + inputs=patch_input, patch_lengths=self.mask_patches, mix_ratio=self.mask_patch_ratios, unmasked_channel_indices=self.unmasked_channel_indices, @@ -537,7 +537,7 @@ def forward(self, x: torch.Tensor): mask = mask.bool() # mask: [bs x num_input_channels x num_patch] - return x_mask, mask + return masked_input, mask class PatchTSTEncoderBlock(nn.Module): From e9088621b9bafcec5615463296e7362478b4079f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 11 Oct 2023 17:46:08 -0400 Subject: [PATCH 116/189] format --- .../models/patchtst/modeling_patchtst.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 426090bd4a73de..0db1ae5768b933 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -550,7 +550,10 @@ def __init__(self, config: PatchTSTConfig): self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): + def forward(self, + hidden_state: torch.Tensor, + output_hidden_states: Optional[bool] = None + ): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -770,7 +773,9 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + self, + past_values: torch.Tensor, + output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ Parameters: @@ -1148,7 +1153,10 @@ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5) self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, + weights: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator @@ -1185,9 +1193,10 @@ def __init__( self.default_scale = default_scale @torch.no_grad() - def 
forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, + observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # shape: (N, [C], T=1) ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -1233,9 +1242,10 @@ def __init__(self, dim: int, keepdim: bool = False): self.dim = dim self.keepdim = keepdim - def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, + observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale From 25e669bcb1356d47558ec0ca75326b145323066f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 12 Oct 2023 12:33:08 +0200 Subject: [PATCH 117/189] formatting --- .../models/patchtst/modeling_patchtst.py | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 0db1ae5768b933..426090bd4a73de 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -550,10 +550,7 @@ def __init__(self, config: PatchTSTConfig): self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, - hidden_state: torch.Tensor, - output_hidden_states: Optional[bool] = None - ): + def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -773,9 +770,7 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, - past_values: torch.Tensor, - output_hidden_states: Optional[bool] = None + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ Parameters: @@ -1153,10 +1148,7 @@ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5) self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, - data: torch.Tensor, - weights: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator @@ -1193,10 +1185,9 @@ def __init__( self.default_scale = default_scale @torch.no_grad() - def forward(self, - data: torch.Tensor, - observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # shape: (N, [C], T=1) ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -1242,10 +1233,9 @@ def __init__(self, dim: int, keepdim: bool = False): self.dim = dim self.keepdim = keepdim - 
def forward(self, - data: torch.Tensor, - observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale From b9c01ff2de9e6a012b4d68ab7248c5cb9f465d37 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 18 Oct 2023 13:39:46 +0200 Subject: [PATCH 118/189] fix-copies --- README.md | 2 +- README_es.md | 2 +- README_hd.md | 2 +- README_ja.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.md | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ff96020aa400c1..f4da5aff298803 100644 --- a/README.md +++ b/README.md @@ -434,8 +434,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/README_es.md b/README_es.md index a78d20a42996a4..a20cb25f308e82 100644 --- a/README_es.md +++ b/README_es.md @@ -409,8 +409,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 
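The README rows being synced in this fix-copies pass point at the `PatchTSTConfig` and `PatchTSTModel` classes introduced earlier in the series. A minimal usage sketch of that pair, assuming config arguments named `num_input_channels`, `context_length` and `patch_length` and a `(batch_size, sequence_length, num_input_channels)` input layout — none of which are spelled out in the hunks shown here, so the names and sizes are illustrative only:

```python
import torch

from transformers import PatchTSTConfig, PatchTSTModel

# Assumed argument names and sizes, for illustration only.
config = PatchTSTConfig(num_input_channels=7, context_length=512, patch_length=16)
model = PatchTSTModel(config)

# Assumed input layout: (batch_size, sequence_length, num_input_channels)
past_values = torch.randn(4, 512, 7)

outputs = model(past_values=past_values, output_hidden_states=True)
# the first element is the encoder output, accessed as `model_output[0]` by the heads in this PR
print(outputs[0].shape)
```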
diff --git a/README_hd.md b/README_hd.md index 972a5e4c6eeeef..dee06e1a0ece3a 100644 --- a/README_hd.md +++ b/README_hd.md @@ -381,8 +381,8 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया। -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM से) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. द्वाराअनुसंधान पत्र [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) के साथ जारी किया गया 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (Google AI से) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. द्वाराअनुसंधान पत्र [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) के साथ जारी किया गया +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM से) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. द्वाराअनुसंधान पत्र [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) के साथ जारी किया गया 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा। 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index 1e77601d0fc7b3..dc259decdacb4e 100644 --- a/README_ja.md +++ b/README_ja.md @@ -443,8 +443,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. 
**[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM から) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. から公開された研究論文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. から公開された研究論文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM から) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. から公開された研究論文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) diff --git a/README_ko.md b/README_ko.md index 10162d8db9c582..71db05859b84e8 100644 --- a/README_ko.md +++ b/README_ko.md @@ -358,8 +358,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다. 1. 
**[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM 에서 제공)은 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.의 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf)논문과 함께 발표했습니다. 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (Google AI 에서 제공)은 Matthias Minderer, Alexey Gritsenko, Neil Houlsby.의 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)논문과 함께 발표했습니다. +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM 에서 제공)은 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.의 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf)논문과 함께 발표했습니다. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index 0c02049371dad5..380dacfde1fcf0 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -382,8 +382,8 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布. 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。 -1. 
**[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (来自 IBM) 伴随论文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 由 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam 发布。 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (来自 Google AI) 伴随论文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) 由 Matthias Minderer, Alexey Gritsenko, Neil Houlsby 发布。 +1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (来自 IBM) 伴随论文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 由 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam 发布。 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 60c72b34ed5e54..e7cec87615d14b 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -394,8 +394,8 @@ conda install -c huggingface transformers 1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. -1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby. +1. 
**[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ebc740a7cc9466..dc2204b94f22c8 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -209,8 +209,8 @@ Flax), PyTorch, and/or TensorFlow. | [OpenLlama](model_doc/open-llama) | ✅ | ❌ | ❌ | | [OPT](model_doc/opt) | ✅ | ✅ | ✅ | | [OWL-ViT](model_doc/owlvit) | ✅ | ❌ | ❌ | -| [PatchTST](model_doc/patchtst) | ✅ | ❌ | ❌ | | [OWLv2](model_doc/owlv2) | ✅ | ❌ | ❌ | +| [PatchTST](model_doc/patchtst) | ✅ | ❌ | ❌ | | [Pegasus](model_doc/pegasus) | ✅ | ✅ | ✅ | | [PEGASUS-X](model_doc/pegasus_x) | ✅ | ❌ | ❌ | | [Perceiver](model_doc/perceiver) | ✅ | ❌ | ❌ | From 9955e4fea795abf43d6cb18ab5955a76722ad46e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:45:31 +0200 Subject: [PATCH 119/189] Update tests/models/patchtst/test_modeling_patchtst.py Co-authored-by: Patrick von Platen --- tests/models/patchtst/test_modeling_patchtst.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 92bc76c375e0bd..eaa1033cc0fc1a 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -214,7 +214,6 @@ def test_save_load_strict(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - # def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) From 5dbe6193f86974fb197f5b6afab5031c3f0c40f0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:53:30 +0200 Subject: [PATCH 120/189] move loss to forward --- .../models/patchtst/modeling_patchtst.py | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 426090bd4a73de..36a5bc8e768ad8 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -232,7 +232,7 @@ def positional_encoding(position_embedding_type, learned, q_len, 
d_model): nn.init.uniform_(position_enc, -0.02, 0.02) elif position_embedding_type == "normal": position_enc = torch.zeros((q_len, 1)) - torch.nn.init.normal_(position_enc, mean=0.0, std=0.1) + nn.init.normal_(position_enc, mean=0.0, std=0.1) elif position_embedding_type == "uniform": position_enc = torch.zeros((q_len, 1)) nn.init.uniform_(position_enc, a=0.0, b=0.1) @@ -710,7 +710,7 @@ class PatchTSTPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize weights""" if self.config.use_cls_token: - torch.nn.init.normal_(self.config.cls_token, std=0.02) + nn.init.normal_(self.config.cls_token, std=0.02) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @@ -1362,7 +1362,6 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) self.head = MaskPretrainHead(config) - self.loss = torch.nn.MSELoss(reduction="none") # Initialize weights and apply final processing self.post_init() @@ -1408,7 +1407,8 @@ def forward( x_hat = self.head(model_output[0]) # calculate masked_loss - loss_val = self.loss(x_hat, model_output.patched_input) + loss = nn.MSELoss(reduction="none") + loss_val = loss(x_hat, model_output.patched_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) encoder_states = model_output.hidden_states @@ -1427,7 +1427,6 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) self.head = PatchTSTClassificationHead(config) - self.loss = nn.CrossEntropyLoss() # Initialize weights and apply final processing self.post_init() @@ -1471,7 +1470,8 @@ def forward( loss_val = None if labels is not None: - loss_val = self.loss(y_hat, labels) + loss = nn.CrossEntropyLoss() + loss_val = loss(y_hat, labels) encoder_states = model_output.hidden_states if not return_dict: @@ -1578,10 +1578,8 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) if config.loss == "mse": - self.loss = nn.MSELoss(reduction="mean") self.distribution_output = None else: - self.loss = nll if config.distribution_output == "student_t": self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "normal": @@ -1643,11 +1641,12 @@ def forward( if future_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = self.loss(distribution, future_values) + loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) else: - loss_val = self.loss(y_hat, future_values) + loss = nn.MSELoss(reduction="mean") + loss_val = loss(y_hat, future_values) encoder_states = model_output.hidden_states if not return_dict: @@ -1790,10 +1789,8 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) if config.loss == "mse": - self.loss = nn.MSELoss(reduction="mean") self.distribution_output = None else: - self.loss = nll if config.distribution_output == "student_t": self.distribution_output = StudentTOutput(dim=config.prediction_length) elif config.distribution_output == "normal": @@ -1855,7 +1852,7 @@ def forward( distribution = self.distribution_output.distribution( y_hat, loc=model_output.loc, scale=model_output.scale ) - loss_val = self.loss(distribution, future_values) + loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) # for testing @@ -1863,7 +1860,8 @@ def 
forward( # loss_val = weighted_average(loss_val) else: y_hat = y_hat * model_output.scale + model_output.loc - loss_val = self.loss(y_hat, future_values) + loss = nn.MSELoss(reduction="mean") + loss_val = loss(y_hat, future_values) encoder_states = model_output.hidden_states loc = model_output.loc @@ -1986,10 +1984,8 @@ def __init__(self, config: PatchTSTConfig): self.model = PatchTSTModel(config) if config.loss == "mse": - self.loss = nn.MSELoss(reduction="mean") self.distribution_output = None else: - self.loss = nll if config.distribution_output == "student_t": self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) elif config.distribution_output == "normal": @@ -2049,11 +2045,12 @@ def forward( if labels is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = self.loss(distribution, labels) + loss_val = nll(distribution, labels) # take average of the loss loss_val = weighted_average(loss_val) else: - loss_val = self.loss(y_hat, labels) + loss = nn.MSELoss(reduction="mean") + loss_val = loss(y_hat, labels) encoder_states = model_output.hidden_states From 099b76cef747d355247e9208bab885eb2af8d310 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:57:07 +0200 Subject: [PATCH 121/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 36a5bc8e768ad8..7e7e5f4b0fb685 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -570,7 +570,7 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[boo hidden_state = mod(hidden_state) if output_hidden_states: all_hidden_states.append(hidden_state) - if output_hidden_states is None: + if output_hidden_states is False: return hidden_state, None return hidden_state, all_hidden_states From b9c935f60802240d15a4b48267a8dda6175d6cc2 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:58:01 +0200 Subject: [PATCH 122/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 7e7e5f4b0fb685..6679968c45ca95 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -550,7 +550,7 @@ def __init__(self, config: PatchTSTConfig): self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - def forward(self, hidden_state: torch.Tensor, output_hidden_states: Optional[bool] = None): + def forward(self, hidden_state: torch.Tensor, output_hidden_states: bool = False): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): From b8d59f8f281f3d5f897a891339ce412efa209458 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:58:12 +0200 Subject: [PATCH 123/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- 
src/transformers/models/patchtst/modeling_patchtst.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6679968c45ca95..a455358265b9da 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -205,7 +205,8 @@ class PatchTSTTranspose(nn.Module): def __init__(self, *dims, contiguous=False): super().__init__() - self.dims, self.contiguous = dims, contiguous + self.dims = dims + self.contiguous = dims def forward(self, inputs: torch.Tensor): """ From b7c04c746b0b577ac0eec701d0c44191feb046f0 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:58:34 +0200 Subject: [PATCH 124/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- src/transformers/models/patchtst/modeling_patchtst.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a455358265b9da..d435287b59341e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -488,6 +488,7 @@ def __init__( mask_value: int = 0, seed_number: Optional[int] = None, ): + super().__init__() self.mask_ratio = mask_ratio self.channel_consistent_masking = channel_consistent_masking self.mask_type = mask_type From c920eee26aa4195cb17ad29d50fa59856bf297d9 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 10:58:53 +0200 Subject: [PATCH 125/189] Update src/transformers/models/patchtst/modeling_patchtst.py Co-authored-by: Patrick von Platen --- src/transformers/models/patchtst/modeling_patchtst.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d435287b59341e..4cbf2d2177dff1 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -500,7 +500,6 @@ def __init__( self.unmasked_channel_indices.sort() self.seed_number = seed_number - super().__init__() def forward(self, patch_input: torch.Tensor): """ From 6642ab937a76a2bb28b960b954d6efe23867c629 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 19 Oct 2023 11:42:44 +0200 Subject: [PATCH 126/189] formatting --- src/transformers/models/patchtst/modeling_patchtst.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4cbf2d2177dff1..6be095e368a700 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -500,7 +500,6 @@ def __init__( self.unmasked_channel_indices.sort() self.seed_number = seed_number - def forward(self, patch_input: torch.Tensor): """ Parameters: From 78697674d890f5b62aa8768171a29a1bfd9793e9 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 19 Oct 2023 08:35:55 -0400 Subject: [PATCH 127/189] fix a bug when pre_norm is set to True --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6be095e368a700..d6414599ddd743 100755 --- 
a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -648,7 +648,7 @@ def forward(self, hidden_state: torch.Tensor): if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src)[0]) + self.self_attn(self.norm_sublayer1(src))[0] ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT @@ -669,7 +669,7 @@ def forward(self, hidden_state: torch.Tensor): if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src)[0]) + self.self_attn(self.norm_sublayer2(src))[0] ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm From 2fb741742c9b3741c56256c52aae77fbfcf0af4b Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 19 Oct 2023 14:53:43 -0400 Subject: [PATCH 128/189] output_hidden_states is set to False as default --- .../models/patchtst/modeling_patchtst.py | 50 +++++++------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d6414599ddd743..f528a80769f52b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -770,7 +770,7 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = False ) -> BaseModelOutputWithNoAttention: """ Parameters: @@ -783,9 +783,6 @@ def forward( """ _, num_input_channels, _, _ = past_values.shape - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) # Input encoding if not self.shared_embedding: x_out = [] @@ -1286,12 +1283,10 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if past_observed_mask is None: @@ -1370,7 +1365,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ @@ -1391,9 +1386,7 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # past_values: [bs x num_channels x num_patches x d_model] or @@ -1436,7 +1429,7 @@ def forward( past_values: torch.Tensor, labels: torch.Tensor = None, past_observed_mask: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: 
Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: """ @@ -1458,9 +1451,7 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model( @@ -1601,8 +1592,8 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: """ Parameters: @@ -1624,9 +1615,7 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output @@ -1684,7 +1673,7 @@ def generate( past_values=past_values, future_values=None, past_observed_mask=past_observed_mask, - output_hidden_states=None, + output_hidden_states=False, ) # get distribution @@ -1810,7 +1799,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForForecastingOutput]: """ @@ -1833,9 +1822,6 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict # get model output @@ -1909,7 +1895,7 @@ def generate( past_values=past_values, future_values=None, past_observed_mask=past_observed_mask, - output_hidden_states=None, + output_hidden_states=False, ) # get distribution @@ -2007,7 +1993,7 @@ def forward( past_values: torch.Tensor, labels: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForRegressionOutput]: """ @@ -2030,9 +2016,6 @@ def forward( `config.return_dict`=False) """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model( @@ -2086,7 +2069,10 @@ def generate( # get model output outputs = self( - past_values=past_values, labels=None, past_observed_mask=past_observed_mask, output_hidden_states=None + past_values=past_values, + labels=None, + past_observed_mask=past_observed_mask, + output_hidden_states=False ) # get distribution From 9168ca28a46fbe474db696ac8d1a2f6c6e163969 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 19 Oct 2023 14:54:12 -0400 Subject: [PATCH 129/189] set pre_norm=True as default --- src/transformers/models/patchtst/configuration_patchtst.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 
27f762459cd52b..e5c78104d3262d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -82,8 +82,9 @@ class PatchTSTConfig(PretrainedConfig): Consider bias in the feed-forward networks. activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. - pre_norm (`bool`, *optional*, defaults to `False`): - TODO + pre_norm (`bool`, *optional*, defaults to `True`): + Normalization is applied before self-attention if pre_norm is set to True. Otherwise, normalization is + applied after residual block. positional_encoding (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. learn_pe (`bool`, *optional*, defaults to `False`): @@ -173,7 +174,7 @@ def __init__( ff_dropout: float = 0.0, bias: bool = True, activation_function: str = "gelu", - pre_norm: bool = False, + pre_norm: bool = True, positional_encoding: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, From 7829a57c9310c2051b68ddef76fc64e967745817 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 19 Oct 2023 22:15:12 -0400 Subject: [PATCH 130/189] format docstring --- .../models/patchtst/modeling_patchtst.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f528a80769f52b..b296b5d20072bc 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -555,7 +555,7 @@ def forward(self, hidden_state: torch.Tensor, output_hidden_states: bool = False Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): Past values of the time series - output_hidden_states (`bool`, *optional*): + output_hidden_states (`bool`, *optional*, default to False): output hidden state option Return: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`) @@ -860,7 +860,7 @@ def forward( For multivariate time series, the `num_input_channels` > 1 dimension is required and corresponds to the number of variates in the time series per time step. - output_hidden_states (`bool`, *optional*): + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers. """ @@ -1378,7 +1378,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1443,7 +1443,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: @@ -1607,7 +1607,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1814,7 +1814,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -2008,7 +2008,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): target labels associates with the `past_values` - output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: From 5cc98cba747f89c253308f244933e583853a780a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 20 Oct 2023 08:24:04 +0200 Subject: [PATCH 131/189] format --- .../models/patchtst/modeling_patchtst.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index b296b5d20072bc..322fe6effca0b7 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1286,7 +1286,6 @@ def forward( output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if past_observed_mask is None: @@ -1378,7 +1377,8 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1443,7 +1443,8 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
- output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1607,7 +1608,8 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -1814,7 +1816,8 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. Returns: @@ -2008,7 +2011,8 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): target labels associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): Whether or not to return the hidden states of all layers + output_hidden_states (`bool`, *optional*, default to False): + Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: @@ -2069,10 +2073,7 @@ def generate( # get model output outputs = self( - past_values=past_values, - labels=None, - past_observed_mask=past_observed_mask, - output_hidden_states=False + past_values=past_values, labels=None, past_observed_mask=past_observed_mask, output_hidden_states=False ) # get distribution From 3a09a1e7cf267ad7c128658456e6856166b2cd23 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 20 Oct 2023 13:16:48 +0200 Subject: [PATCH 132/189] output_hidden_states is None by default --- .../models/patchtst/modeling_patchtst.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 322fe6effca0b7..6fb7d69b5b3aab 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -770,7 +770,7 @@ def __init__(self, config: PatchTSTConfig): self.post_init() def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = False + self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None ) -> BaseModelOutputWithNoAttention: """ Parameters: @@ -808,6 +808,9 @@ def forward( ) # x: [bs x num_channels x num_patches x d_model] # Encoder + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) past_values, hidden_states = self.encoder( past_values, output_hidden_states ) # x: [bs x num_channels x num_patches x d_model] @@ -1283,7 +1286,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1364,7 +1367,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ @@ -1429,7 +1432,7 @@ def forward( past_values: torch.Tensor, labels: torch.Tensor = None, past_observed_mask: Optional[bool] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: """ @@ -1593,7 +1596,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: """ @@ -1801,7 +1804,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForForecastingOutput]: """ @@ -1996,7 +1999,7 @@ def forward( past_values: torch.Tensor, labels: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = False, + output_hidden_states: Optional[bool] = None, 
return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForRegressionOutput]: """ From 1777fc3b429542ce1f727b40a7eb23ec4cbd6403 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 20 Oct 2023 16:06:47 +0200 Subject: [PATCH 133/189] add missing docs --- src/transformers/models/patchtst/configuration_patchtst.py | 5 +++-- src/transformers/models/patchtst/modeling_patchtst.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e5c78104d3262d..d7336b1b0b5742 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -107,9 +107,10 @@ class PatchTSTConfig(PretrainedConfig): mask_ratio (`float`, *optional*, defaults to 0.5): Masking ratio is applied to mask the input data during pretraining. mask_patches (`List`, *optional*, defaults to `[2, 3]`): - TODO + List of patch lengths to mask in the end of the data. mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): - TODO + List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + then equal weights to both patch lengths. Defaults to None. channel_consistent_masking (`bool`, *optional*, defaults to `False`): If channel consistent masking is True, all the channels will have the same masking. unmasked_channel_indices (`list`, *optional*): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 6fb7d69b5b3aab..9190f9b36d0dca 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -459,8 +459,9 @@ class PatchTSTMasking(nn.Module): mask_type (`str`, *optional*): Masking type. Allowed values are random, forecast. Defaults to random. mask_ratio (`float`, *optional*): Mask ratio. mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. - mask_patch_ratios (`list`, *optional*): List of weights to use for each patch length. For Ex. - if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. + mask_patch_ratios (`list`, *optional*): + List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. 
channel_consistent_masking (`bool`, *optional*): From 21803e0f5bc1a51c6bb47fe19cd50869ef9b419e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 20 Oct 2023 16:12:39 +0200 Subject: [PATCH 134/189] better var names --- .../models/patchtst/modeling_patchtst.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9190f9b36d0dca..8ce3a9b33cc9d9 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -360,11 +360,11 @@ def forecast_masking( total_length = 0 total_ratio = sum(mix_ratio) - for i, j in zip(patch_lengths, mix_ratio): - if i <= 0 or i >= sequence_length: + for patch_length, ratio in zip(patch_lengths, mix_ratio): + if patch_length <= 0 or patch_length >= sequence_length: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") - temp_len = int(batch_size * j / total_ratio) - t_list.append([i, j, temp_len]) + temp_len = int(batch_size * ratio / total_ratio) + t_list.append([patch_length, ratio, temp_len]) total_length += temp_len t_list = sorted(t_list, key=lambda x: x[2]) @@ -374,11 +374,11 @@ def forecast_masking( elif total_length > batch_size: t_list[-1][2] = t_list[-1][2] + (total_length - batch_size) - b1 = 0 - for p, _, l in t_list: - b2 = b1 + l - mask[b1:b2, :, -p:] = 1 - b1 = b2 + batch1 = 0 + for patch_len, _, temp_len in t_list: + batch2 = batch1 + temp_len + mask[batch1:batch2, :, -patch_len:] = 1 + batch1 = batch2 perm = torch.randperm(mask.shape[0]) mask = mask[perm] From 87068ca095df60343d46b15f0816dd73272b15db Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 20 Oct 2023 11:06:12 -0400 Subject: [PATCH 135/189] docstring: remove default to False in output_hidden_states --- src/transformers/models/patchtst/modeling_patchtst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 8ce3a9b33cc9d9..04cac532c0ed7f 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1381,7 +1381,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. @@ -1447,7 +1447,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. @@ -1612,7 +1612,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. @@ -1820,7 +1820,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. @@ -2015,7 +2015,7 @@ def forward( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): target labels associates with the `past_values` - output_hidden_states (`bool`, *optional*, default to False): + output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. From 4c8c7d0ccde947f7247e02396dca0f42896b29fa Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 22 Oct 2023 23:30:59 -0400 Subject: [PATCH 136/189] change labels name to target_values in regression task --- .../models/patchtst/modeling_patchtst.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 04cac532c0ed7f..5595d92e327b23 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1998,7 +1998,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - labels: Optional[torch.Tensor], + target_values: Optional[torch.Tensor], past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -2013,8 +2013,8 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - labels (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): - target labels associates with the `past_values` + target_values (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): + target values associates with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
@@ -2033,15 +2033,15 @@ def forward( y_hat = self.head(model_output.last_hidden_state) loss_val = None - if labels is not None: + if target_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = nll(distribution, labels) + loss_val = nll(distribution, target_values) # take average of the loss loss_val = weighted_average(loss_val) else: loss = nn.MSELoss(reduction="mean") - loss_val = loss(y_hat, labels) + loss_val = loss(y_hat, target_values) encoder_states = model_output.hidden_states @@ -2077,7 +2077,7 @@ def generate( # get model output outputs = self( - past_values=past_values, labels=None, past_observed_mask=past_observed_mask, output_hidden_states=False + past_values=past_values, target_values=None, past_observed_mask=past_observed_mask, output_hidden_states=False ) # get distribution From dccbc31583eed8f71ff6e1febd8f989f5ea42b3d Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 23 Oct 2023 11:45:34 +0200 Subject: [PATCH 137/189] format --- src/transformers/models/patchtst/modeling_patchtst.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5595d92e327b23..3730eec898f67a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -2077,7 +2077,10 @@ def generate( # get model output outputs = self( - past_values=past_values, target_values=None, past_observed_mask=past_observed_mask, output_hidden_states=False + past_values=past_values, + target_values=None, + past_observed_mask=past_observed_mask, + output_hidden_states=False, ) # get distribution From 3d12866ace7b7ca75a42431f6c884a55037952f6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 23 Oct 2023 11:56:09 +0200 Subject: [PATCH 138/189] fix tests --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- tests/models/patchtst/test_modeling_patchtst.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 3730eec898f67a..ced40e2d3c1f44 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1998,7 +1998,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - target_values: Optional[torch.Tensor], + target_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -2013,7 +2013,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
- target_values (`torch.Tensor` of shape `(bs, num_input_channels)`, *optional*): + target_values (`torch.Tensor` of shape `(bs, num_input_channels)`): target values associates with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index eaa1033cc0fc1a..496cfc8301aa86 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -199,8 +199,10 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict.pop("future_values") elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): rng = random.Random(self.model_tester.seed_number) - labels = floats_tensor([self.model_tester.batch_size, self.model_tester.num_output_channels], rng=rng) - inputs_dict["labels"] = labels + target_values = floats_tensor( + [self.model_tester.batch_size, self.model_tester.num_output_channels], rng=rng + ) + inputs_dict["target_values"] = target_values inputs_dict.pop("future_values") return inputs_dict @@ -281,7 +283,9 @@ def test_forward_signature(self): ): expected_arg_names.remove("future_values") expected_arg_names.remove("past_observed_mask") - expected_arg_names.append("labels") + expected_arg_names.append("labels") if model_class in get_values( + MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING + ) else expected_arg_names.append("target_values") expected_arg_names.append("past_observed_mask") expected_arg_names.extend( [ From c489972f864c98f76505ed49ade0bcda142a89ce Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 23 Oct 2023 12:32:28 -0400 Subject: [PATCH 139/189] change to forecast_mask_ratios and random_mask_ratio --- .../models/patchtst/configuration_patchtst.py | 8 ++++---- .../models/patchtst/modeling_patchtst.py | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index d7336b1b0b5742..bd4733f0da607d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -104,11 +104,11 @@ class PatchTSTConfig(PretrainedConfig): Apply masking during the pretraining. mask_type (`str`, *optional*, defaults to `"random"`): Masking type. Only `"random"` is currently supported. - mask_ratio (`float`, *optional*, defaults to 0.5): - Masking ratio is applied to mask the input data during pretraining. - mask_patches (`List`, *optional*, defaults to `[2, 3]`): + random_mask_ratio (`float`, *optional*, defaults to 0.5): + Masking ratio is applied to mask the input data during random pretraining. + forecast_mask_patches (`List`, *optional*, defaults to `[2, 3]`): List of patch lengths to mask in the end of the data. - mask_patch_ratios (`List`, *optional*, defaults to `[1, 1]`): + forecast_mask_ratios (`List`, *optional*, defaults to `[1, 1]`): List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. 
channel_consistent_masking (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5595d92e327b23..170ca4c264f2e2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -317,13 +317,13 @@ def random_masking( def forecast_masking( inputs: torch.Tensor, - patch_lengths: list, + forecast_mask_patches: list, mix_ratio: list = None, unmasked_channel_indices: list = None, mask_value: int = 0, seed_number: Optional[int] = None, ): - """Forecast masking that masks the last K patches where K is from the patch_lengths list. + """Forecast masking that masks the last K patches where K is from the forecast_mask_patches list. For every batch, distribute the patch lengths based on mix_ratio and ignore masks for column indices mentioned in unmasked_channel_indices. @@ -331,10 +331,10 @@ def forecast_masking( inputs (`torch.Tensor`): Input of shape `(bs, num_channels, num_patch, patch_len)` or `(bs, tsg1, tag2, num_channels, num_patch, patch_len)` - patch_lengths (`list`): + forecast_mask_patches (`list`): [2, 4] List of patch lengths to mask in the end of the data. - mix_ratio (`list`, *optional*): - List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + mix_ratio (`list`, *optional*): [0.7, 0.3] + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked. Defaults to None. @@ -351,7 +351,7 @@ def forecast_masking( set_seed(seed_number) if mix_ratio is None: - mix_ratio = [1 for _ in patch_lengths] + mix_ratio = [1 for _ in forecast_mask_patches] batch_size, num_channels, sequence_length, num_features = inputs.shape mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device) @@ -360,7 +360,7 @@ def forecast_masking( total_length = 0 total_ratio = sum(mix_ratio) - for patch_length, ratio in zip(patch_lengths, mix_ratio): + for patch_length, ratio in zip(forecast_mask_patches, mix_ratio): if patch_length <= 0 or patch_length >= sequence_length: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") temp_len = int(batch_size * ratio / total_ratio) @@ -460,7 +460,7 @@ class PatchTSTMasking(nn.Module): mask_ratio (`float`, *optional*): Mask ratio. mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. mask_patch_ratios (`list`, *optional*): - List of weights to use for each patch length. For Ex. if patch_lengths is [5,4] and mix_ratio is [1,1], + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and mix_ratio is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. 
@@ -527,7 +527,7 @@ def forward(self, patch_input: torch.Tensor): elif self.mask_type == "forecast": masked_input, mask = forecast_masking( inputs=patch_input, - patch_lengths=self.mask_patches, + forecast_mask_patches=self.forecast_mask_patches, mix_ratio=self.mask_patch_ratios, unmasked_channel_indices=self.unmasked_channel_indices, mask_value=self.mask_value, From 6318cd3d2741577149a1ebc56164800d86167a92 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 23 Oct 2023 16:08:51 -0400 Subject: [PATCH 140/189] change mask names --- .../models/patchtst/configuration_patchtst.py | 12 ++--- .../models/patchtst/modeling_patchtst.py | 46 +++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index bd4733f0da607d..b3fad61f911f5d 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -186,9 +186,9 @@ def __init__( # mask pretraining mask_input: Optional[bool] = None, mask_type: str = "random", - mask_ratio: float = 0.5, - mask_patches: List[int] = [2, 3], - mask_patch_ratios: List[int] = [1, 1], + random_mask_ratio: float = 0.5, + forecast_mask_patches: List[int] = [2, 3], + forecast_mask_ratios: List[int] = [1, 1], channel_consistent_masking: bool = False, unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, @@ -240,9 +240,9 @@ def __init__( self.seed_number = seed_number self.mask_input = mask_input self.mask_type = mask_type - self.mask_ratio = mask_ratio - self.mask_patches = mask_patches - self.mask_patch_ratios = mask_patch_ratios + self.random_mask_ratio = random_mask_ratio # for random masking + self.forecast_mask_patches = forecast_mask_patches # for forecast masking + self.forecast_mask_ratios = forecast_mask_ratios self.channel_consistent_masking = channel_consistent_masking self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 86f9cf9aeb0047..5bb1a5cb716815 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -318,13 +318,13 @@ def random_masking( def forecast_masking( inputs: torch.Tensor, forecast_mask_patches: list, - mix_ratio: list = None, + forecast_mask_ratios: list = None, unmasked_channel_indices: list = None, mask_value: int = 0, seed_number: Optional[int] = None, ): """Forecast masking that masks the last K patches where K is from the forecast_mask_patches list. - For every batch, distribute the patch lengths based on mix_ratio and ignore masks for column indices mentioned in + For every batch, distribute the patch lengths based on forecast_mask_ratios and ignore masks for column indices mentioned in unmasked_channel_indices. Parameters: @@ -333,8 +333,8 @@ def forecast_masking( patch_len)` forecast_mask_patches (`list`): [2, 4] List of patch lengths to mask in the end of the data. - mix_ratio (`list`, *optional*): [0.7, 0.3] - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and mix_ratio is [1,1], + forecast_mask_ratios (`list`, *optional*): [0.7, 0.3] + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. 
unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked. Defaults to None. @@ -350,17 +350,17 @@ def forecast_masking( if seed_number: set_seed(seed_number) - if mix_ratio is None: - mix_ratio = [1 for _ in forecast_mask_patches] + if forecast_mask_ratios is None: + forecast_mask_ratios = [1 for _ in forecast_mask_patches] batch_size, num_channels, sequence_length, num_features = inputs.shape mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device) t_list = [] total_length = 0 - total_ratio = sum(mix_ratio) + total_ratio = sum(forecast_mask_ratios) - for patch_length, ratio in zip(forecast_mask_patches, mix_ratio): + for patch_length, ratio in zip(forecast_mask_patches, forecast_mask_ratios): if patch_length <= 0 or patch_length >= sequence_length: raise Exception("masked_patch_len should be greater than 0 and less than total patches.") temp_len = int(batch_size * ratio / total_ratio) @@ -457,10 +457,10 @@ class PatchTSTMasking(nn.Module): Parameters: mask_type (`str`, *optional*): Masking type. Allowed values are random, forecast. Defaults to random. - mask_ratio (`float`, *optional*): Mask ratio. - mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. - mask_patch_ratios (`list`, *optional*): - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and mix_ratio is [1,1], + random_mask_ratio (`float`, *optional*): Mask ratio for random pretraining. + forecast_mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. + forecast_mask_ratios (`list`, *optional*): + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. 
@@ -481,20 +481,20 @@ class PatchTSTMasking(nn.Module): def __init__( self, mask_type: str = "random", - mask_ratio: float = 0.5, - mask_patches: list = [2, 3], - mask_patch_ratios: list = [1, 1], + random_mask_ratio: float = 0.5, + forecast_mask_patches: list = [2, 3], + forecast_mask_ratios: list = [1, 1], channel_consistent_masking: bool = False, unmasked_channel_indices: list = None, mask_value: int = 0, seed_number: Optional[int] = None, ): super().__init__() - self.mask_ratio = mask_ratio + self.random_mask_ratio = random_mask_ratio self.channel_consistent_masking = channel_consistent_masking self.mask_type = mask_type - self.mask_patches = mask_patches - self.mask_patch_ratios = mask_patch_ratios + self.forecast_mask_patches = forecast_mask_patches + self.forecast_mask_ratios = forecast_mask_ratios self.unmasked_channel_indices = unmasked_channel_indices self.mask_value = mask_value if self.unmasked_channel_indices is not None: @@ -518,7 +518,7 @@ def forward(self, patch_input: torch.Tensor): if self.mask_type == "random": masked_input, mask = random_masking( inputs=patch_input, - mask_ratio=self.mask_ratio, + mask_ratio=self.random_mask_ratio, unmasked_channel_indices=self.unmasked_channel_indices, channel_consistent_masking=self.channel_consistent_masking, mask_value=self.mask_value, @@ -528,7 +528,7 @@ def forward(self, patch_input: torch.Tensor): masked_input, mask = forecast_masking( inputs=patch_input, forecast_mask_patches=self.forecast_mask_patches, - mix_ratio=self.mask_patch_ratios, + forecast_mask_ratios=self.forecast_mask_ratios, unmasked_channel_indices=self.unmasked_channel_indices, mask_value=self.mask_value, seed_number=self.seed_number, @@ -1267,9 +1267,9 @@ def __init__(self, config: PatchTSTConfig): if self.mask_input: self.masking = PatchTSTMasking( mask_type=config.mask_type, - mask_ratio=config.mask_ratio, - mask_patches=config.mask_patches, - mask_patch_ratios=config.mask_patch_ratios, + random_mask_ratio=config.random_mask_ratio, + forecast_mask_patches=config.forecast_mask_patches, + forecast_mask_ratios=config.forecast_mask_ratios, channel_consistent_masking=config.channel_consistent_masking, unmasked_channel_indices=config.unmasked_channel_indices, mask_value=config.mask_value, From 6734b654548baf24b41d6b8e8687315c41837aa9 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 23 Oct 2023 17:29:20 -0400 Subject: [PATCH 141/189] change future_values to target_values param in the prediction class --- .../models/patchtst/modeling_patchtst.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 5bb1a5cb716815..e8324efadeeb1f 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1596,7 +1596,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - future_values: Optional[torch.Tensor] = None, + target_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: @@ -1610,7 +1610,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
- future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): + target_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers @@ -1633,15 +1633,15 @@ def forward( y_hat = self.head(model_output.last_hidden_state) loss_val = None - if future_values is not None: + if target_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = nll(distribution, future_values) + loss_val = nll(distribution, target_values) # take average of the loss loss_val = weighted_average(loss_val) else: loss = nn.MSELoss(reduction="mean") - loss_val = loss(y_hat, future_values) + loss_val = loss(y_hat, target_values) encoder_states = model_output.hidden_states if not return_dict: @@ -1677,7 +1677,7 @@ def generate( # get model output outputs = self( past_values=past_values, - future_values=None, + target_values=None, past_observed_mask=past_observed_mask, output_hidden_states=False, ) From 8a7f2a06255184a304782ae6fd3ae60263b294b7 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 23 Oct 2023 18:03:40 -0400 Subject: [PATCH 142/189] remove nn.Sequential and make PatchTSTBatchNorm class --- .../models/patchtst/modeling_patchtst.py | 51 ++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e8324efadeeb1f..f3ca8a0d57b60e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -196,11 +196,12 @@ def forward( class PatchTSTTranspose(nn.Module): - """ - Parameters: + """ Transpose the tensor to the dimension defined in **dims** - dims (`list`): list of dimensions to be transposed contiguous (`bool`): if True, the transposed tensor is - contiguous + + Parameters: + dims (`list`): list of dimensions to be transposed + contiguous (`bool`, default to False): if True, the transposed tensor is contiguous """ def __init__(self, *dims, contiguous=False): @@ -221,6 +222,32 @@ def forward(self, inputs: torch.Tensor): return inputs.transpose(*self.dims) +class PatchTSTBatchNorm(nn.Module): + """ + Compute batch normalization + Parameters: + d_model (`int`): model dimension + """ + def __init__(self, d_model): + super().__init__() + self.d_model = d_model + self.transpose = PatchTSTTranspose(1, 2) + self.batchnorm = nn.BatchNorm1d(self.d_model) + + def forward(self, inputs: torch.Tensor): + """ + Parameters: + inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): + input for Batch norm calculation + Returns: + `torch.Tensor`: tensor + """ + output = self.transpose(inputs) # output: (batch_size, d_model, sequence_length) + output = self.batchnorm(output) + output = self.transpose(output) # output: (batch_size, sequence_length, d_model) + return output + + def positional_encoding(position_embedding_type, learned, q_len, d_model): # Positional encoding if position_embedding_type is None: @@ -596,9 +623,7 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of the sublayer 1 self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer1 = nn.Sequential( - PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) 
- ) + self.norm_sublayer1 = PatchTSTBatchNorm(config.d_model) else: self.norm_sublayer1 = nn.LayerNorm(config.d_model) @@ -606,9 +631,7 @@ def __init__(self, config: PatchTSTConfig): if self.channel_attention: self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer2 = nn.Sequential( - PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) - ) + self.norm_sublayer2 = PatchTSTBatchNorm(config.d_model) else: self.norm_sublayer2 = nn.LayerNorm(config.d_model) @@ -623,9 +646,7 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of sublayer 3 self.dropout_path3 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer3 = nn.Sequential( - PatchTSTTranspose(1, 2), nn.BatchNorm1d(config.d_model), PatchTSTTranspose(1, 2) - ) + self.norm_sublayer3 = PatchTSTBatchNorm(config.d_model) else: self.norm_sublayer3 = nn.LayerNorm(config.d_model) @@ -1476,10 +1497,6 @@ def forward( class PatchTSTClassificationHead(nn.Module): - """ - Classification head - """ - def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token From f23ff20dedc8c51f56afc7e4ee43c0902e269574 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 24 Oct 2023 09:48:10 +0200 Subject: [PATCH 143/189] black --- .../models/patchtst/modeling_patchtst.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f3ca8a0d57b60e..0d48cda7270938 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -196,11 +196,11 @@ def forward( class PatchTSTTranspose(nn.Module): - """ + """ Transpose the tensor to the dimension defined in **dims** Parameters: - dims (`list`): list of dimensions to be transposed + dims (`list`): list of dimensions to be transposed contiguous (`bool`, default to False): if True, the transposed tensor is contiguous """ @@ -224,10 +224,11 @@ def forward(self, inputs: torch.Tensor): class PatchTSTBatchNorm(nn.Module): """ + Parameters: Compute batch normalization - Parameters: d_model (`int`): model dimension """ + def __init__(self, d_model): super().__init__() self.d_model = d_model @@ -237,16 +238,16 @@ def __init__(self, d_model): def forward(self, inputs: torch.Tensor): """ Parameters: - inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): + inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): input for Batch norm calculation Returns: `torch.Tensor`: tensor """ - output = self.transpose(inputs) # output: (batch_size, d_model, sequence_length) + output = self.transpose(inputs) # output: (batch_size, d_model, sequence_length) output = self.batchnorm(output) - output = self.transpose(output) # output: (batch_size, sequence_length, d_model) + output = self.transpose(output) # output: (batch_size, sequence_length, d_model) return output - + def positional_encoding(position_embedding_type, learned, q_len, d_model): # Positional encoding @@ -351,8 +352,8 @@ def forecast_masking( seed_number: Optional[int] = None, ): """Forecast masking that masks the last K patches where K is from the forecast_mask_patches list. 
- For every batch, distribute the patch lengths based on forecast_mask_ratios and ignore masks for column indices mentioned in - unmasked_channel_indices. + For every batch, distribute the patch lengths based on forecast_mask_ratios and ignore masks for column indices + mentioned in unmasked_channel_indices. Parameters: inputs (`torch.Tensor`): @@ -361,8 +362,8 @@ def forecast_masking( forecast_mask_patches (`list`): [2, 4] List of patch lengths to mask in the end of the data. forecast_mask_ratios (`list`, *optional*): [0.7, 0.3] - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], - then equal weights to both patch lengths. Defaults to None. + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and + forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked. Defaults to None. mask_value (`int`, *optional* defaults to 0): @@ -487,8 +488,8 @@ class PatchTSTMasking(nn.Module): random_mask_ratio (`float`, *optional*): Mask ratio for random pretraining. forecast_mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. forecast_mask_ratios (`list`, *optional*): - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], - then equal weights to both patch lengths. Defaults to None. + List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and + forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. unmasked_channel_indices (`list`, *optional*): Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. channel_consistent_masking (`bool`, *optional*): From d6eebdb3b6f846da44acd2b7d874b25388939b40 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 24 Oct 2023 13:22:56 +0200 Subject: [PATCH 144/189] fix argument name for prediction --- src/transformers/models/patchtst/modeling_patchtst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 0d48cda7270938..e1cbc33511b6f4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1614,7 +1614,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - target_values: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: @@ -1628,7 +1628,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
- target_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): + future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): future target values associates with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers @@ -1651,15 +1651,15 @@ def forward( y_hat = self.head(model_output.last_hidden_state) loss_val = None - if target_values is not None: + if future_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution(y_hat) - loss_val = nll(distribution, target_values) + loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) else: loss = nn.MSELoss(reduction="mean") - loss_val = loss(y_hat, target_values) + loss_val = loss(y_hat, future_values) encoder_states = model_output.hidden_states if not return_dict: From 61b9da5b54150b41e71e7d8d89d1f9c19ba69cf9 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 24 Oct 2023 17:14:06 -0400 Subject: [PATCH 145/189] add output_attentions option --- .../models/patchtst/modeling_patchtst.py | 136 ++++++++++++------ 1 file changed, 92 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e1cbc33511b6f4..2f734b266b1afb 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -22,7 +22,7 @@ from torch import nn from ...activations import ACT2CLS -from ...modeling_outputs import BaseModelOutputWithNoAttention +from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...trainer_utils import set_seed @@ -653,7 +653,10 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, hidden_state: torch.Tensor): + def forward(self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = None + ): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -662,23 +665,31 @@ def forward(self, hidden_state: torch.Tensor): `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ - batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape + batch_size, num_input_channels, sequence_length, d_model = hidden_states.shape # First sublayer: attention across time - src = hidden_state.view( + hidden_states = hidden_states.view( batch_size * num_input_channels, sequence_length, d_model - ) # src: [(bs*num_channels) x sequence_length x d_model] + ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] + if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path1( - self.self_attn(self.norm_sublayer1(src))[0] - ) # Add: residual connection with residual dropout + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=self.norm_sublayer1(hidden_states), + output_attentions=output_attentions + ) + hidden_states = hidden_states + self.dropout_path1(hidden_states) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - src = self.norm_sublayer1( - src + self.dropout_path1(self.self_attn(src)[0]) - ) # src: [(bs*num_channels) x 
sequence_length x d_model] - src = src.reshape( + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + output_attentions=output_attentions + ) + hidden_states = self.norm_sublayer1( + hidden_states + self.dropout_path1(hidden_states) + ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] + + hidden_states = hidden_states.reshape( batch_size, num_input_channels, sequence_length, d_model ) # [bs x num_channels x sequence_length x d_model] @@ -686,42 +697,55 @@ def forward(self, hidden_state: torch.Tensor): # [bs x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] # -> [(bs*sequence_length) x num_channels x d_model] if self.channel_attention: - src = ( - src.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) + hidden_states = ( + hidden_states.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) ) # [(bs*sequence_length) x num_channels x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - src = src + self.dropout_path2( - self.self_attn(self.norm_sublayer2(src))[0] - ) # Add: residual connection with residual dropout + hidden_states, channel_attn_weights, _ = self.self_attn( + hidden_states=self.norm_sublayer2(hidden_states), + output_attentions=output_attentions + ) + hidden_states = hidden_states + self.dropout_path2(hidden_states) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - src = self.norm_sublayer2( - src + self.dropout_path2(self.self_attn(src)[0]) - ) # src: [(bs*sequence_length) x num_channels x d_model] - src = ( - src.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() + hidden_states, channel_attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + output_attentions=output_attentions + ) + hidden_states = self.norm_sublayer2( + hidden_states + self.dropout_path2(hidden_states) + ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] + + hidden_states = ( + hidden_states.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden - src = src.view( + hidden_states = hidden_states.view( batch_size * num_input_channels, sequence_length, d_model ) # src: [(batch_size*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection - src = src + self.dropout_path3( - self.ff(self.norm_sublayer3(src)) + hidden_states = hidden_states + self.dropout_path3( + self.ff(self.norm_sublayer3(hidden_states)) ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - src = self.norm_sublayer3( - src + self.dropout_path3(self.ff(src)) + hidden_states = self.norm_sublayer3( + hidden_states + self.dropout_path3(self.ff(hidden_states)) ) # Add: residual connection with residual dropout - src = src.reshape( + + hidden_states = hidden_states.reshape( batch_size, num_input_channels, sequence_length, d_model ) # [bs x num_channels x sequence_length x d_model] - return src + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights, channel_attn_weights) + + return outputs class PatchTSTPreTrainedModel(PreTrainedModel): @@ -788,22 +812,30 @@ def __init__(self, config: PatchTSTConfig): # 
Encoder self.encoder = PatchTSTEncoderBlock(config) + self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) # Initialize weights and apply final processing self.post_init() def forward( - self, past_values: torch.Tensor, output_hidden_states: Optional[bool] = None - ) -> BaseModelOutputWithNoAttention: + self, + past_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + ) -> BaseModelOutput: """ Parameters: past_values (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Past values of the time series - output_hidden_states (bool, optional): Indicates if hidden states should be output. + output_hidden_states (bool, optional): Indicates if hidden states should be outputted. + output_attentions (bool, optional): Indicates if attentions should be outputted. return: - `BaseModelOutputWithNoAttention` + `BaseModelOutput` """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + _, num_input_channels, _, _ = past_values.shape # Input encoding @@ -822,25 +854,38 @@ def forward( # append cls token cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - past_values = torch.cat( + hidden_states = torch.cat( (cls_tokens, past_values), dim=1 ) # x: [bs x num_channels x (num_patches+1) x d_model] else: - past_values = self.positional_dropout( + hidden_states = self.positional_dropout( past_values + self.position_enc ) # x: [bs x num_channels x num_patches x d_model] - # Encoder - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - past_values, hidden_states = self.encoder( - past_values, output_hidden_states - ) # x: [bs x num_channels x num_patches x d_model] - # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + layer_outputs = encoder_layer( + hidden_states=hidden_states, + output_attentions=output_attentions, + ) + # get hidden state + hidden_states = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] + # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + # append layer attention + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return BaseModelOutputWithNoAttention(last_hidden_state=past_values, hidden_states=hidden_states) + return BaseModelOutput( + last_hidden_state=past_values, + hidden_states=encoder_states, + attentions=all_attentions + ) PATCHTST_START_DOCSTRING = r""" @@ -1310,9 +1355,12 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if 
output_attentions is not None else self.config.output_attentions if past_observed_mask is None: past_observed_mask = torch.ones_like(past_values) From 2b84706813de8cec5e36482bd9a0c70f1051aedc Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 24 Oct 2023 23:31:02 -0400 Subject: [PATCH 146/189] add output_attentions to PatchTSTEncoder --- src/transformers/models/patchtst/modeling_patchtst.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 2f734b266b1afb..627a8383d6536e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -811,7 +811,7 @@ def __init__(self, config: PatchTSTConfig): ) # Encoder - self.encoder = PatchTSTEncoderBlock(config) + # self.encoder = PatchTSTEncoderBlock(config) self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) # Initialize weights and apply final processing @@ -1374,7 +1374,10 @@ def forward( masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None - encoder_output = self.encoder(masked_values, output_hidden_states=output_hidden_states) + encoder_output = self.encoder(masked_values, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions + ) hidden_states = encoder_output.last_hidden_state encoder_states = encoder_output.hidden_states From 0be64406146ba9076c0523cf81e76a64c44a29b6 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 25 Oct 2023 08:29:37 +0200 Subject: [PATCH 147/189] formatting --- .../models/patchtst/modeling_patchtst.py | 57 +++++++++---------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 627a8383d6536e..a572b80dd7fa28 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -653,10 +653,7 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, - hidden_states: torch.Tensor, - output_attentions: Optional[bool] = None - ): + def forward(self, hidden_states: torch.Tensor, output_attentions: Optional[bool] = None): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -675,15 +672,15 @@ def forward(self, if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection hidden_states, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_states), - output_attentions=output_attentions - ) - hidden_states = hidden_states + self.dropout_path1(hidden_states) # Add: residual connection with residual dropout + hidden_states=self.norm_sublayer1(hidden_states), output_attentions=output_attentions + ) + hidden_states = hidden_states + self.dropout_path1( + hidden_states + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT hidden_states, attn_weights, _ = self.self_attn( - hidden_states=hidden_states, - output_attentions=output_attentions + hidden_states=hidden_states, output_attentions=output_attentions ) hidden_states = self.norm_sublayer1( hidden_states + self.dropout_path1(hidden_states) @@ -698,27 +695,31 @@ def forward(self, # -> [(bs*sequence_length) x 
num_channels x d_model] if self.channel_attention: hidden_states = ( - hidden_states.transpose(2, 1).contiguous().view(batch_size * sequence_length, num_input_channels, d_model) + hidden_states.transpose(2, 1) + .contiguous() + .view(batch_size * sequence_length, num_input_channels, d_model) ) # [(bs*sequence_length) x num_channels x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection hidden_states, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_states), - output_attentions=output_attentions + hidden_states=self.norm_sublayer2(hidden_states), output_attentions=output_attentions ) - hidden_states = hidden_states + self.dropout_path2(hidden_states) # Add: residual connection with residual dropout + hidden_states = hidden_states + self.dropout_path2( + hidden_states + ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm hidden_states, channel_attn_weights, _ = self.self_attn( - hidden_states=hidden_states, - output_attentions=output_attentions + hidden_states=hidden_states, output_attentions=output_attentions ) hidden_states = self.norm_sublayer2( hidden_states + self.dropout_path2(hidden_states) ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] hidden_states = ( - hidden_states.reshape(batch_size, sequence_length, num_input_channels, d_model).transpose(1, 2).contiguous() + hidden_states.reshape(batch_size, sequence_length, num_input_channels, d_model) + .transpose(1, 2) + .contiguous() ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden @@ -834,7 +835,9 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) _, num_input_channels, _, _ = past_values.shape @@ -874,18 +877,14 @@ def forward( output_attentions=output_attentions, ) # get hidden state - hidden_states = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] - # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token + hidden_states = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] + # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token # append layer attention if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return BaseModelOutput( - last_hidden_state=past_values, - hidden_states=encoder_states, - attentions=all_attentions - ) + return BaseModelOutput(last_hidden_state=past_values, hidden_states=encoder_states, attentions=all_attentions) PATCHTST_START_DOCSTRING = r""" @@ -1358,7 +1357,6 @@ def forward( output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1374,10 +1372,9 @@ def forward( masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None - encoder_output = self.encoder(masked_values, - output_hidden_states=output_hidden_states, 
- output_attentions=output_attentions - ) + encoder_output = self.encoder( + masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions + ) hidden_states = encoder_output.last_hidden_state encoder_states = encoder_output.hidden_states From 8972a920eb504b00e84d56e3657b0a2ff9f07f71 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 11:39:31 -0400 Subject: [PATCH 148/189] Add attention output option to all classes --- .../models/patchtst/modeling_patchtst.py | 106 ++++++++++++------ 1 file changed, 74 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a572b80dd7fa28..c61b47ae08e538 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -812,7 +812,6 @@ def __init__(self, config: PatchTSTConfig): ) # Encoder - # self.encoder = PatchTSTEncoderBlock(config) self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) # Initialize weights and apply final processing @@ -884,7 +883,9 @@ def forward( all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return BaseModelOutput(last_hidden_state=past_values, hidden_states=encoder_states, attentions=all_attentions) + return BaseModelOutput(last_hidden_state=past_values, + hidden_states=encoder_states, + attentions=all_attentions) PATCHTST_START_DOCSTRING = r""" @@ -936,7 +937,7 @@ def forward( @dataclass -class PatchTSTModelOutputWithNoAttention(ModelOutput): +class PatchTSTModelOutput(ModelOutput): """ Base class for model's outputs, with potential hidden states. @@ -959,6 +960,7 @@ class PatchTSTModelOutputWithNoAttention(ModelOutput): last_hidden_state: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None patched_input: torch.FloatTensor = None mask: torch.FloatTensor = None loc: torch.FloatTensor = None @@ -1356,7 +1358,7 @@ def forward( output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, PatchTSTModelOutputWithNoAttention]: + ) -> Union[Tuple, PatchTSTModelOutput]: return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1372,18 +1374,21 @@ def forward( masked_values, mask = self.masking(patched_values) else: masked_values, mask = self.masking(patched_values), None + encoder_output = self.encoder( - masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions + past_values=masked_values, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) - hidden_states = encoder_output.last_hidden_state - encoder_states = encoder_output.hidden_states - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, patched_values, mask, loc, scale] if v is not None) - return PatchTSTModelOutputWithNoAttention( - last_hidden_state=hidden_states, - hidden_states=encoder_states, + outputs = (encoder_output.last_hidden_state, encoder_output.hidden_states, encoder_output.attentions) + outputs = outputs + (patched_values, mask, loc, scale) + return tuple(v for v in outputs if v is not None) + return PatchTSTModelOutput( + last_hidden_state=encoder_output.last_hidden_state, + 
hidden_states=encoder_output.hidden_states, + attentions=encoder_output.attentions, patched_input=patched_values, mask=mask, loc=loc, @@ -1439,6 +1444,7 @@ def forward( past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForPretrainingOutput]: """ @@ -1466,7 +1472,10 @@ def forward( # past_values: [bs x num_channels x num_patches x d_model] or # [bs x num_channels x (num_patches+1) x d_model] if use cls_token model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) # model_output[0]: [bs x num_channels x num_patches x patch_length] or @@ -1480,8 +1489,13 @@ def forward( encoder_states = model_output.hidden_states if not return_dict: - return tuple(v for v in [masked_loss, x_hat, encoder_states] if v is not None) - return PatchTSTForPretrainingOutput(loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states) + outputs = (masked_loss, x_hat, model_output.hidden_states, model_output.attentions) + return tuple(v for v in outputs if v is not None) + return PatchTSTForPretrainingOutput(loss=masked_loss, + prediction_output=x_hat, + hidden_states=encoder_states, + attentions=model_output.attentions + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1504,6 +1518,7 @@ def forward( labels: torch.Tensor = None, past_observed_mask: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForClassificationOutput]: """ @@ -1530,7 +1545,10 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) y_hat = self.head(model_output[0]) @@ -1539,10 +1557,14 @@ def forward( loss = nn.CrossEntropyLoss() loss_val = loss(y_hat, labels) - encoder_states = model_output.hidden_states if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForClassificationOutput(loss=loss_val, prediction_logits=y_hat, hidden_states=encoder_states) + outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) + return tuple(v for v in outputs if v is not None) + return PatchTSTForClassificationOutput(loss=loss_val, + prediction_logits=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions + ) class PatchTSTClassificationHead(nn.Module): @@ -1664,6 +1686,7 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = True, ) -> Union[Tuple, PatchTSTForPredictionOutput]: """ @@ -1692,7 +1715,10 @@ def forward( # get model output model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + 
past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape @@ -1709,10 +1735,14 @@ def forward( loss = nn.MSELoss(reduction="mean") loss_val = loss(y_hat, future_values) - encoder_states = model_output.hidden_states if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForPredictionOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) + outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) + return tuple(v for v in outputs if v is not None) + return PatchTSTForPredictionOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions + ) def generate( self, @@ -1872,6 +1902,7 @@ def forward( past_observed_mask: Optional[torch.Tensor] = None, future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTForForecastingOutput]: """ @@ -1899,7 +1930,10 @@ def forward( # get model output model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) # get output head y_hat = self.head(model_output.last_hidden_state) @@ -1922,16 +1956,17 @@ def forward( loss = nn.MSELoss(reduction="mean") loss_val = loss(y_hat, future_values) - encoder_states = model_output.hidden_states loc = model_output.loc scale = model_output.scale if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states, loc, scale] if v is not None) + outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions, loc, scale) + return tuple(v for v in outputs if v is not None) return PatchTSTForForecastingOutput( loss=loss_val, forecast_outputs=y_hat, - hidden_states=encoder_states, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, loc=loc, scale=scale, ) @@ -2067,6 +2102,7 @@ def forward( target_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, PatchTSTForRegressionOutput]: """ @@ -2093,7 +2129,10 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict model_output = self.model( - past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states + past_values=past_values, + past_observed_mask=past_observed_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) # get output head. 
y_hat is of shape [bs x num_output_channels] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) @@ -2109,11 +2148,14 @@ def forward( loss = nn.MSELoss(reduction="mean") loss_val = loss(y_hat, target_values) - encoder_states = model_output.hidden_states - if not return_dict: - return tuple(v for v in [loss_val, y_hat, encoder_states] if v is not None) - return PatchTSTForRegressionOutput(loss=loss_val, prediction_output=y_hat, hidden_states=encoder_states) + outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) + return tuple(v for v in outputs if v is not None) + return PatchTSTForRegressionOutput(loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions + ) def generate( self, From a2ff8ef8b63c0ee3e3d391bbded4723ceaf02657 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 12:36:21 -0400 Subject: [PATCH 149/189] Remove PatchTSTEncoderBlock --- .../models/patchtst/modeling_patchtst.py | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index c61b47ae08e538..1130d62fa34d7b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -569,41 +569,6 @@ def forward(self, patch_input: torch.Tensor): return masked_input, mask -class PatchTSTEncoderBlock(nn.Module): - """ - PatchTST encoder block - """ - - def __init__(self, config: PatchTSTConfig): - super().__init__() - - self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) - - def forward(self, hidden_state: torch.Tensor, output_hidden_states: bool = False): - """ - Parameters: - hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): - Past values of the time series - output_hidden_states (`bool`, *optional*, default to False): - output hidden state option - Return: - hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`) - - all_hidden_states (*optional*, returned when `output_hidden_states` is set to True, tuple of `torch.Tensor` - of shapes `(batch_size, num_channels, sequence_length, d_model)`) - - """ - all_hidden_states = [] - - for mod in self.layers: - hidden_state = mod(hidden_state) - if output_hidden_states: - all_hidden_states.append(hidden_state) - if output_hidden_states is False: - return hidden_state, None - return hidden_state, all_hidden_states - - class PatchTSTEncoderLayer(nn.Module): """ PatchTST encoder layer From d11ea0eab442d33ad6b4f9cce7f69ed3b6425e46 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 16:50:38 -0400 Subject: [PATCH 150/189] create PatchTSTEmbedding class --- .../models/patchtst/modeling_patchtst.py | 158 ++++++++++-------- 1 file changed, 88 insertions(+), 70 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 1130d62fa34d7b..d2ea7d153069df 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -573,7 +573,6 @@ class PatchTSTEncoderLayer(nn.Module): """ PatchTST encoder layer """ - def __init__(self, config: PatchTSTConfig): super().__init__() @@ -618,7 +617,9 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, hidden_states: 
torch.Tensor, output_attentions: Optional[bool] = None): + def forward(self, + hidden_state: torch.Tensor, + output_attentions: Optional[bool] = None): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -627,31 +628,33 @@ def forward(self, hidden_states: torch.Tensor, output_attentions: Optional[bool] `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)` """ - batch_size, num_input_channels, sequence_length, d_model = hidden_states.shape + batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape # First sublayer: attention across time - hidden_states = hidden_states.view( + hidden_state = hidden_state.view( batch_size * num_input_channels, sequence_length, d_model ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - hidden_states, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_states), output_attentions=output_attentions + attn_output, attn_weights, _ = self.self_attn( + hidden_states=self.norm_sublayer1(hidden_state), + output_attentions=output_attentions ) - hidden_states = hidden_states + self.dropout_path1( - hidden_states + hidden_state = hidden_state + self.dropout_path1( + attn_output ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT - hidden_states, attn_weights, _ = self.self_attn( - hidden_states=hidden_states, output_attentions=output_attentions + attn_output, attn_weights, _ = self.self_attn( + hidden_states=hidden_state, + output_attentions=output_attentions ) - hidden_states = self.norm_sublayer1( - hidden_states + self.dropout_path1(hidden_states) + hidden_state = self.norm_sublayer1( + hidden_state + self.dropout_path1(attn_output) ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] - hidden_states = hidden_states.reshape( + hidden_state = hidden_state.reshape( batch_size, num_input_channels, sequence_length, d_model ) # [bs x num_channels x sequence_length x d_model] @@ -659,54 +662,55 @@ def forward(self, hidden_states: torch.Tensor, output_attentions: Optional[bool] # [bs x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] # -> [(bs*sequence_length) x num_channels x d_model] if self.channel_attention: - hidden_states = ( - hidden_states.transpose(2, 1) + hidden_state = ( + hidden_state.transpose(2, 1) .contiguous() .view(batch_size * sequence_length, num_input_channels, d_model) ) # [(bs*sequence_length) x num_channels x d_model] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection - hidden_states, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_states), output_attentions=output_attentions + attn_output, channel_attn_weights, _ = self.self_attn( + hidden_states=self.norm_sublayer2(hidden_state), + output_attentions=output_attentions ) - hidden_states = hidden_states + self.dropout_path2( - hidden_states + hidden_state = hidden_state + self.dropout_path2( + attn_output ) # Add: residual connection with residual dropout else: ## Multi-Head attention and Add residual connection and Norm - hidden_states, channel_attn_weights, _ = self.self_attn( - hidden_states=hidden_states, output_attentions=output_attentions + attn_output, channel_attn_weights, _ = self.self_attn( + hidden_states=hidden_state, 
output_attentions=output_attentions ) - hidden_states = self.norm_sublayer2( - hidden_states + self.dropout_path2(hidden_states) + hidden_state = self.norm_sublayer2( + hidden_state + self.dropout_path2(attn_output) ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] - hidden_states = ( - hidden_states.reshape(batch_size, sequence_length, num_input_channels, d_model) + hidden_state = ( + hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model) .transpose(1, 2) .contiguous() ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden - hidden_states = hidden_states.view( + hidden_state = hidden_state.view( batch_size * num_input_channels, sequence_length, d_model ) # src: [(batch_size*num_channels) x sequence_length x d_model] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection - hidden_states = hidden_states + self.dropout_path3( - self.ff(self.norm_sublayer3(hidden_states)) + hidden_state = hidden_state + self.dropout_path3( + self.ff(self.norm_sublayer3(hidden_state)) ) # Add: residual connection with residual dropout else: ## Position-wise Feed-Forward and Add residual connection and Norm - hidden_states = self.norm_sublayer3( - hidden_states + self.dropout_path3(self.ff(hidden_states)) + hidden_state = self.norm_sublayer3( + hidden_state + self.dropout_path3(self.ff(hidden_state)) ) # Add: residual connection with residual dropout - hidden_states = hidden_states.reshape( + hidden_state = hidden_state.reshape( batch_size, num_input_channels, sequence_length, d_model ) # [bs x num_channels x sequence_length x d_model] - outputs = (hidden_states,) + outputs = (hidden_state,) if output_attentions: outputs += (attn_weights, channel_attn_weights) @@ -737,6 +741,35 @@ def _set_gradient_checkpointing(self, module, value=False): module.gradient_checkpointing = value +class PatchTSTEmbedding(nn.Module): + def __init__(self, config: PatchTSTConfig): + super().__init__() + # Input encoding: projection of feature vectors onto a d-dim vector space + if not config.shared_embedding: + self.input_embedding = nn.ModuleList() + for _ in range(config.num_input_channels): + self.input_embedding.append(nn.Linear(config.patch_length, config.d_model)) + else: + self.input_embedding = nn.Linear(config.patch_length, config.d_model) + + def forward(self, patch_input: torch.Tensor): + """ + Parameters: + patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): + Patch input for embedding + return: + `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)` + """ + # Input encoding + num_input_channels = patch_input.shape[1] + if isinstance(self.input_embedding, nn.ModuleList): + embeddings = [ self.input_embedding[i](patch_input[:, i, :, :]) for i in range(num_input_channels)] + embeddings = torch.stack(embeddings, dim=1) + else: + embeddings = self.input_embedding(patch_input) # x: [bs x num_channels x num_patches x d_model] + return embeddings + + class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder @@ -752,13 +785,8 @@ def __init__(self, config: PatchTSTConfig): self.use_cls_token = config.use_cls_token self.gradient_checkpointing = False - # Input encoding: projection of feature vectors onto a d-dim vector space - if not config.shared_embedding: - self.input_embedding = nn.ModuleList() - for _ in range(self.num_input_channels): - self.input_embedding.append(nn.Linear(config.patch_length, config.d_model)) - else: - 
self.input_embedding = nn.Linear(config.patch_length, config.d_model) + # Input embedding: projection of feature vectors onto a d-dim vector space + self.embedder = PatchTSTEmbedding(config) # Positional encoding if config.use_cls_token: @@ -784,13 +812,13 @@ def __init__(self, config: PatchTSTConfig): def forward( self, - past_values: torch.Tensor, + patch_input: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, ) -> BaseModelOutput: """ Parameters: - past_values (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): + patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*): Past values of the time series output_hidden_states (bool, optional): Indicates if hidden states should be outputted. output_attentions (bool, optional): Indicates if attentions should be outputted. @@ -799,34 +827,23 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - _, num_input_channels, _, _ = past_values.shape - - # Input encoding - if not self.shared_embedding: - x_out = [] - for i in range(num_input_channels): - z = self.input_embedding[i](past_values[:, i, :, :]) - x_out.append(z) - past_values = torch.stack(x_out, dim=1) - else: - past_values = self.input_embedding(past_values) # x: [bs x num_channels x num_patches x d_model] + # Input embedding + patch_input = self.embedder(patch_input) if self.use_cls_token: # x: [bs x num_channels x num_patches x d_model] - past_values = self.positional_dropout(past_values + self.position_enc[1:, :]) + patch_input = self.positional_dropout(patch_input + self.position_enc[1:, :]) # append cls token cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] - cls_tokens = cls_token.expand(past_values.shape[0], -1, -1) # get the same copy for all the batch samples - hidden_states = torch.cat( - (cls_tokens, past_values), dim=1 + cls_tokens = cls_token.expand(patch_input.shape[0], -1, -1) # get the same copy for all the batch samples + hidden_state = torch.cat( + (cls_tokens, patch_input), dim=1 ) # x: [bs x num_channels x (num_patches+1) x d_model] else: - hidden_states = self.positional_dropout( - past_values + self.position_enc + hidden_state = self.positional_dropout( + patch_input + self.position_enc ) # x: [bs x num_channels x num_patches x d_model] encoder_states = () if output_hidden_states else None @@ -834,21 +851,21 @@ def forward( for encoder_layer in self.layers: if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + encoder_states = encoder_states + (hidden_state,) layer_outputs = encoder_layer( - hidden_states=hidden_states, + hidden_state=hidden_state, output_attentions=output_attentions, ) # get hidden state - hidden_states = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] + hidden_state = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token # append layer attention if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return 
BaseModelOutput(last_hidden_state=past_values, + return BaseModelOutput(last_hidden_state=hidden_state, hidden_states=encoder_states, attentions=all_attentions) @@ -913,7 +930,7 @@ class PatchTSTModelOutput(ModelOutput): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - patched_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): + patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`): patched input to the Transformer mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*) Bool masked tensor indicating which patches are masked @@ -926,7 +943,7 @@ class PatchTSTModelOutput(ModelOutput): last_hidden_state: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None - patched_input: torch.FloatTensor = None + patch_input: torch.FloatTensor = None mask: torch.FloatTensor = None loc: torch.FloatTensor = None scale: torch.FloatTensor = None @@ -1341,7 +1358,7 @@ def forward( masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder( - past_values=masked_values, + patch_input=masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions ) @@ -1350,11 +1367,12 @@ def forward( outputs = (encoder_output.last_hidden_state, encoder_output.hidden_states, encoder_output.attentions) outputs = outputs + (patched_values, mask, loc, scale) return tuple(v for v in outputs if v is not None) + return PatchTSTModelOutput( last_hidden_state=encoder_output.last_hidden_state, hidden_states=encoder_output.hidden_states, attentions=encoder_output.attentions, - patched_input=patched_values, + patch_input=patched_values, mask=mask, loc=loc, scale=scale, @@ -1449,7 +1467,7 @@ def forward( # calculate masked_loss loss = nn.MSELoss(reduction="none") - loss_val = loss(x_hat, model_output.patched_input) + loss_val = loss(x_hat, model_output.patch_input) masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10) encoder_states = model_output.hidden_states From 93b88cfb36d8fed74e84d79212c802eedeb0b273 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 22:21:37 -0400 Subject: [PATCH 151/189] use config in PatchTSTPatchify --- .../models/patchtst/modeling_patchtst.py | 40 ++++++------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d2ea7d153069df..f2b731778683a2 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -423,36 +423,26 @@ class PatchTSTPatchify(nn.Module): """ A class to patchify the time series sequence into different patches - Parameters: - sequence_length (`int`, *required*): input sequence length. - patch_length (`int`, *required*): patch length. - stride (`int`, *required*): stride between patches. 
- Returns: `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)` """ - def __init__( - self, - sequence_length: int, - patch_length: int, - stride: int, - ): + def __init__(self, config: PatchTSTConfig): super().__init__() - if sequence_length <= patch_length: + self.sequence_length = config.context_length + self.patch_length = config.patch_length + self.stride = config.stride + + if self.sequence_length <= self.patch_length: raise ValueError( - f"Sequence length ({sequence_length}) has to be greater than the patch length ({patch_length})" + f"Sequence length ({self.sequence_length}) has to be greater than the patch length ({self.patch_length})" ) - self.sequence_length = sequence_length - self.patch_length = patch_length - self.stride = stride - # get the number of patches - num_patches = (max(sequence_length, patch_length) - patch_length) // stride + 1 - new_sequence_length = patch_length + stride * (num_patches - 1) - self.s_begin = sequence_length - new_sequence_length + num_patches = (max(self.sequence_length, self.patch_length) - self.patch_length) // self.stride + 1 + new_sequence_length = self.patch_length + self.stride * (num_patches - 1) + self.sequence_start = self.sequence_length - new_sequence_length def forward(self, past_values: torch.Tensor): """ @@ -469,7 +459,7 @@ def forward(self, past_values: torch.Tensor): f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." ) - output = past_values[:, self.s_begin :, :] # output: [bs x new_sequence_length x num_channels] + output = past_values[:, self.sequence_start :, :] # output: [bs x new_sequence_length x num_channels] output = output.unfold( dimension=-2, size=self.patch_length, step=self.stride ) # output: [bs x num_patches x num_input_channels x patch_length] @@ -1307,11 +1297,7 @@ def __init__(self, config: PatchTSTConfig): else: self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) - self.patching = PatchTSTPatchify( - config.context_length, - patch_length=config.patch_length, - stride=config.stride, - ) + self.patchifier = PatchTSTPatchify(config) self.mask_input = config.mask_input if self.mask_input: @@ -1351,7 +1337,7 @@ def forward( scaled_past_values, loc, scale = self.scaler(past_values, past_observed_mask) # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain - patched_values = self.patching(scaled_past_values) + patched_values = self.patchifier(scaled_past_values) if self.mask_input: masked_values, mask = self.masking(patched_values) else: From 8175505fa97d98d4da5dd880718b18db37ffa021 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 22:25:14 -0400 Subject: [PATCH 152/189] Use config in PatchTSTMasking class --- .../models/patchtst/modeling_patchtst.py | 50 ++++--------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f2b731778683a2..f9fd23a93c47cf 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -474,19 +474,7 @@ class PatchTSTMasking(nn.Module): Class to perform random or forecast masking. Parameters: - mask_type (`str`, *optional*): Masking type. Allowed values are random, forecast. Defaults to random. - random_mask_ratio (`float`, *optional*): Mask ratio for random pretraining. - forecast_mask_patches (`list`, *optional*): List of patch lengths to mask in the end of the data. 
- forecast_mask_ratios (`list`, *optional*): - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and - forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. - unmasked_channel_indices (`list`, *optional*): - Define what channels not to mask. These channels will not be masked during pretrainin. Defaults to None. - channel_consistent_masking (`bool`, *optional*): - When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to True. - mask_value (`int`, *optional*): Value to use for masking. Defaults to 0. - seed_number (`int`, *optional*): Random seed, when None seed is not set. Defaults to None. + config (`PatchTSTConfig`): model config Returns: x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`) @@ -498,26 +486,19 @@ class PatchTSTMasking(nn.Module): def __init__( self, - mask_type: str = "random", - random_mask_ratio: float = 0.5, - forecast_mask_patches: list = [2, 3], - forecast_mask_ratios: list = [1, 1], - channel_consistent_masking: bool = False, - unmasked_channel_indices: list = None, - mask_value: int = 0, - seed_number: Optional[int] = None, + config: PatchTSTConfig ): super().__init__() - self.random_mask_ratio = random_mask_ratio - self.channel_consistent_masking = channel_consistent_masking - self.mask_type = mask_type - self.forecast_mask_patches = forecast_mask_patches - self.forecast_mask_ratios = forecast_mask_ratios - self.unmasked_channel_indices = unmasked_channel_indices - self.mask_value = mask_value + self.random_mask_ratio = config.random_mask_ratio + self.channel_consistent_masking = config.channel_consistent_masking + self.mask_type = config.mask_type + self.forecast_mask_patches = config.forecast_mask_patches + self.forecast_mask_ratios = config.forecast_mask_ratios + self.unmasked_channel_indices = config.unmasked_channel_indices + self.mask_value = config.mask_value if self.unmasked_channel_indices is not None: self.unmasked_channel_indices.sort() - self.seed_number = seed_number + self.seed_number = config.seed_number def forward(self, patch_input: torch.Tensor): """ @@ -1301,16 +1282,7 @@ def __init__(self, config: PatchTSTConfig): self.mask_input = config.mask_input if self.mask_input: - self.masking = PatchTSTMasking( - mask_type=config.mask_type, - random_mask_ratio=config.random_mask_ratio, - forecast_mask_patches=config.forecast_mask_patches, - forecast_mask_ratios=config.forecast_mask_ratios, - channel_consistent_masking=config.channel_consistent_masking, - unmasked_channel_indices=config.unmasked_channel_indices, - mask_value=config.mask_value, - seed_number=config.seed_number, - ) + self.masking = PatchTSTMasking(config) else: self.masking = nn.Identity() self.encoder = PatchTSTEncoder(config) From 63684aa5999912355a10c7d6e9effa606eca63cc Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 22:37:57 -0400 Subject: [PATCH 153/189] add channel_attn_weights --- src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index f9fd23a93c47cf..99c92dc4610c88 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -684,7 +684,7 @@ def forward(self, outputs = (hidden_state,) if output_attentions: - outputs += (attn_weights, 
channel_attn_weights) + outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights, ) return outputs From e8faa8b3cc41c668de71e4ed0796b6befd50b35d Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 25 Oct 2023 22:52:36 -0400 Subject: [PATCH 154/189] Add PatchTSTScaler class --- .../models/patchtst/modeling_patchtst.py | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 99c92dc4610c88..4d2bfe1e4367b3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1170,7 +1170,9 @@ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5) self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, weights: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator @@ -1198,7 +1200,8 @@ class PatchTSTMeanScaler(nn.Module): """ def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 + self, dim: int = -1, keepdim: bool = True, + default_scale: Optional[float] = None, minimum_scale: float = 1e-10 ): super().__init__() self.dim = dim @@ -1207,9 +1210,9 @@ def __init__( self.default_scale = default_scale @torch.no_grad() - def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, + data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # shape: (N, [C], T=1) ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -1263,14 +1266,9 @@ def forward( return data, loc, scale -@add_start_docstrings( - "The bare PatchTST Model outputting raw hidden-states without any specific head.", - PATCHTST_START_DOCSTRING, -) -class PatchTSTModel(PatchTSTPreTrainedModel): +class PatchTSTScaler(nn.Module): def __init__(self, config: PatchTSTConfig): - super().__init__(config) - + super().__init__() if config.scaling == "mean" or config.scaling is True: self.scaler = PatchTSTMeanScaler(dim=1, keepdim=True) elif config.scaling == "std": @@ -1278,6 +1276,22 @@ def __init__(self, config: PatchTSTConfig): else: self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) + def forward(self, + data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + data, loc, scale = self.scaler(data, observed_indicator) + return data, loc, scale + + +@add_start_docstrings( + "The bare PatchTST Model outputting raw hidden-states without any specific head.", + PATCHTST_START_DOCSTRING, +) +class PatchTSTModel(PatchTSTPreTrainedModel): + def __init__(self, config: PatchTSTConfig): + super().__init__(config) + + self.scaler = PatchTSTScaler(config) self.patchifier = PatchTSTPatchify(config) self.mask_input = config.mask_input From 6389fbfae198f9c418095acf23a7fb3998e8086f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 26 Oct 2023 00:06:45 -0400 Subject: [PATCH 155/189] add output_attentions arg to test 
function --- tests/models/patchtst/test_modeling_patchtst.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 496cfc8301aa86..68ba5030b35430 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -290,6 +290,7 @@ def test_forward_signature(self): expected_arg_names.extend( [ "output_hidden_states", + "output_attentions", "return_dict", ] ) From dd5e25d9a020f94f3366766dc24739c08c288e71 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 26 Oct 2023 09:29:27 +0200 Subject: [PATCH 156/189] format --- .../models/patchtst/modeling_patchtst.py | 105 ++++++++---------- 1 file changed, 47 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4d2bfe1e4367b3..03dcf8e309bd75 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -484,10 +484,7 @@ class PatchTSTMasking(nn.Module): """ - def __init__( - self, - config: PatchTSTConfig - ): + def __init__(self, config: PatchTSTConfig): super().__init__() self.random_mask_ratio = config.random_mask_ratio self.channel_consistent_masking = config.channel_consistent_masking @@ -544,6 +541,7 @@ class PatchTSTEncoderLayer(nn.Module): """ PatchTST encoder layer """ + def __init__(self, config: PatchTSTConfig): super().__init__() @@ -588,9 +586,7 @@ def __init__(self, config: PatchTSTConfig): self.pre_norm = config.pre_norm - def forward(self, - hidden_state: torch.Tensor, - output_attentions: Optional[bool] = None): + def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] = None): """ Parameters: hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*): @@ -609,8 +605,7 @@ def forward(self, if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_state), - output_attentions=output_attentions + hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions ) hidden_state = hidden_state + self.dropout_path1( attn_output @@ -618,8 +613,7 @@ def forward(self, else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT attn_output, attn_weights, _ = self.self_attn( - hidden_states=hidden_state, - output_attentions=output_attentions + hidden_states=hidden_state, output_attentions=output_attentions ) hidden_state = self.norm_sublayer1( hidden_state + self.dropout_path1(attn_output) @@ -641,8 +635,7 @@ def forward(self, if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_state), - output_attentions=output_attentions + hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions ) hidden_state = hidden_state + self.dropout_path2( attn_output @@ -684,7 +677,7 @@ def forward(self, outputs = (hidden_state,) if output_attentions: - outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights, ) + outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights,) return outputs @@ -734,7 +727,7 @@ def forward(self, patch_input: torch.Tensor): # Input encoding num_input_channels = 
patch_input.shape[1] if isinstance(self.input_embedding, nn.ModuleList): - embeddings = [ self.input_embedding[i](patch_input[:, i, :, :]) for i in range(num_input_channels)] + embeddings = [self.input_embedding[i](patch_input[:, i, :, :]) for i in range(num_input_channels)] embeddings = torch.stack(embeddings, dim=1) else: embeddings = self.input_embedding(patch_input) # x: [bs x num_channels x num_patches x d_model] @@ -798,7 +791,9 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) # Input embedding patch_input = self.embedder(patch_input) @@ -836,9 +831,7 @@ def forward( all_attentions = all_attentions + (layer_outputs[1],) # return past_values, hidden_states - return BaseModelOutput(last_hidden_state=hidden_state, - hidden_states=encoder_states, - attentions=all_attentions) + return BaseModelOutput(last_hidden_state=hidden_state, hidden_states=encoder_states, attentions=all_attentions) PATCHTST_START_DOCSTRING = r""" @@ -1170,9 +1163,7 @@ def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5) self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, - data: torch.Tensor, weights: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator @@ -1200,8 +1191,7 @@ class PatchTSTMeanScaler(nn.Module): """ def __init__( - self, dim: int = -1, keepdim: bool = True, - default_scale: Optional[float] = None, minimum_scale: float = 1e-10 + self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 ): super().__init__() self.dim = dim @@ -1210,9 +1200,9 @@ def __init__( self.default_scale = default_scale @torch.no_grad() - def forward(self, - data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # shape: (N, [C], T=1) ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -1276,9 +1266,9 @@ def __init__(self, config: PatchTSTConfig): else: self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) - def forward(self, - data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: data, loc, scale = self.scaler(data, observed_indicator) return data, loc, scale @@ -1330,9 +1320,7 @@ def forward( masked_values, mask = self.masking(patched_values), None encoder_output = self.encoder( - patch_input=masked_values, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions + patch_input=masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions ) if not return_dict: @@ -1430,7 +1418,7 @@ def 
forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) # model_output[0]: [bs x num_channels x num_patches x patch_length] or @@ -1446,11 +1434,9 @@ def forward( if not return_dict: outputs = (masked_loss, x_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForPretrainingOutput(loss=masked_loss, - prediction_output=x_hat, - hidden_states=encoder_states, - attentions=model_output.attentions - ) + return PatchTSTForPretrainingOutput( + loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states, attentions=model_output.attentions + ) class PatchTSTForClassification(PatchTSTPreTrainedModel): @@ -1503,7 +1489,7 @@ def forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) y_hat = self.head(model_output[0]) @@ -1515,11 +1501,12 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForClassificationOutput(loss=loss_val, - prediction_logits=y_hat, - hidden_states=model_output.hidden_states, - attentions=model_output.attentions - ) + return PatchTSTForClassificationOutput( + loss=loss_val, + prediction_logits=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, + ) class PatchTSTClassificationHead(nn.Module): @@ -1673,7 +1660,7 @@ def forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) # get output head. y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape @@ -1693,11 +1680,12 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForPredictionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states, - attentions=model_output.attentions - ) + return PatchTSTForPredictionOutput( + loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, + ) def generate( self, @@ -1888,7 +1876,7 @@ def forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) # get output head y_hat = self.head(model_output.last_hidden_state) @@ -2087,7 +2075,7 @@ def forward( past_values=past_values, past_observed_mask=past_observed_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions + output_attentions=output_attentions, ) # get output head. 
y_hat is of shape [bs x num_output_channels] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) @@ -2106,11 +2094,12 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForRegressionOutput(loss=loss_val, - prediction_output=y_hat, - hidden_states=model_output.hidden_states, - attentions=model_output.attentions - ) + return PatchTSTForRegressionOutput( + loss=loss_val, + prediction_output=y_hat, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, + ) def generate( self, From b07c55f3f7a12f807edec76d8c07058b0284b17f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 27 Oct 2023 21:52:24 +0200 Subject: [PATCH 157/189] Update doc with image patchtst.md --- docs/source/en/model_doc/patchtst.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 88094385c1500d..ba4b5e27636056 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -28,6 +28,11 @@ Tips: The model can also be used for time series classification and time series regression. See the respective [`PatchTSTForClassification`] and [`PatchTSTForRegression`] classes. +At a high level the model vectorizes time series into patches of a given size and encodes them via a Transformer which then outputs the prediction length forecasts: + +![model](https://github.com/namctin/transformers/assets/8100/150af169-29de-419a-8d98-eb78251c21fa) + + This model was contributed by [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), and [kashif](https://huggingface.co/kashif). The original code can be found [here](https://github.com/yuqinie98/PatchTST). @@ -71,4 +76,4 @@ The original code can be found [here](https://github.com/yuqinie98/PatchTST). 
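A minimal usage sketch of the forecasting path (the configuration values and tensor shapes below are illustrative placeholders, not settings prescribed by this patch series; the exact name of the forecast field on the returned output changes across later commits in this series, so it is not pinned down here):

```python
import torch

from transformers import PatchTSTConfig, PatchTSTForPrediction

# hypothetical configuration: 7 input channels, a 512-step context window,
# non-overlapping patches of length 16, and a 96-step forecast horizon
config = PatchTSTConfig(
    num_input_channels=7,
    context_length=512,
    patch_length=16,
    stride=16,
    prediction_length=96,
)
model = PatchTSTForPrediction(config)

# dummy batch of shape (batch_size, context_length, num_input_channels)
past_values = torch.randn(4, config.context_length, config.num_input_channels)

# the context window is split into (context_length - patch_length) // stride + 1 = 32 patches,
# each patch is embedded to d_model, run through the Transformer encoder, and the head
# maps the encoded patches to a prediction_length-step forecast carried on the output object
outputs = model(past_values=past_values)
```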
## PatchTSTForRegression [[autodoc]] PatchTSTForRegression - - forward \ No newline at end of file + - forward From 546f3e2db3bcba84727bc5d9fba4aaea920e766f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 6 Nov 2023 13:20:44 +0100 Subject: [PATCH 158/189] fix-copies --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 03dcf8e309bd75..d9c1371cc2dcd3 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -51,12 +51,15 @@ def __init__( dropout: float = 0.0, is_decoder: bool = False, bias: bool = True, + is_causal: bool = False, + config: Optional[PatchTSTConfig] = None, ): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads + self.config = config if (self.head_dim * num_heads) != self.embed_dim: raise ValueError( @@ -65,6 +68,7 @@ def __init__( ) self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder + self.is_causal = is_causal self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) From e7c687e7a37d2f2f4e27a0f5ec93a8444a656c36 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 6 Nov 2023 19:25:57 +0100 Subject: [PATCH 159/189] rename Forecast <-> Prediction --- .../models/patchtst/modeling_patchtst.py | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index d9c1371cc2dcd3..63352c121d556c 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -947,15 +947,15 @@ class PatchTSTForPretrainingOutput(ModelOutput): @dataclass -class PatchTSTForPredictionOutput(ModelOutput): +class PatchTSTForForecastingOutput(ModelOutput): """ - Output type of [`PatchTSTForPredictiontion`]. + Output type of [`PatchTSTForForecastingtion`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction outputs of the time series modeling heads. + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length,)`): + Forecast outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. @@ -970,7 +970,7 @@ class PatchTSTForPredictionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_output: torch.FloatTensor = None + forecast_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -983,7 +983,7 @@ class PatchTSTForRegressionOutput(ModelOutput): Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. 
- prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of @@ -999,22 +999,22 @@ class PatchTSTForRegressionOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - prediction_output: torch.FloatTensor = None + forecast_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @dataclass -class PatchTSTForForecastingOutput(ModelOutput): +class PatchTSTForPredictionOutput(ModelOutput): """ - Output type of [`PatchTSTForForecasting`]. + Output type of [`PatchTSTForPrediction`]. Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. - forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Forecasting outputs of the time series modeling heads. + prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, -1)`): + Prediction outputs of the time series modeling heads. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of @@ -1030,7 +1030,7 @@ class PatchTSTForForecastingOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] = None - forecast_outputs: torch.FloatTensor = None + prediction_outputs: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None loc: torch.FloatTensor = None @@ -1546,7 +1546,7 @@ def forward(self, embedding: torch.Tensor): return y -class PatchTSTPredictionHead(nn.Module): +class PatchTSTForecastHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1598,7 +1598,7 @@ def forward(self, embedding: torch.Tensor): return y -class PatchTSTForPrediction(PatchTSTPreTrainedModel): +class PatchTSTForForecasting(PatchTSTPreTrainedModel): """ PatchTST model for prediction. The model contains PatchTST model + prediction head """ @@ -1621,7 +1621,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = PatchTSTPredictionHead(config, self.distribution_output) + self.head = PatchTSTForecastHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1634,7 +1634,7 @@ def forward( output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = True, - ) -> Union[Tuple, PatchTSTForPredictionOutput]: + ) -> Union[Tuple, PatchTSTForForecastingOutput]: """ Parameters: past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): @@ -1652,7 +1652,7 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: - `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) """ @@ -1684,9 +1684,9 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) return tuple(v for v in outputs if v is not None) - return PatchTSTForPredictionOutput( + return PatchTSTForForecastingOutput( loss=loss_val, - prediction_output=y_hat, + forecast_outputs=y_hat, hidden_states=model_output.hidden_states, attentions=model_output.attentions, ) @@ -1726,7 +1726,7 @@ def generate( ) # get distribution - distribution = self.distribution_output.distribution(outputs.prediction_output) + distribution = self.distribution_output.distribution(outputs.forecast_outputs) # get samples samples = [ distribution.sample() for _ in range(num_parallel_samples) @@ -1736,7 +1736,7 @@ def generate( return SamplePatchTSTPredictionOutput(sequences=samples) -class PatchTSTForecastHead(nn.Module): +class PatchTSTPredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1817,7 +1817,7 @@ def forward(self, embedding: torch.Tensor): return output -class PatchTSTForForecasting(PatchTSTPreTrainedModel): +class PatchTSTForPrediction(PatchTSTPreTrainedModel): """ PatchTST for forecasting. The model contains PatchTST model + Forecasting head """ @@ -1838,7 +1838,7 @@ def __init__(self, config: PatchTSTConfig): else: raise ValueError(f"Unknown distribution output {config.distribution_output}") - self.head = PatchTSTForecastHead(config, self.distribution_output) + self.head = PatchTSTPredictionHead(config, self.distribution_output) # Initialize weights and apply final processing self.post_init() @@ -1851,7 +1851,7 @@ def forward( output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, PatchTSTForForecastingOutput]: + ) -> Union[Tuple, PatchTSTForPredictionOutput]: """ Parameters: past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): @@ -1869,7 +1869,7 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
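For example, supplying `future_values` makes the head output be scored with the configured loss. A sketch under stated assumptions: the config fields below are only those read elsewhere in this file, `loss="mse"` is passed explicitly rather than relying on an unseen default, and the channel count of `future_values` is assumed to match the model's input channels:

```python
import torch

from transformers import PatchTSTConfig, PatchTSTForPrediction

config = PatchTSTConfig(
    num_input_channels=7,
    context_length=512,
    patch_length=16,
    stride=16,
    prediction_length=96,
    loss="mse",  # assumption: with "mse" no distribution head is built and nn.MSELoss is used
)
model = PatchTSTForPrediction(config)

past_values = torch.randn(4, config.context_length, config.num_input_channels)
# assumed to have the same channel count as the inputs for this sketch
future_values = torch.randn(4, config.prediction_length, config.num_input_channels)

outputs = model(past_values=past_values, future_values=future_values)
outputs.loss.backward()  # mean MSE between the forecasts and future_values
```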
Returns: - `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) """ @@ -1909,9 +1909,9 @@ def forward( if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions, loc, scale) return tuple(v for v in outputs if v is not None) - return PatchTSTForForecastingOutput( + return PatchTSTForPredictionOutput( loss=loss_val, - forecast_outputs=y_hat, + prediction_outputs=y_hat, hidden_states=model_output.hidden_states, attentions=model_output.attentions, loc=loc, @@ -1955,7 +1955,7 @@ def generate( # get distribution distribution = self.distribution_output.distribution( - outputs.forecast_outputs, loc=outputs.loc, scale=outputs.scale + outputs.prediction_outputs, loc=outputs.loc, scale=outputs.scale ) # get samples samples = [ @@ -2100,7 +2100,7 @@ def forward( return tuple(v for v in outputs if v is not None) return PatchTSTForRegressionOutput( loss=loss_val, - prediction_output=y_hat, + forecast_outputs=y_hat, hidden_states=model_output.hidden_states, attentions=model_output.attentions, ) @@ -2140,7 +2140,7 @@ def generate( ) # get distribution - distribution = self.distribution_output.distribution(outputs.prediction_output) + distribution = self.distribution_output.distribution(outputs.forecast_outputs) # get samples samples = [ distribution.sample() for _ in range(num_parallel_samples) From 609a9d35d5aee21876989031872a1033d2870c6d Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 14:27:47 -0500 Subject: [PATCH 160/189] change name of a few parameters to match with PatchTSMixer. --- .../models/patchtst/modeling_patchtst.py | 224 ++---------------- 1 file changed, 17 insertions(+), 207 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 63352c121d556c..e5a0af1e86d02b 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1074,7 +1074,7 @@ class SamplePatchTSTPredictionOutput(ModelOutput): distribution. Parameters: - sequences `(batch_size, num_samples, prediction_length, num_output_channels)`): + sequences `(batch_size, num_samples, prediction_length, num_targets)`): Sampled values from the chosen distribution. """ @@ -1103,7 +1103,7 @@ class SamplePatchTSTRegressionOutput(ModelOutput): distribution. Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_output_channels)` + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_targets)` Sampled values from the chosen distribution. 
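A shape-only sketch of how such a tensor is assembled (all numbers are placeholders; this mirrors the stack over `num_parallel_samples` distribution draws done in `generate`, without needing a trained model):

```python
import torch

batch_size, num_parallel_samples, num_targets = 4, 100, 7

# each draw from the output distribution has shape (batch_size, num_targets);
# stacking the draws on dim=1 yields the `sequences` tensor documented above
samples = torch.stack(
    [torch.randn(batch_size, num_targets) for _ in range(num_parallel_samples)], dim=1
)
print(samples.shape)  # torch.Size([4, 100, 7])
```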
""" @@ -1460,7 +1460,7 @@ def __init__(self, config: PatchTSTConfig): def forward( self, past_values: torch.Tensor, - labels: torch.Tensor = None, + target_values: torch.Tensor = None, past_observed_mask: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -1470,7 +1470,7 @@ def forward( Parameters: past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): Input sequence to the model - labels (`torch.Tensor`, *optional*): labels associates with the `past_values` + target_values (`torch.Tensor`, *optional*): labels associates with the `past_values` past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in `[0, 1]`: @@ -1498,9 +1498,9 @@ def forward( y_hat = self.head(model_output[0]) loss_val = None - if labels is not None: + if target_values is not None: loss = nn.CrossEntropyLoss() - loss_val = loss(y_hat, labels) + loss_val = loss(y_hat, target_values) if not return_dict: outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) @@ -1520,7 +1520,7 @@ def __init__(self, config: PatchTSTConfig): self.pooling = config.pooling self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_labels) + self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_targets) def forward(self, embedding: torch.Tensor): """ @@ -1529,7 +1529,7 @@ def forward(self, embedding: torch.Tensor): or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): Embedding from the model Returns: - `torch.Tensor` of shape `(bs, num_labels)` + `torch.Tensor` of shape `(bs, num_targets)` """ if self.use_cls_token: @@ -1546,196 +1546,6 @@ def forward(self, embedding: torch.Tensor): return y -class PatchTSTForecastHead(nn.Module): - def __init__(self, config: PatchTSTConfig, distribution_output=None): - super().__init__() - - self.num_output_channels = config.num_output_channels - self.use_cls_token = config.use_cls_token - self.pooling = config.pooling - - head_dim = config.num_input_channels * config.d_model - - self.flatten = nn.Flatten(start_dim=1) - self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() - - if distribution_output is None: - self.projection = nn.Linear(head_dim, config.prediction_length * config.num_output_channels) - else: - self.projection = distribution_output.get_parameter_projection(head_dim) - - def forward(self, embedding: torch.Tensor): - """ - Parameters: - embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` - or `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*): - Embedding from the model - Returns: - `torch.Tensor` of shape `(bs, pred_len, num_output_channels)` - - """ - batch_size = embedding.shape[0] - if self.use_cls_token: - x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] - elif self.pooling == "mean": - x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] - elif self.pooling == "max": - x = embedding.max(dim=2) # x: [bs x num_channels x d_model] - else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") - - # flatten the input - x = 
self.dropout(self.flatten(x)) # x: bs x (num_channels * d_model) - # projection - y = self.projection(x) - # reshape y - if isinstance(y, tuple): # for distribution head - y = ( - z.reshape(batch_size, -1, self.num_output_channels) for z in y - ) # tuple of [bs x prediction_len x num_output_channels] - else: # for linear head - y = y.reshape(batch_size, -1, self.num_output_channels) # [bs x prediction_len x num_output_channels] - return y - - -class PatchTSTForForecasting(PatchTSTPreTrainedModel): - """ - PatchTST model for prediction. The model contains PatchTST model + prediction head - """ - - def __init__(self, config: PatchTSTConfig): - super().__init__(config) - - self.model = PatchTSTModel(config) - if config.loss == "mse": - self.distribution_output = None - else: - if config.distribution_output == "student_t": - self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_output_channels) - elif config.distribution_output == "normal": - self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) - elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput( - dim=config.prediction_length * config.num_output_channels - ) - else: - raise ValueError(f"Unknown distribution output {config.distribution_output}") - - self.head = PatchTSTForecastHead(config, self.distribution_output) - - # Initialize weights and apply final processing - self.post_init() - - def forward( - self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - future_values: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = True, - ) -> Union[Tuple, PatchTSTForForecastingOutput]: - """ - Parameters: - past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): - Input sequence to the model - past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): - Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected - in `[0, 1]`: - - - 1 for values that are **observed**, - - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - future_values (`torch.Tensor` of shape `(bs, pred_len, num_output_channels)`, *optional*): - future target values associates with the `past_values` - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers - return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. - - Returns: - `PatchTSTForForecastingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or - `config.return_dict`=False) - - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # get model output - model_output = self.model( - past_values=past_values, - past_observed_mask=past_observed_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - ) - - # get output head. 
y_hat is of shape [bs x pred_len x num_output_channels] or tuple of this shape - y_hat = self.head(model_output.last_hidden_state) - - loss_val = None - if future_values is not None: - if self.distribution_output: - distribution = self.distribution_output.distribution(y_hat) - loss_val = nll(distribution, future_values) - # take average of the loss - loss_val = weighted_average(loss_val) - else: - loss = nn.MSELoss(reduction="mean") - loss_val = loss(y_hat, future_values) - - if not return_dict: - outputs = (loss_val, y_hat, model_output.hidden_states, model_output.attentions) - return tuple(v for v in outputs if v is not None) - return PatchTSTForForecastingOutput( - loss=loss_val, - forecast_outputs=y_hat, - hidden_states=model_output.hidden_states, - attentions=model_output.attentions, - ) - - def generate( - self, - past_values: torch.Tensor, - past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTPredictionOutput: - """ - Generate sequences of sample predictions from a model with a probability distribution head. - - Args: - past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`): - Past values of the time series that serves as context in order to predict the future. - - past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): - Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected - in `[0, 1]`: - - - 1 for values that are **observed**, - - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). - - Return: - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, - number of samples, prediction_length, num_output_channels)` - """ - # get number of samples - num_parallel_samples = self.config.num_parallel_samples - - # get model output - outputs = self( - past_values=past_values, - target_values=None, - past_observed_mask=past_observed_mask, - output_hidden_states=False, - ) - - # get distribution - distribution = self.distribution_output.distribution(outputs.forecast_outputs) - # get samples - samples = [ - distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x pred_len x num_output_channels] - # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x pred_len x num_output_channels] - return SamplePatchTSTPredictionOutput(sequences=samples) - - class PatchTSTPredictionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() @@ -1973,7 +1783,7 @@ class PatchTSTRegressionHead(nn.Module): def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() - self.y_range = config.prediction_range + self.y_range = config.output_range self.use_cls_token = config.use_cls_token self.pooling = config.pooling self.distribution_output = distribution_output @@ -1984,7 +1794,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() if distribution_output is None: - self.projection = nn.Linear(head_dim, config.num_output_channels) + self.projection = nn.Linear(head_dim, config.num_targets) else: self.projection = distribution_output.get_parameter_projection(head_dim) @@ -2028,12 +1838,12 @@ def __init__(self, config: PatchTSTConfig): self.distribution_output = None else: if config.distribution_output == "student_t": - self.distribution_output = 
StudentTOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_targets) elif config.distribution_output == "normal": - self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_output_channels) + self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_targets) elif config.distribution_output == "negative_binomial": self.distribution_output = NegativeBinomialOutput( - dim=config.prediction_length * config.num_output_channels + dim=config.prediction_length * config.num_targets ) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -2081,7 +1891,7 @@ def forward( output_hidden_states=output_hidden_states, output_attentions=output_attentions, ) - # get output head. y_hat is of shape [bs x num_output_channels] or tuple of this shape + # get output head. y_hat is of shape [bs x num_targets] or tuple of this shape y_hat = self.head(model_output.last_hidden_state) loss_val = None @@ -2126,7 +1936,7 @@ def generate( Return: [`SamplePatchTSTRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, - number of samples, num_output_channels)`. + number of samples, num_targets)`. """ # get number of samples num_parallel_samples = self.config.num_parallel_samples @@ -2144,7 +1954,7 @@ def generate( # get samples samples = [ distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x num_output_channels] + ] # samples: list of [bs x num_targets] # stack tensors - samples = torch.stack(samples, dim=1) # [bs x num_samples x num_output_channels] + samples = torch.stack(samples, dim=1) # [bs x num_samples x num_targets] return SamplePatchTSTRegressionOutput(sequences=samples) From 7f0561086d685ea79407ddf39d26adbdac2c4e62 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 14:28:16 -0500 Subject: [PATCH 161/189] Remove *ForForecasting class to match with other time series models. --- .../models/patchtst/configuration_patchtst.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index b3fad61f911f5d..e7fb491f435052 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -123,12 +123,12 @@ class PatchTSTConfig(PretrainedConfig): The dropout probability for head. prediction_length (`int`, *optional*, defaults to 24): The prediction length for the encoder. In other words, the prediction horizon of the model. - num_output_channels (`int`, *optional*, defaults to 1): - Number of output channels. - prediction_range (`list`, *optional*): - The range of prediction values can be set to enforce the model to produce values within a range. + num_targets (`int`, *optional*, defaults to 1): + Number of targets for regression and classificastion tasks. For classification, it is the number of classes. + output_range (`list`, *optional*): + Output range for regression task. The range of output values can be set to enforce the model to produce values within a range. num_parallel_samples (`int`, *optional*, defaults to 100): - The number of samples to generate in parallel for probablistic forecast. + The number of samples to generate in parallel for probablistic prediction. 
```python @@ -196,8 +196,8 @@ def __init__( pooling: str = "mean", head_dropout: float = 0.0, prediction_length: int = 24, - num_output_channels: int = 1, - prediction_range: List = None, + num_targets: int = 1, + output_range: List = None, # distribution head num_parallel_samples: int = 100, **kwargs, @@ -231,7 +231,7 @@ def __init__( self.init_std = init_std self.scaling = scaling - # PatchTST + # PatchTST parameters self.patch_length = patch_length self.stride = stride self.num_patches = self._num_patches() @@ -251,16 +251,16 @@ def __init__( self.pooling = pooling self.head_dropout = head_dropout - # Forecast head + # For prediction head self.shared_projection = shared_projection - - # Forcasting and prediction self.prediction_length = prediction_length + + # For prediction and regression head self.num_parallel_samples = num_parallel_samples # Regression - self.num_output_channels = num_output_channels - self.prediction_range = prediction_range + self.num_targets = num_targets + self.output_range = output_range super().__init__(**kwargs) From 9807142518b02a69ddeb3bb0b7e33d8f6a3e54dc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 6 Nov 2023 20:44:44 +0100 Subject: [PATCH 162/189] make style --- .../models/patchtst/configuration_patchtst.py | 6 ++++-- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index e7fb491f435052..7a91549684feb7 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -124,9 +124,11 @@ class PatchTSTConfig(PretrainedConfig): prediction_length (`int`, *optional*, defaults to 24): The prediction length for the encoder. In other words, the prediction horizon of the model. num_targets (`int`, *optional*, defaults to 1): - Number of targets for regression and classificastion tasks. For classification, it is the number of classes. + Number of targets for regression and classificastion tasks. For classification, it is the number of + classes. output_range (`list`, *optional*): - Output range for regression task. The range of output values can be set to enforce the model to produce values within a range. + Output range for regression task. The range of output values can be set to enforce the model to produce + values within a range. num_parallel_samples (`int`, *optional*, defaults to 100): The number of samples to generate in parallel for probablistic prediction. 
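For reference, a minimal sketch of how the renamed configuration fields fit together; the numbers below are illustrative values (not taken from any checkpoint), and every field not shown is assumed to keep its default:

```python
from transformers import PatchTSTConfig

# Illustrative values only; all other fields keep their defaults.
config = PatchTSTConfig(
    prediction_length=24,      # forecasting horizon used by the prediction head
    num_targets=2,             # replaces num_output_channels for the regression/classification heads
    output_range=[0.0, 1.0],   # replaces prediction_range; bounds the regression output
    num_parallel_samples=100,  # number of samples drawn by generate() with a distribution head
)
print(config.num_targets, config.output_range)
```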
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index e5a0af1e86d02b..9aeb6420c13c81 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1842,9 +1842,7 @@ def __init__(self, config: PatchTSTConfig): elif config.distribution_output == "normal": self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_targets) elif config.distribution_output == "negative_binomial": - self.distribution_output = NegativeBinomialOutput( - dim=config.prediction_length * config.num_targets - ) + self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_targets) else: raise ValueError(f"Unknown distribution output {config.distribution_output}") @@ -1952,9 +1950,7 @@ def generate( # get distribution distribution = self.distribution_output.distribution(outputs.forecast_outputs) # get samples - samples = [ - distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x num_targets] + samples = [distribution.sample() for _ in range(num_parallel_samples)] # samples: list of [bs x num_targets] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x num_targets] return SamplePatchTSTRegressionOutput(sequences=samples) From 3b8a3063efecadff8dd306ac942ef332041c5712 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 14:49:18 -0500 Subject: [PATCH 163/189] Remove PatchTSTForForecasting in the test --- tests/models/patchtst/test_modeling_patchtst.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 68ba5030b35430..be833bef3ba982 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -40,7 +40,6 @@ MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, PatchTSTConfig, PatchTSTForClassification, - PatchTSTForForecasting, PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTForRegression, @@ -148,7 +147,6 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase ( PatchTSTModel, PatchTSTForPrediction, - PatchTSTForForecasting, PatchTSTForPretraining, PatchTSTForClassification, PatchTSTForRegression, @@ -157,7 +155,7 @@ class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase else () ) all_generative_model_classes = ( - (PatchTSTForPrediction, PatchTSTForForecasting, PatchTSTForPretraining) if is_torch_available() else () + (PatchTSTForPrediction, PatchTSTForRegression, PatchTSTForPretraining) if is_torch_available() else () ) pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {} test_pruning = False @@ -335,7 +333,7 @@ def test_pretrain_head(self): # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
def test_forecast_head(self): - model = PatchTSTForForecasting.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) + model = PatchTSTForPrediction.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) batch = prepare_batch(file="test-batch.pt") From ff45a2036f2bed25a948151df42507a145146a39 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 14:58:45 -0500 Subject: [PATCH 164/189] remove PatchTSTForForecastingOutput class --- .../models/patchtst/modeling_patchtst.py | 50 ++----------------- 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9aeb6420c13c81..ca318fbfcb8d86 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -946,35 +946,6 @@ class PatchTSTForPretrainingOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None -@dataclass -class PatchTSTForForecastingOutput(ModelOutput): - """ - Output type of [`PatchTSTForForecastingtion`]. - - Parameters: - loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): - MSE loss. - forecast_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length,)`): - Forecast outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - forecast_outputs: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - @dataclass class PatchTSTForRegressionOutput(ModelOutput): """ @@ -1081,21 +1052,6 @@ class SamplePatchTSTPredictionOutput(ModelOutput): sequences: torch.FloatTensor = None -@dataclass -class SamplePatchTSTForecastOutput(ModelOutput): - """ - Base class for time series model's predictions outputs that contains the sampled values from the chosen - distribution. - - Parameters: - sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, - num_samples, prediction_length, number_channels)`): - Sampled values from the chosen distribution. - """ - - sequences: torch.FloatTensor = None - - @dataclass class SamplePatchTSTRegressionOutput(ModelOutput): """ @@ -1732,7 +1688,7 @@ def generate( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - ) -> SamplePatchTSTForecastOutput: + ) -> SamplePatchTSTPredictionOutput: """ Generate sequences of sample predictions from a model with a probability distribution head. @@ -1748,7 +1704,7 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). 
Return: - [`SamplePatchTSTForecastOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. """ @@ -1773,7 +1729,7 @@ def generate( ] # samples: list of [bs x forecast_len x num_channels] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x num_channels] - return SamplePatchTSTForecastOutput(sequences=samples) + return SamplePatchTSTPredictionOutput(sequences=samples) class PatchTSTRegressionHead(nn.Module): From abc64c0d7b4210ef819d8dd761232ff0cf008e7f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Mon, 6 Nov 2023 15:00:33 -0500 Subject: [PATCH 165/189] change test_forecast_head to test_prediction_head --- tests/models/patchtst/test_modeling_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index be833bef3ba982..07d30826acc813 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -332,7 +332,7 @@ def test_pretrain_head(self): self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. - def test_forecast_head(self): + def test_prediction_head(self): model = PatchTSTForPrediction.from_pretrained("ibm/patchtst-etth1-forecast").to(torch_device) batch = prepare_batch(file="test-batch.pt") @@ -342,7 +342,7 @@ def test_forecast_head(self): output = model( past_values=batch["past_values"].to(torch_device), future_values=batch["future_values"].to(torch_device), - ).forecast_outputs + ).prediction_outputs expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels]) self.assertEqual(output.shape, expected_shape) From 6b3fb305bb0b1e3696137cf6769a62f451fa271c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 7 Nov 2023 09:33:22 +0100 Subject: [PATCH 166/189] style --- src/transformers/models/patchtst/modeling_patchtst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index ca318fbfcb8d86..23c1254acd14ee 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1704,8 +1704,8 @@ def generate( - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). Return: - [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number - of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, + [`SamplePatchTSTPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, + number of samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)` for multivariate predictions. 
""" # get number of samples From 69897e3225fd7033cef4d6ec089af29d8e1f4773 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 7 Nov 2023 10:10:19 +0100 Subject: [PATCH 167/189] fix docs --- docs/source/en/model_doc/patchtst.md | 6 ------ src/transformers/__init__.py | 2 -- src/transformers/models/patchtst/__init__.py | 2 -- src/transformers/utils/dummy_pt_objects.py | 7 ------- utils/check_repo.py | 1 - 5 files changed, 18 deletions(-) diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index ba4b5e27636056..c18abeb20e64ef 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -55,12 +55,6 @@ The original code can be found [here](https://github.com/yuqinie98/PatchTST). - forward -## PatchTSTForForecasting - -[[autodoc]] PatchTSTForForecasting - - forward - - ## PatchTSTForClassification [[autodoc]] PatchTSTForClassification diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 1a29b855d5d38a..8c66ea0f2333c3 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2482,7 +2482,6 @@ [ "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST", "PatchTSTForClassification", - "PatchTSTForForecasting", "PatchTSTForPrediction", "PatchTSTForPretraining", "PatchTSTForRegression", @@ -6375,7 +6374,6 @@ from .models.patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, - PatchTSTForForecasting, PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTForRegression, diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py index e2ac594688d90e..8c7db64c198406 100644 --- a/src/transformers/models/patchtst/__init__.py +++ b/src/transformers/models/patchtst/__init__.py @@ -35,7 +35,6 @@ "PatchTSTModel", "PatchTSTPreTrainedModel", "PatchTSTForPrediction", - "PatchTSTForForecasting", "PatchTSTForPretraining", "PatchTSTForRegression", "PatchTSTForClassification", @@ -54,7 +53,6 @@ from .modeling_patchtst import ( PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST, PatchTSTForClassification, - PatchTSTForForecasting, PatchTSTForPrediction, PatchTSTForPretraining, PatchTSTForRegression, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index d05b00c864cb5f..8d07da493c72a8 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5990,13 +5990,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PatchTSTForForecasting(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class PatchTSTForPrediction(metaclass=DummyObject): _backends = ["torch"] diff --git a/utils/check_repo.py b/utils/check_repo.py index ea3acc1311f1f9..d510fe43531a62 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -185,7 +185,6 @@ "TimeSeriesTransformerForPrediction", "InformerForPrediction", "AutoformerForPrediction", - "PatchTSTForForecasting", "PatchTSTForPretraining", "PatchTSTForPrediction", "JukeboxVQVAE", From 42ec43bd2667e95281698aff1da0cec69c05a04c Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 7 Nov 2023 10:28:42 +0100 Subject: [PATCH 168/189] fix tests --- src/transformers/models/patchtst/modeling_patchtst.py | 2 -- tests/models/patchtst/test_modeling_patchtst.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py 
b/src/transformers/models/patchtst/modeling_patchtst.py index 23c1254acd14ee..693f3050f851ec 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -983,10 +983,8 @@ class PatchTSTForPredictionOutput(ModelOutput): Parameters: loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): MSE loss. - prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, -1)`): Prediction outputs of the time series modeling heads. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 07d30826acc813..de3a42ef5fd9b5 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -193,7 +193,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): rng = random.Random(self.model_tester.seed_number) labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_labels, rng=rng) - inputs_dict["labels"] = labels + inputs_dict["target_values"] = labels inputs_dict.pop("future_values") elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): rng = random.Random(self.model_tester.seed_number) @@ -281,7 +281,7 @@ def test_forward_signature(self): ): expected_arg_names.remove("future_values") expected_arg_names.remove("past_observed_mask") - expected_arg_names.append("labels") if model_class in get_values( + expected_arg_names.append("target_values") if model_class in get_values( MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING ) else expected_arg_names.append("target_values") expected_arg_names.append("past_observed_mask") From f451c0574d2ab081adb7185c204d0241d596ac7c Mon Sep 17 00:00:00 2001 From: nnguyen Date: Tue, 7 Nov 2023 17:11:01 -0500 Subject: [PATCH 169/189] change num_labels to num_targets --- tests/models/patchtst/test_modeling_patchtst.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index de3a42ef5fd9b5..4313591da29609 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -70,7 +70,7 @@ def __init__( lags_sequence=[1, 2, 3, 4, 5], distil=False, seed_number=42, - num_labels=2, + num_targets=2, num_output_channels=2, ): self.parent = parent @@ -92,7 +92,7 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.seed_number = seed_number - self.num_labels = num_labels + self.num_targets = num_targets self.num_output_channels = num_output_channels self.distil = distil self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 @@ -112,7 +112,7 @@ def get_config(self): context_length=self.context_length, activation_function=self.hidden_act, seed_number=self.seed_number, - num_labels=self.num_labels, + num_targets=self.num_targets, num_output_channels=self.num_output_channels, ) @@ -192,7 +192,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): # else 
if classification model: elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): rng = random.Random(self.model_tester.seed_number) - labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_labels, rng=rng) + labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_targets, rng=rng) inputs_dict["target_values"] = labels inputs_dict.pop("future_values") elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): From 131454d0c8d72c44a10e01dc88cf8e438aa9b208 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 17:00:37 -0500 Subject: [PATCH 170/189] Remove PatchTSTTranspose --- .../models/patchtst/modeling_patchtst.py | 45 +++---------------- 1 file changed, 7 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 693f3050f851ec..194e19ae3f508e 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -199,33 +199,6 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value -class PatchTSTTranspose(nn.Module): - """ - Transpose the tensor to the dimension defined in **dims** - - Parameters: - dims (`list`): list of dimensions to be transposed - contiguous (`bool`, default to False): if True, the transposed tensor is contiguous - """ - - def __init__(self, *dims, contiguous=False): - super().__init__() - self.dims = dims - self.contiguous = dims - - def forward(self, inputs: torch.Tensor): - """ - Parameters: - inputs (`torch.Tensor`): input to be transposed - Returns: - `torch.Tensor`: transposed tensor - """ - if self.contiguous: - return inputs.transpose(*self.dims).contiguous() - else: - return inputs.transpose(*self.dims) - - class PatchTSTBatchNorm(nn.Module): """ Parameters: @@ -233,11 +206,9 @@ class PatchTSTBatchNorm(nn.Module): d_model (`int`): model dimension """ - def __init__(self, d_model): + def __init__(self, config): super().__init__() - self.d_model = d_model - self.transpose = PatchTSTTranspose(1, 2) - self.batchnorm = nn.BatchNorm1d(self.d_model) + self.batchnorm = nn.BatchNorm1d(config.d_model) def forward(self, inputs: torch.Tensor): """ @@ -245,12 +216,11 @@ def forward(self, inputs: torch.Tensor): inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`): input for Batch norm calculation Returns: - `torch.Tensor`: tensor + `torch.Tensor` of shape `(batch_size, sequence_length, d_model)` """ - output = self.transpose(inputs) # output: (batch_size, d_model, sequence_length) + output = inputs.transpose(1, 2) # output: (batch_size, d_model, sequence_length) output = self.batchnorm(output) - output = self.transpose(output) # output: (batch_size, sequence_length, d_model) - return output + return output.transpose(1, 2) def positional_encoding(position_embedding_type, learned, q_len, d_model): @@ -742,7 +712,6 @@ class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder """ - def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -1297,7 +1266,7 @@ def forward( ) -class MaskPretrainHead(nn.Module): +class PatchTSTMaskPretrainHead(nn.Module): """ Pretraining head for mask modelling """ @@ -1335,7 +1304,7 @@ def __init__(self, config: PatchTSTConfig): config.mask_input = True self.model = PatchTSTModel(config=config) - self.head = MaskPretrainHead(config) + self.head = PatchTSTMaskPretrainHead(config) # Initialize 
weights and apply final processing self.post_init() From 58dd1ec0bdf43ae2f117c0bfa9c3fef46b1dd1c2 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 17:28:43 -0500 Subject: [PATCH 171/189] remove arguments in PatchTSTMeanScaler --- .../models/patchtst/modeling_patchtst.py | 95 +++++++++---------- 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 194e19ae3f508e..018df19f60c153 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1103,83 +1103,67 @@ def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tens # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->PatchTST class PatchTSTMeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - default_scale (`float`, *optional*, defaults to `None`): - Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. - minimum_scale (`float`, *optional*, defaults to 1e-10): - Default minimum possible scale that is used for any item. """ - def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 - ): + def __init__(self): super().__init__() - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale - self.default_scale = default_scale @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # shape: (N, [C], T=1) - ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) - num_observed = observed_indicator.sum(self.dim, keepdim=True) - + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + ts_sum = (data * observed_indicator).abs().sum(dim=1, keepdim=True) + num_observed = observed_indicator.sum(dim=1, keepdim=True) scale = ts_sum / torch.clamp(num_observed, min=1) - # If `default_scale` is provided, we use it, otherwise we use the scale - # of the batch. - if self.default_scale is None: - batch_sum = ts_sum.sum(dim=0) - batch_observations = torch.clamp(num_observed.sum(0), min=1) - default_scale = torch.squeeze(batch_sum / batch_observations) - else: - default_scale = self.default_scale * torch.ones_like(scale) + # use the scale of the batch. 
+ batch_sum = ts_sum.sum(dim=0) + batch_observations = torch.clamp(num_observed.sum(0), min=1) + default_scale = torch.squeeze(batch_sum / batch_observations) # apply default scale where there are no observations scale = torch.where(num_observed > 0, scale, default_scale) - # ensure the scale is at least `self.minimum_scale` - scale = torch.clamp(scale, min=self.minimum_scale) + # ensure the scale is at least 1e-10 + scale = torch.clamp(scale, min=1e-10) scaled_data = data / scale - - if not self.keepdim: - scale = scale.squeeze(dim=self.dim) - return scaled_data, torch.zeros_like(scale), scale # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->PatchTST class PatchTSTNOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. """ - def __init__(self, dim: int, keepdim: bool = False): + def __init__(self): super().__init__() - self.dim = dim - self.keepdim = keepdim def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) - loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + scale = torch.ones_like(data, requires_grad=False).mean(dim=1, keepdim=True) + loc = torch.zeros_like(data, requires_grad=False).mean(dim=1, keepdim=True) return data, loc, scale @@ -1187,15 +1171,24 @@ class PatchTSTScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() if config.scaling == "mean" or config.scaling is True: - self.scaler = PatchTSTMeanScaler(dim=1, keepdim=True) + self.scaler = PatchTSTMeanScaler() elif config.scaling == "std": - self.scaler = PatchTSTStdScaler(dim=1, keepdim=True) + self.scaler = PatchTSTStdScaler() else: - self.scaler = PatchTSTNOPScaler(dim=1, keepdim=True) + self.scaler = PatchTSTNOPScaler() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, um_input_channels)`) + """ data, loc, scale = self.scaler(data, observed_indicator) return data, loc, scale From 9a69973cc8168ebb30c4f1d05bd5a53f544e25dd Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 17:52:02 -0500 Subject: [PATCH 172/189] remove arguments in PatchTSTStdScaler --- .../models/patchtst/modeling_patchtst.py | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git 
a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 018df19f60c153..a35e69ded624dd 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1069,34 +1069,33 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ - Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by subtracting from the mean and dividing by the standard deviation. - - Args: - dim (`int`): - Dimension along which to calculate the mean and standard deviation. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - minimum_scale (`float`, *optional*, defaults to 1e-5): - Default scale that is used for elements that are constantly zero along dimension `dim`. """ - def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + def __init__(self): super().__init__() - if not dim > 0: - raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - denominator = weights.sum(self.dim, keepdim=self.keepdim) + def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + denominator = observed_indicator.sum(dim=1, keepdim=True) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(dim=1, keepdim=True) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator - scale = torch.sqrt(variance + self.minimum_scale) + variance = (((data - loc) * observed_indicator) ** 2).sum(dim=1, keepdim=True) / denominator + scale = torch.sqrt(variance + 1e-10) return (data - loc) / scale, loc, scale @@ -1106,7 +1105,6 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. """ - def __init__(self): super().__init__() @@ -1118,6 +1116,8 @@ def forward( Parameters: data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. 
Returns: tuple of `torch.Tensor` of shapes (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, @@ -1151,7 +1151,7 @@ def __init__(self): super().__init__() def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor + self, data: torch.Tensor, observed_indicator: torch.Tensor=None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters: @@ -1184,6 +1184,8 @@ def forward( Parameters: data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. Returns: tuple of `torch.Tensor` of shapes (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, From 7ab8d59233d42da87bcd1c37c35bf53bd6513f60 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 19:55:19 -0500 Subject: [PATCH 173/189] add config as an argument to all the scaler classes --- .../models/patchtst/modeling_patchtst.py | 62 ++++++++++++------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index a35e69ded624dd..9422ba9ccba459 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -206,7 +206,7 @@ class PatchTSTBatchNorm(nn.Module): d_model (`int`): model dimension """ - def __init__(self, config): + def __init__(self, config: PatchTSTConfig): super().__init__() self.batchnorm = nn.BatchNorm1d(config.d_model) @@ -1073,8 +1073,11 @@ class PatchTSTStdScaler(nn.Module): by subtracting from the mean and dividing by the standard deviation. """ - def __init__(self): + def __init__(self, config: PatchTSTConfig): super().__init__() + self.dim = 1 if config.scaling_dim is None else config.scaling_dim + self.keepdim = True if config.keepdim is None else config.keepdim + self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale @torch.no_grad() def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor @@ -1090,12 +1093,12 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, `(batch_size, 1, num_input_channels)`) """ - denominator = observed_indicator.sum(dim=1, keepdim=True) + denominator = weights.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * observed_indicator).sum(dim=1, keepdim=True) / denominator + loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * observed_indicator) ** 2).sum(dim=1, keepdim=True) / denominator - scale = torch.sqrt(variance + 1e-10) + variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale @@ -1105,8 +1108,12 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. 
""" - def __init__(self): + def __init__(self, config: PatchTSTConfig): super().__init__() + self.dim = 1 if config.scaling_dim is None else config.scaling_dim + self.keepdim = True if config.keepdim is None else config.keepdim + self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale + self.default_scale = config.default_scale if config.default_scale else None @torch.no_grad() def forward( @@ -1123,21 +1130,30 @@ def forward( (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, `(batch_size, 1, num_input_channels)`) """ - ts_sum = (data * observed_indicator).abs().sum(dim=1, keepdim=True) - num_observed = observed_indicator.sum(dim=1, keepdim=True) + ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) + num_observed = observed_indicator.sum(self.dim, keepdim=True) + scale = ts_sum / torch.clamp(num_observed, min=1) - # use the scale of the batch. - batch_sum = ts_sum.sum(dim=0) - batch_observations = torch.clamp(num_observed.sum(0), min=1) - default_scale = torch.squeeze(batch_sum / batch_observations) + # If `default_scale` is provided, we use it, otherwise we use the scale + # of the batch. + if self.default_scale is None: + batch_sum = ts_sum.sum(dim=0) + batch_observations = torch.clamp(num_observed.sum(0), min=1) + default_scale = torch.squeeze(batch_sum / batch_observations) + else: + default_scale = self.default_scale * torch.ones_like(scale) # apply default scale where there are no observations scale = torch.where(num_observed > 0, scale, default_scale) - # ensure the scale is at least 1e-10 - scale = torch.clamp(scale, min=1e-10) + # ensure the scale is at least `self.minimum_scale` + scale = torch.clamp(scale, min=self.minimum_scale) scaled_data = data / scale + + if not self.keepdim: + scale = scale.squeeze(dim=self.dim) + return scaled_data, torch.zeros_like(scale), scale @@ -1147,8 +1163,10 @@ class PatchTSTNOPScaler(nn.Module): Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. 
""" - def __init__(self): + def __init__(self, config: PatchTSTConfig): super().__init__() + self.dim = 1 if config.scaling_dim is None else config.scaling_dim + self.keepdim = True if config.keepdim is None else config.keepdim def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor=None @@ -1162,8 +1180,8 @@ def forward( (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, `(batch_size, 1, num_input_channels)`) """ - scale = torch.ones_like(data, requires_grad=False).mean(dim=1, keepdim=True) - loc = torch.zeros_like(data, requires_grad=False).mean(dim=1, keepdim=True) + scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale @@ -1171,11 +1189,11 @@ class PatchTSTScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() if config.scaling == "mean" or config.scaling is True: - self.scaler = PatchTSTMeanScaler() + self.scaler = PatchTSTMeanScaler(config) elif config.scaling == "std": - self.scaler = PatchTSTStdScaler() + self.scaler = PatchTSTStdScaler(config) else: - self.scaler = PatchTSTNOPScaler() + self.scaler = PatchTSTNOPScaler(config) def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor @@ -1266,7 +1284,7 @@ class PatchTSTMaskPretrainHead(nn.Module): Pretraining head for mask modelling """ - def __init__(self, config): + def __init__(self, config: PatchTSTConfig): super().__init__() self.dropout = nn.Dropout(config.dropout) self.linear = nn.Linear(config.d_model, config.patch_length) From 4caa376e9225736ddeb105da6aece13c8c54482f Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 20:10:58 -0500 Subject: [PATCH 174/189] reformat --- .../models/patchtst/modeling_patchtst.py | 99 ++++++++----------- 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 9422ba9ccba459..032d23326f0925 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -295,13 +295,13 @@ def random_masking( noise = torch.rand(batch_size, 1, sequence_length, device=device) # noise in [0, 1], bs x 1 x L noise = noise.repeat(1, num_channels, 1) # bs x num_channels x time else: + # noise in [0, 1], bs x num_channels x L noise = torch.rand( - batch_size, num_channels, sequence_length, device=device - ) # noise in [0, 1], bs x num_channels x L + batch_size, num_channels, sequence_length, device=device) + # mask: [bs x num_channels x num_patch] mask = torch.ones( - batch_size, num_channels, sequence_length, device=device - ) # mask: [bs x num_channels x num_patch] + batch_size, num_channels, sequence_length, device=device) mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -432,14 +432,13 @@ def forward(self, past_values: torch.Tensor): raise ValueError( f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})." 
) - - output = past_values[:, self.sequence_start :, :] # output: [bs x new_sequence_length x num_channels] + # output: [bs x new_sequence_length x num_channels] + output = past_values[:, self.sequence_start :, :] + # output: [bs x num_patches x num_input_channels x patch_length] output = output.unfold( - dimension=-2, size=self.patch_length, step=self.stride - ) # output: [bs x num_patches x num_input_channels x patch_length] - output = output.transpose( - -2, -3 - ).contiguous() # output: [bs x num_input_channels x num_patches x patch_length] + dimension=-2, size=self.patch_length, step=self.stride) + # output: [bs x num_input_channels x num_patches x patch_length] + output = output.transpose(-2, -3).contiguous() return output @@ -533,7 +532,7 @@ def __init__(self, config: PatchTSTConfig): if "batch" in config.norm.lower(): self.norm_sublayer1 = PatchTSTBatchNorm(config.d_model) else: - self.norm_sublayer1 = nn.LayerNorm(config.d_model) + self.norm_sublayer1 = nn.LayerNorm(config.d_model, eps=config.norm_eps) # Add & Norm of the sublayer 2 if self.channel_attention: @@ -541,7 +540,7 @@ def __init__(self, config: PatchTSTConfig): if "batch" in config.norm.lower(): self.norm_sublayer2 = PatchTSTBatchNorm(config.d_model) else: - self.norm_sublayer2 = nn.LayerNorm(config.d_model) + self.norm_sublayer2 = nn.LayerNorm(config.d_model, eps=config.norm_eps) # Position-wise Feed-Forward self.ff = nn.Sequential( @@ -556,7 +555,7 @@ def __init__(self, config: PatchTSTConfig): if "batch" in config.norm.lower(): self.norm_sublayer3 = PatchTSTBatchNorm(config.d_model) else: - self.norm_sublayer3 = nn.LayerNorm(config.d_model) + self.norm_sublayer3 = nn.LayerNorm(config.d_model, eps=config.norm_eps) self.pre_norm = config.pre_norm @@ -572,30 +571,28 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape # First sublayer: attention across time - hidden_state = hidden_state.view( - batch_size * num_input_channels, sequence_length, d_model - ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] + # hidden_states: [(bs*num_channels) x sequence_length x d_model] + hidden_state = hidden_state.view(batch_size * num_input_channels, sequence_length, d_model) if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, attn_weights, _ = self.self_attn( hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions ) - hidden_state = hidden_state + self.dropout_path1( - attn_output - ) # Add: residual connection with residual dropout + # Add: residual connection with residual dropout + hidden_state = hidden_state + self.dropout_path1(attn_output) else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT attn_output, attn_weights, _ = self.self_attn( hidden_states=hidden_state, output_attentions=output_attentions ) + # hidden_states: [(bs*num_channels) x sequence_length x d_model] hidden_state = self.norm_sublayer1( - hidden_state + self.dropout_path1(attn_output) - ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] + hidden_state + self.dropout_path1(attn_output)) + # [bs x num_channels x sequence_length x d_model] hidden_state = hidden_state.reshape( - batch_size, num_input_channels, sequence_length, d_model - ) # [bs x num_channels x sequence_length x d_model] + batch_size, num_input_channels, sequence_length, d_model) # second sublayer: attention across variable at any given time # [bs 
x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] @@ -611,17 +608,16 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] attn_output, channel_attn_weights, _ = self.self_attn( hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions ) - hidden_state = hidden_state + self.dropout_path2( - attn_output - ) # Add: residual connection with residual dropout + # Add: residual connection with residual dropout + hidden_state = hidden_state + self.dropout_path2(attn_output) else: ## Multi-Head attention and Add residual connection and Norm attn_output, channel_attn_weights, _ = self.self_attn( hidden_states=hidden_state, output_attentions=output_attentions ) + # hidden_states: [(bs*sequence_length) x num_channels x d_model] hidden_state = self.norm_sublayer2( - hidden_state + self.dropout_path2(attn_output) - ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] + hidden_state + self.dropout_path2(attn_output)) hidden_state = ( hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model) @@ -630,26 +626,24 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] ) # src: [bs x num_channels x sequence_length x d_model] # Third sublayer: mixing across hidden + # src: [(batch_size*num_channels) x sequence_length x d_model] hidden_state = hidden_state.view( - batch_size * num_input_channels, sequence_length, d_model - ) # src: [(batch_size*num_channels) x sequence_length x d_model] + batch_size * num_input_channels, sequence_length, d_model) if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection + # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path3( - self.ff(self.norm_sublayer3(hidden_state)) - ) # Add: residual connection with residual dropout + self.ff(self.norm_sublayer3(hidden_state))) else: ## Position-wise Feed-Forward and Add residual connection and Norm + # Add: residual connection with residual dropout hidden_state = self.norm_sublayer3( - hidden_state + self.dropout_path3(self.ff(hidden_state)) - ) # Add: residual connection with residual dropout + hidden_state + self.dropout_path3(self.ff(hidden_state))) - hidden_state = hidden_state.reshape( - batch_size, num_input_channels, sequence_length, d_model - ) # [bs x num_channels x sequence_length x d_model] + # [bs x num_channels x sequence_length x d_model] + hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model) outputs = (hidden_state,) - if output_attentions: outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights,) @@ -777,13 +771,11 @@ def forward( # append cls token cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] cls_tokens = cls_token.expand(patch_input.shape[0], -1, -1) # get the same copy for all the batch samples - hidden_state = torch.cat( - (cls_tokens, patch_input), dim=1 - ) # x: [bs x num_channels x (num_patches+1) x d_model] + # x: [bs x num_channels x (num_patches+1) x d_model] + hidden_state = torch.cat((cls_tokens, patch_input), dim=1) else: - hidden_state = self.positional_dropout( - patch_input + self.position_enc - ) # x: [bs x num_channels x num_patches x d_model] + # x: [bs x num_channels x num_patches x d_model] + hidden_state = self.positional_dropout(patch_input + self.position_enc) encoder_states = () if output_hidden_states else None all_attentions = () if 
output_attentions else None @@ -792,10 +784,8 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_state,) - layer_outputs = encoder_layer( - hidden_state=hidden_state, - output_attentions=output_attentions, - ) + layer_outputs = encoder_layer(hidden_state=hidden_state, + output_attentions=output_attentions) # get hidden state hidden_state = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token @@ -1542,9 +1532,8 @@ def forward(self, embedding: torch.Tensor): for i in range(self.num_input_channels): z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] z = self.dropouts[i](z) - z = self.projections[i]( - z - ) # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + z = self.projections[i](z) x_out.append(z) output = torch.stack(x_out, dim=1) # x: [bs x num_channels x forecast_len] else: @@ -1554,12 +1543,10 @@ def forward(self, embedding: torch.Tensor): # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head if isinstance(output, tuple): - output = tuple( - z.transpose(2, 1) for z in output - ) # ([bs x forecast_len x num_channels], [bs x forecast_len x num_channels]) + # output: ([bs x forecast_len x num_channels], [bs x forecast_len x num_channels]) + output = tuple(z.transpose(2, 1) for z in output) else: output = output.transpose(2, 1) # [bs x forecast_len x num_channels] - return output From 03d27f778ab422ec3e5f9a1cbd60226315a453bf Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 21:38:10 -0500 Subject: [PATCH 175/189] Add norm_eps for batchnorm and layernorm --- src/transformers/models/patchtst/configuration_patchtst.py | 4 ++++ src/transformers/models/patchtst/modeling_patchtst.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 7a91549684feb7..c36241c3010740 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -68,6 +68,8 @@ class PatchTSTConfig(PretrainedConfig): Dimension of the "intermediate" (often named feed-forward) layer in encoder. norm (`str` , *optional*, defaults to `"BatchNorm"`): Normalization at each Transformer layer. Can be `"BatchNorm"` or `"LayerNorm"`. + norm_eps (`float`, *optional*, defaults to 1e-5): + A value added to the denominator for numerical stability of normalization. Default: 1e-5 attention_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for the attention probabilities. 
dropout (`float`, *optional*, defaults to 0.0): @@ -170,6 +172,7 @@ def __init__( channel_attention: bool = False, encoder_ffn_dim: int = 256, norm: str = "BatchNorm", + norm_eps: float = 1e-5, attention_dropout: float = 0.0, dropout: float = 0.0, positional_dropout: float = 0.0, @@ -221,6 +224,7 @@ def __init__( self.shared_embedding = shared_embedding self.channel_attention = channel_attention self.norm = norm + self.norm_eps = norm_eps self.positional_dropout = positional_dropout self.dropout_path = dropout_path self.ff_dropout = ff_dropout diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 032d23326f0925..0cbffcb844aca6 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -208,7 +208,7 @@ class PatchTSTBatchNorm(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.batchnorm = nn.BatchNorm1d(config.d_model) + self.batchnorm = nn.BatchNorm1d(config.d_model, eps=config.norm_eps) def forward(self, inputs: torch.Tensor): """ From 03e32203b24f53b00123eb50dd2f082d40c13201 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 21:49:19 -0500 Subject: [PATCH 176/189] reformat. --- .../models/patchtst/modeling_patchtst.py | 49 +++++++------------ 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 0cbffcb844aca6..dc3845745063f1 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -577,58 +577,47 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions - ) + hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions) # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path1(attn_output) else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT attn_output, attn_weights, _ = self.self_attn( - hidden_states=hidden_state, output_attentions=output_attentions - ) + hidden_states=hidden_state, output_attentions=output_attentions) # hidden_states: [(bs*num_channels) x sequence_length x d_model] - hidden_state = self.norm_sublayer1( - hidden_state + self.dropout_path1(attn_output)) + hidden_state = self.norm_sublayer1(hidden_state + self.dropout_path1(attn_output)) - # [bs x num_channels x sequence_length x d_model] - hidden_state = hidden_state.reshape( - batch_size, num_input_channels, sequence_length, d_model) + # hidden_state: [bs x num_channels x sequence_length x d_model] + hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model) # second sublayer: attention across variable at any given time - # [bs x num_channels x sequence_length x d_model] -> [bs x sequence_length x num_channels x d_model] - # -> [(bs*sequence_length) x num_channels x d_model] if self.channel_attention: - hidden_state = ( - hidden_state.transpose(2, 1) - .contiguous() - .view(batch_size * sequence_length, num_input_channels, d_model) - ) # [(bs*sequence_length) x num_channels x d_model] + # hidden_state: [bs x sequence_length x num_channels x d_model] + hidden_state 
= hidden_state.transpose(2, 1).contiguous() + # hidden_state: [(bs*sequence_length) x num_channels x d_model] + hidden_state = hidden_state.view(batch_size * sequence_length, num_input_channels, d_model) if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions - ) + hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions) # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path2(attn_output) else: ## Multi-Head attention and Add residual connection and Norm attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=hidden_state, output_attentions=output_attentions - ) + hidden_states=hidden_state, output_attentions=output_attentions) # hidden_states: [(bs*sequence_length) x num_channels x d_model] - hidden_state = self.norm_sublayer2( - hidden_state + self.dropout_path2(attn_output)) + hidden_state = self.norm_sublayer2(hidden_state + self.dropout_path2(attn_output)) - hidden_state = ( - hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model) - .transpose(1, 2) - .contiguous() - ) # src: [bs x num_channels x sequence_length x d_model] + # Reshape hidden state + # hidden_state: [bs x sequence_length x num_channels x d_model] + hidden_state = hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model) + # hidden_state: [bs x num_channels x sequence_length x d_model] + hidden_state = hidden_state.transpose(1, 2).contiguous() # Third sublayer: mixing across hidden - # src: [(batch_size*num_channels) x sequence_length x d_model] - hidden_state = hidden_state.view( - batch_size * num_input_channels, sequence_length, d_model) + # hidden_state: [(batch_size*num_channels) x sequence_length x d_model] + hidden_state = hidden_state.view(batch_size * num_input_channels, sequence_length, d_model) if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection # Add: residual connection with residual dropout From a8dc48adcdff3ac767cceb63d6919fe59d689d20 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 22:07:56 -0500 Subject: [PATCH 177/189] reformat --- .../models/patchtst/modeling_patchtst.py | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index dc3845745063f1..877b4089720da7 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1072,11 +1072,11 @@ def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, `(batch_size, 1, num_input_channels)`) """ - denominator = weights.sum(self.dim, keepdim=self.keepdim) + denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale @@ 
-1217,11 +1217,31 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, - future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, PatchTSTModelOutput]: + """ + Parameters: + past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*): + Input sequence to the model + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers + output_attentions (`bool`, *optional*): + Whether or not to return the output attention of all layers + return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. + + Returns: + `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or + `config.return_dict`=False) + + """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -1585,7 +1605,7 @@ def forward( - 1 for values that are **observed**, - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*): - future target values associates with the `past_values` + future target values associated with the `past_values` output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
@@ -1679,10 +1699,8 @@ def generate( distribution = self.distribution_output.distribution( outputs.prediction_outputs, loc=outputs.loc, scale=outputs.scale ) - # get samples - samples = [ - distribution.sample() for _ in range(num_parallel_samples) - ] # samples: list of [bs x forecast_len x num_channels] + # get samples: list of [bs x forecast_len x num_channels] + samples = [distribution.sample() for _ in range(num_parallel_samples)] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x forecast_len x num_channels] return SamplePatchTSTPredictionOutput(sequences=samples) @@ -1861,8 +1879,8 @@ def generate( # get distribution distribution = self.distribution_output.distribution(outputs.forecast_outputs) - # get samples - samples = [distribution.sample() for _ in range(num_parallel_samples)] # samples: list of [bs x num_targets] + # get samples: list of [bs x num_targets] + samples = [distribution.sample() for _ in range(num_parallel_samples)] # stack tensors samples = torch.stack(samples, dim=1) # [bs x num_samples x num_targets] return SamplePatchTSTRegressionOutput(sequences=samples) From d002bac92bc93abadb04cf426f5f6f6c6c4afad3 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 22:21:35 -0500 Subject: [PATCH 178/189] edit docstring --- .../models/patchtst/configuration_patchtst.py | 10 ++++----- .../models/patchtst/modeling_patchtst.py | 22 ++++++++----------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index c36241c3010740..b8a9d0f512025f 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -51,9 +51,9 @@ class PatchTSTConfig(PretrainedConfig): distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared error "mse". patch_length (`int`, *optional*, defaults to 1): - Define the patch length of the patchification process. Default to 1. + Define the patch length of the patchification process. stride (`int`, *optional*, defaults to 1): - define the stride of the patchification process. Default to 1. + define the stride of the patchification process. encoder_layers (`int`, *optional*, defaults to 3): Number of encoder layers. d_model (`int`, *optional*, defaults to 64): @@ -69,7 +69,7 @@ class PatchTSTConfig(PretrainedConfig): norm (`str` , *optional*, defaults to `"BatchNorm"`): Normalization at each Transformer layer. Can be `"BatchNorm"` or `"LayerNorm"`. norm_eps (`float`, *optional*, defaults to 1e-5): - A value added to the denominator for numerical stability of normalization. Default: 1e-5 + A value added to the denominator for numerical stability of normalization. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for the attention probabilities. dropout (`float`, *optional*, defaults to 0.0): @@ -85,7 +85,7 @@ class PatchTSTConfig(PretrainedConfig): activation_function (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string) in the encoder.`"gelu"` and `"relu"` are supported. pre_norm (`bool`, *optional*, defaults to `True`): - Normalization is applied before self-attention if pre_norm is set to True. Otherwise, normalization is + Normalization is applied before self-attention if pre_norm is set to `True`. Otherwise, normalization is applied after residual block. 
positional_encoding (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. @@ -98,7 +98,7 @@ class PatchTSTConfig(PretrainedConfig): shared_projection (`bool`, *optional*, defaults to `True`): Sharing the projection layer across different channels in the forecast head. seed_number (`Optional`, *optional*): - Use seed number for random masking. + Seed number used for random masking. If unset, no seed is set. scaling (`Union`, *optional*, defaults to `"mean"`): Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the scaler is set to "mean". diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 877b4089720da7..39bc1b2fb3d211 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -270,11 +270,11 @@ def random_masking( mask_ratio (`float`): Mask ratio. unmasked_channel_indices (list, *optional*): - indices of unmasked channels. These channels will not be masked. Defaults to None. + indices of unmasked channels. These channels will not be masked. channel_consistent_masking (bool, *optional* defaults to False): When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary - across channels. Defaults to False. - mask_value (int, *optional* defaults to 0): + across channels. + mask_value (int, *optional*, defaults to 0): Value to use for masking. seed_number (int, *optional*): Value to set for the random seed. @@ -337,11 +337,11 @@ def forecast_masking( List of patch lengths to mask in the end of the data. forecast_mask_ratios (`list`, *optional*): [0.7, 0.3] List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and - forecast_mask_ratios is [1,1], then equal weights to both patch lengths. Defaults to None. + forecast_mask_ratios is [1,1], then equal weights to both patch lengths. unmasked_channel_indices (`list`, *optional*): - Control Variable channel indices. These channels will not be masked. Defaults to None. - mask_value (`int`, *optional* defaults to 0): - Value to use for masking. Defaults to 0. + Control Variable channel indices. These channels will not be masked. + mask_value (`int`, *optional*, defaults to 0): + Value to use for masking. seed_number (`int`, *optional*): Value to set for the random seed. 
@@ -716,13 +716,11 @@ def __init__(self, config: PatchTSTConfig): ) else: self.position_enc = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches, config.d_model - ) + config.positional_encoding, config.learn_pe, config.num_patches, config.d_model) # Positional dropout self.positional_dropout = ( - nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() - ) + nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()) # Encoder self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) @@ -1058,7 +1056,6 @@ def __init__(self, config: PatchTSTConfig): self.keepdim = True if config.keepdim is None else config.keepdim self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale - @torch.no_grad() def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ @@ -1094,7 +1091,6 @@ def __init__(self, config: PatchTSTConfig): self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale self.default_scale = config.default_scale if config.default_scale else None - @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: From 97d75e66544b71ca327a1b09c642f5d1b7ecda96 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 22:30:56 -0500 Subject: [PATCH 179/189] update docstring --- .../models/patchtst/configuration_patchtst.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index b8a9d0f512025f..a75c63a4065fc2 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -102,10 +102,10 @@ class PatchTSTConfig(PretrainedConfig): scaling (`Union`, *optional*, defaults to `"mean"`): Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the scaler is set to "mean". - mask_input (`bool`, *optional*, defaults to False): + mask_input (`bool`, *optional*, defaults to `False`): Apply masking during the pretraining. mask_type (`str`, *optional*, defaults to `"random"`): - Masking type. Only `"random"` is currently supported. + Masking type. Only `"random"` and `"forecast"` are currently supported. random_mask_ratio (`float`, *optional*, defaults to 0.5): Masking ratio is applied to mask the input data during random pretraining. forecast_mask_patches (`List`, *optional*, defaults to `[2, 3]`): @@ -116,9 +116,9 @@ class PatchTSTConfig(PretrainedConfig): channel_consistent_masking (`bool`, *optional*, defaults to `False`): If channel consistent masking is True, all the channels will have the same masking. unmasked_channel_indices (`list`, *optional*): - Channels are not masked during pretraining. + Channels that are not masked during pretraining. mask_value (`int`, *optional*, defaults to 0): - Mask value to set. + Define the value of entries to be masked when pretraining. pooling (`str`, *optional*, defaults to `"mean"`): Pooling in the latent representation. `"mean"`, `"max"` and None are supported. head_dropout (`float`, *optional*, defaults to 0.0): @@ -132,7 +132,7 @@ class PatchTSTConfig(PretrainedConfig): Output range for regression task. 
The range of output values can be set to enforce the model to produce values within a range. num_parallel_samples (`int`, *optional*, defaults to 100): - The number of samples to generate in parallel for probablistic prediction. + The number of samples is generated in parallel for probablistic prediction. ```python From 49232db420939771c6644e350cc18398b2332401 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Wed, 8 Nov 2023 22:56:43 -0500 Subject: [PATCH 180/189] change variable name pooling to pooling_type --- .../models/patchtst/configuration_patchtst.py | 8 +- .../models/patchtst/modeling_patchtst.py | 104 ++++++++++-------- 2 files changed, 64 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index a75c63a4065fc2..d0b9963fb49b7e 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -119,8 +119,8 @@ class PatchTSTConfig(PretrainedConfig): Channels that are not masked during pretraining. mask_value (`int`, *optional*, defaults to 0): Define the value of entries to be masked when pretraining. - pooling (`str`, *optional*, defaults to `"mean"`): - Pooling in the latent representation. `"mean"`, `"max"` and None are supported. + pooling_type (`str`, *optional*, defaults to `"mean"`): + Pooling of the embedding. `"mean"`, `"max"` and `None` are supported. head_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for head. prediction_length (`int`, *optional*, defaults to 24): @@ -198,7 +198,7 @@ def __init__( unmasked_channel_indices: Optional[List[int]] = None, mask_value=0, # head - pooling: str = "mean", + pooling_type: str = "mean", head_dropout: float = 0.0, prediction_length: int = 24, num_targets: int = 1, @@ -254,7 +254,7 @@ def __init__( self.mask_value = mask_value # general head params - self.pooling = pooling + self.pooling_type = pooling_type self.head_dropout = head_dropout # For prediction head diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 39bc1b2fb3d211..645db3e90d21c7 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1448,7 +1448,7 @@ class PatchTSTClassificationHead(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token - self.pooling = config.pooling + self.pooling_type = config.pooling_type self.flatten = nn.Flatten(start_dim=1) self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity() self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_targets) @@ -1464,17 +1464,21 @@ def forward(self, embedding: torch.Tensor): """ if self.use_cls_token: - x = embedding[:, :, 0, :] # use the first output token, x: bs x num_channels x d_model - elif self.pooling == "mean": - x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] - elif self.pooling == "max": - x = embedding.max(dim=2) # x: [bs x num_channels x d_model] + # use the first output token, pooled_embedding: bs x num_channels x d_model + pooled_embedding = embedding[:, :, 0, :] + elif self.pooling_type == "mean": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.mean(dim=2) + elif self.pooling_type == "max": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = 
embedding.max(dim=2) else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") - - x = self.flatten(x) # x: bs x num_channels * d_model - y = self.linear(self.dropout(x)) # y: bs x n_classes - return y + raise Exception(f"pooling operator {self.pooling_type} is not implemented yet") + # pooled_embedding: bs x num_channels * d_model + pooled_embedding = self.flatten(pooled_embedding) + # output: bs x n_classes + output = self.linear(self.dropout(pooled_embedding)) + return output class PatchTSTPredictionHead(nn.Module): @@ -1484,8 +1488,8 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): self.shared_projection = config.shared_projection self.num_input_channels = config.num_input_channels self.use_cls_token = config.use_cls_token - self.pooling = config.pooling - head_dim = config.d_model if self.pooling else config.d_model * config.num_patches + self.pooling_type = config.pooling_type + head_dim = config.d_model if self.pooling_type else config.d_model * config.num_patches if not self.shared_projection: # if each channel has its own head @@ -1523,29 +1527,38 @@ def forward(self, embedding: torch.Tensor): """ if self.use_cls_token: - y = embedding[:, :, 0, :] # y: [bs x num_channels x d_model] + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding[:, :, 0, :] else: - if self.pooling == "mean": - y = embedding.mean(dim=2) # y: [bs x num_channels x d_model] - elif self.pooling == "max": - y = embedding.max(dim=2) # y: [bs x num_channels x d_model] + if self.pooling_type == "mean": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.mean(dim=2) + elif self.pooling_type == "max": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.max(dim=2) else: - y = embedding # y: [bs x num_channels x num_patches x d_model] + # pooled_embedding: [bs x num_channels x num_patches x d_model] + pooled_embedding = embedding if not self.shared_projection: - x_out = [] + output = [] for i in range(self.num_input_channels): - z = self.flattens[i](y[:, i, :]) # y: [bs x (d_model * num_patches)] or [bs x d_model)] - z = self.dropouts[i](z) - # z: [bs x forecast_len] or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head - z = self.projections[i](z) - x_out.append(z) - output = torch.stack(x_out, dim=1) # x: [bs x num_channels x forecast_len] + # pooled_embedding: [bs x (d_model * num_patches)] or [bs x d_model)] + pooled_embedding = self.flattens[i](pooled_embedding[:, i, :]) + pooled_embedding = self.dropouts[i](pooled_embedding) + # pooled_embedding: [bs x forecast_len] + # or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head + pooled_embedding = self.projections[i](pooled_embedding) + output.append(pooled_embedding) + # output: [bs x num_channels x forecast_len] + output = torch.stack(output, dim=1) else: - z = self.flatten(y) # z: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model)] - z = self.dropout(z) - output = self.projection(z) # output: [bs x num_channels x forecast_len] - # or tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head + # pooled_embedding: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model)] + pooled_embedding = self.flatten(pooled_embedding) + pooled_embedding = self.dropout(pooled_embedding) + # output: [bs x num_channels x forecast_len] or + # tuple ([bs x num_channels x forecast_len], [bs x 
num_channels x forecast_len]) if using distribution head + output = self.projection(pooled_embedding) if isinstance(output, tuple): # output: ([bs x forecast_len x num_channels], [bs x forecast_len x num_channels]) @@ -1628,8 +1641,7 @@ def forward( if future_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale - ) + y_hat, loc=model_output.loc, scale=model_output.scale) loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) @@ -1711,7 +1723,7 @@ def __init__(self, config: PatchTSTConfig, distribution_output=None): super().__init__() self.y_range = config.output_range self.use_cls_token = config.use_cls_token - self.pooling = config.pooling + self.pooling_type = config.pooling_type self.distribution_output = distribution_output head_dim = config.num_input_channels * config.d_model @@ -1735,22 +1747,26 @@ def forward(self, embedding: torch.Tensor): """ if self.use_cls_token: - x = embedding[:, :, 0, :] # use the first output token, x: [bs x num_channels x d_model] - elif self.pooling == "mean": - x = embedding.mean(dim=2) # x: [bs x num_channels x d_model] - elif self.pooling == "max": - x = embedding.max(dim=2) # x: [bs x num_channels x d_model] + # use the first output token, pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding[:, :, 0, :] + elif self.pooling_type == "mean": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.mean(dim=2) + elif self.pooling_type == "max": + # pooled_embedding: [bs x num_channels x d_model] + pooled_embedding = embedding.max(dim=2) else: - raise Exception(f"pooling operator {self.pooling} is not implemented yet") + raise Exception(f"pooling operator {self.pooling_type} is not implemented yet") # flatten the input - x = self.dropout(self.flatten(x)) # x: bs x (num_channels * d_model) + # pooled_embedding: bs x (num_channels * d_model) + pooled_embedding = self.dropout(self.flatten(pooled_embedding)) # projection - y = self.projection(x) # y: bs x output_dim or a tuple of this shape for distribution head + # output: bs x output_dim or a tuple of this shape for distribution head + output = self.projection(pooled_embedding) # if (self.distribution_output is None) & (self.y_range is not None): # linear head - y = torch.sigmoid(y) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] - - return y + output = torch.sigmoid(output) * (self.y_range[1] - self.y_range[0]) + self.y_range[0] + return output class PatchTSTForRegression(PatchTSTPreTrainedModel): From 3684320e1e960191f907559189211a6ff69689b7 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Thu, 9 Nov 2023 08:31:06 -0500 Subject: [PATCH 181/189] fix output_hidden_states as tuple --- src/transformers/models/patchtst/modeling_patchtst.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 645db3e90d21c7..179a8ad0777e55 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -745,9 +745,7 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + output_hidden_states = 
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states # Input embedding patch_input = self.embedder(patch_input) From b818036973d396368369945ae45445fd12817b77 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 10 Nov 2023 10:53:29 -0500 Subject: [PATCH 182/189] fix bug when calling PatchTSTBatchNorm --- .../models/patchtst/modeling_patchtst.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 179a8ad0777e55..4bf068adb10776 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -530,7 +530,7 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of the sublayer 1 self.dropout_path1 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer1 = PatchTSTBatchNorm(config.d_model) + self.norm_sublayer1 = PatchTSTBatchNorm(config) else: self.norm_sublayer1 = nn.LayerNorm(config.d_model, eps=config.norm_eps) @@ -538,7 +538,7 @@ def __init__(self, config: PatchTSTConfig): if self.channel_attention: self.dropout_path2 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer2 = PatchTSTBatchNorm(config.d_model) + self.norm_sublayer2 = PatchTSTBatchNorm(config) else: self.norm_sublayer2 = nn.LayerNorm(config.d_model, eps=config.norm_eps) @@ -553,7 +553,7 @@ def __init__(self, config: PatchTSTConfig): # Add & Norm of sublayer 3 self.dropout_path3 = nn.Dropout(config.dropout_path) if config.dropout_path > 0 else nn.Identity() if "batch" in config.norm.lower(): - self.norm_sublayer3 = PatchTSTBatchNorm(config.d_model) + self.norm_sublayer3 = PatchTSTBatchNorm(config) else: self.norm_sublayer3 = nn.LayerNorm(config.d_model, eps=config.norm_eps) @@ -1050,9 +1050,9 @@ class PatchTSTStdScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = 1 if config.scaling_dim is None else config.scaling_dim - self.keepdim = True if config.keepdim is None else config.keepdim - self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale + self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 + self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True + self.minimum_scale = config.minimum_scale if hasattr(config, 'minimum_scale') else 1e-10 def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -1084,10 +1084,11 @@ class PatchTSTMeanScaler(nn.Module): """ def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = 1 if config.scaling_dim is None else config.scaling_dim - self.keepdim = True if config.keepdim is None else config.keepdim - self.minimum_scale = 1e-10 if config.minimum_scale is None else config.minimum_scale - self.default_scale = config.default_scale if config.default_scale else None + self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 + self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True + self.minimum_scale = config.minimum_scale if hasattr(config, 'minimum_scale') else 1e-10 + self.default_scale = config.default_scale if hasattr(config, 'default_scale') else None + def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor @@ -1138,8 +1139,8 @@ class 
PatchTSTNOPScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = 1 if config.scaling_dim is None else config.scaling_dim - self.keepdim = True if config.keepdim is None else config.keepdim + self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 + self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor=None @@ -1211,6 +1212,7 @@ def forward( self, past_values: torch.Tensor, past_observed_mask: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, From fb5f49020e536678a83f88c51d5b5509322451ec Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 10 Nov 2023 12:06:04 -0500 Subject: [PATCH 183/189] change stride to patch_stride --- .../models/patchtst/configuration_patchtst.py | 8 ++++---- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++++---- tests/models/patchtst/test_modeling_patchtst.py | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index d0b9963fb49b7e..8a7372fd68c581 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -52,7 +52,7 @@ class PatchTSTConfig(PretrainedConfig): error "mse". patch_length (`int`, *optional*, defaults to 1): Define the patch length of the patchification process. - stride (`int`, *optional*, defaults to 1): + patch_stride (`int`, *optional*, defaults to 1): define the stride of the patchification process. encoder_layers (`int`, *optional*, defaults to 3): Number of encoder layers. 
@@ -163,7 +163,7 @@ def __init__( loss: str = "mse", # PatchTST arguments patch_length: int = 1, - stride: int = 1, + patch_stride: int = 1, # Transformer architecture configuration encoder_layers: int = 3, d_model: int = 64, @@ -239,7 +239,7 @@ def __init__( # PatchTST parameters self.patch_length = patch_length - self.stride = stride + self.patch_stride = patch_stride self.num_patches = self._num_patches() # Mask pretraining @@ -271,4 +271,4 @@ def __init__( super().__init__(**kwargs) def _num_patches(self): - return (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + return (max(self.context_length, self.patch_length) - self.patch_length) // self.patch_stride + 1 diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 4bf068adb10776..cf4ae6a7d857dd 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -406,7 +406,7 @@ def __init__(self, config: PatchTSTConfig): self.sequence_length = config.context_length self.patch_length = config.patch_length - self.stride = config.stride + self.patch_stride = config.patch_stride if self.sequence_length <= self.patch_length: raise ValueError( @@ -414,8 +414,8 @@ def __init__(self, config: PatchTSTConfig): ) # get the number of patches - num_patches = (max(self.sequence_length, self.patch_length) - self.patch_length) // self.stride + 1 - new_sequence_length = self.patch_length + self.stride * (num_patches - 1) + num_patches = (max(self.sequence_length, self.patch_length) - self.patch_length) // self.patch_stride + 1 + new_sequence_length = self.patch_length + self.patch_stride * (num_patches - 1) self.sequence_start = self.sequence_length - new_sequence_length def forward(self, past_values: torch.Tensor): @@ -436,7 +436,7 @@ def forward(self, past_values: torch.Tensor): output = past_values[:, self.sequence_start :, :] # output: [bs x num_patches x num_input_channels x patch_length] output = output.unfold( - dimension=-2, size=self.patch_length, step=self.stride) + dimension=-2, size=self.patch_length, step=self.patch_stride) # output: [bs x num_input_channels x num_patches x patch_length] output = output.transpose(-2, -3).contiguous() return output diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 4313591da29609..8d6f2202ee81ce 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -56,7 +56,7 @@ def __init__( prediction_length=7, context_length=14, patch_length=5, - stride=5, + patch_stride=5, num_input_channels=1, num_time_features=1, is_training=True, @@ -78,7 +78,7 @@ def __init__( self.prediction_length = prediction_length self.context_length = context_length self.patch_length = patch_length - self.stride = stride + self.patch_stride = patch_stride self.num_input_channels = num_input_channels self.num_time_features = num_time_features self.lags_sequence = lags_sequence @@ -95,13 +95,13 @@ def __init__( self.num_targets = num_targets self.num_output_channels = num_output_channels self.distil = distil - self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.stride + 1 + self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.patch_stride + 1 def get_config(self): return PatchTSTConfig( prediction_length=self.prediction_length, patch_length=self.patch_length, - 
stride=self.stride, + patch_stride=self.patch_stride, num_input_channels=self.num_input_channels, d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, @@ -321,7 +321,7 @@ def test_pretrain_head(self): output = model(past_values=batch["past_values"].to(torch_device)).prediction_output num_patch = ( max(model.config.context_length, model.config.patch_length) - model.config.patch_length - ) // model.config.stride + 1 + ) // model.config.patch_stride + 1 expected_shape = torch.Size([64, model.config.num_input_channels, num_patch, model.config.patch_length]) self.assertEqual(output.shape, expected_shape) From f45baef4e2cda00e3c3ed0d5ccd13e0e83438f60 Mon Sep 17 00:00:00 2001 From: nnguyen Date: Fri, 10 Nov 2023 13:24:11 -0500 Subject: [PATCH 184/189] create PatchTSTPositionalEncoding class and restructure the PatchTSTEncoder --- .../models/patchtst/configuration_patchtst.py | 6 +- .../models/patchtst/modeling_patchtst.py | 91 ++++++++++--------- 2 files changed, 53 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 8a7372fd68c581..65711f2c599437 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -87,7 +87,7 @@ class PatchTSTConfig(PretrainedConfig): pre_norm (`bool`, *optional*, defaults to `True`): Normalization is applied before self-attention if pre_norm is set to `True`. Otherwise, normalization is applied after residual block. - positional_encoding (`str`, *optional*, defaults to `"sincos"`): + positional_encoding_type (`str`, *optional*, defaults to `"sincos"`): Positional encodings. `"zeros"`, `"normal"`, `"uniform"' and `"sincos"` are supported. learn_pe (`bool`, *optional*, defaults to `False`): Whether the positional encoding is updated during training. 
@@ -181,7 +181,7 @@ def __init__( bias: bool = True, activation_function: str = "gelu", pre_norm: bool = True, - positional_encoding: str = "sincos", + positional_encoding_type: str = "sincos", learn_pe: bool = False, use_cls_token: bool = False, init_std: float = 0.02, @@ -231,7 +231,7 @@ def __init__( self.bias = bias self.activation_function = activation_function self.pre_norm = pre_norm - self.positional_encoding = positional_encoding + self.positional_encoding_type = positional_encoding_type self.learn_pe = learn_pe self.use_cls_token = use_cls_token self.init_std = init_std diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index cf4ae6a7d857dd..78d59ca20602f4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -223,23 +223,23 @@ def forward(self, inputs: torch.Tensor): return output.transpose(1, 2) -def positional_encoding(position_embedding_type, learned, q_len, d_model): +def positional_encoding(positional_encoding_type, learned, q_len, d_model): # Positional encoding - if position_embedding_type is None: - # position_embedding_type = None and learned = False can be used to measure impact of positional encoding + if positional_encoding_type is None: + # positional_encoding_type = None and learned = False can be used to measure impact of positional encoding position_enc = torch.empty((q_len, d_model)) nn.init.uniform_(position_enc, -0.02, 0.02) learned = False - elif position_embedding_type == "zeros": + elif positional_encoding_type == "zeros": position_enc = torch.empty((q_len, d_model)) nn.init.uniform_(position_enc, -0.02, 0.02) - elif position_embedding_type == "normal": + elif positional_encoding_type == "normal": position_enc = torch.zeros((q_len, 1)) nn.init.normal_(position_enc, mean=0.0, std=0.1) - elif position_embedding_type == "uniform": + elif positional_encoding_type == "uniform": position_enc = torch.zeros((q_len, 1)) nn.init.uniform_(position_enc, a=0.0, b=0.1) - elif position_embedding_type == "sincos": + elif positional_encoding_type == "sincos": position_enc = torch.zeros(q_len, d_model) position = torch.arange(0, q_len).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)) @@ -249,7 +249,7 @@ def positional_encoding(position_embedding_type, learned, q_len, d_model): position_enc = position_enc / (position_enc.std() * 10) else: raise ValueError( - f"{position_embedding_type} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." + f"{positional_encoding_type} is not a valid positional encoder. Available types are 'normal', 'zeros', 'zero', uniform', 'sincos', None." 
) return nn.Parameter(position_enc, requires_grad=learned) @@ -691,6 +691,41 @@ def forward(self, patch_input: torch.Tensor): return embeddings +class PatchTSTPositionalEncoding(nn.Module): + """ + Class for positional encoding + """ + def __init__(self, config: PatchTSTConfig): + super().__init__() + self.use_cls_token = config.use_cls_token + if config.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) + num_patches = config.num_patches + 1 + else: + num_patches = config.num_patches + # postional encoding + self.position_enc = positional_encoding( + config.positional_encoding_type, config.learn_pe, num_patches, config.d_model) + # Positional dropout + self.positional_dropout = ( + nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()) + + def forward(self, patch_input: torch.Tensor): + if self.use_cls_token: + # patch_input: [bs x num_channels x num_patches x d_model] + patch_input = self.positional_dropout(patch_input + self.position_enc[1:, :]) + # append cls token where cls_token: [1 x 1 x 1 x d_model] + cls_token = self.cls_token + self.position_enc[:1, :] + # get the same copy of cls_token for all the samples in batch + cls_tokens = cls_token.expand(patch_input.shape[0], -1, -1) + # hidden_state: [bs x num_channels x (num_patches+1) x d_model] + hidden_state = torch.cat((cls_tokens, patch_input), dim=1) + else: + # hidden_state: [bs x num_channels x num_patches x d_model] + hidden_state = self.positional_dropout(patch_input + self.position_enc) + return hidden_state + + class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder @@ -707,21 +742,8 @@ def __init__(self, config: PatchTSTConfig): # Input embedding: projection of feature vectors onto a d-dim vector space self.embedder = PatchTSTEmbedding(config) - # Positional encoding - if config.use_cls_token: - self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model)) - self.position_enc = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches + 1, config.d_model - ) - else: - self.position_enc = positional_encoding( - config.positional_encoding, config.learn_pe, config.num_patches, config.d_model) - - # Positional dropout - self.positional_dropout = ( - nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()) - + self.positional_encoder = PatchTSTPositionalEncoding(config) # Encoder self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.encoder_layers)]) @@ -749,35 +771,22 @@ def forward( # Input embedding patch_input = self.embedder(patch_input) - - if self.use_cls_token: - # x: [bs x num_channels x num_patches x d_model] - patch_input = self.positional_dropout(patch_input + self.position_enc[1:, :]) - # append cls token - cls_token = self.cls_token + self.position_enc[:1, :] # cls_token: [1 x 1 x 1 x d_model] - cls_tokens = cls_token.expand(patch_input.shape[0], -1, -1) # get the same copy for all the batch samples - # x: [bs x num_channels x (num_patches+1) x d_model] - hidden_state = torch.cat((cls_tokens, patch_input), dim=1) - else: - # x: [bs x num_channels x num_patches x d_model] - hidden_state = self.positional_dropout(patch_input + self.position_enc) + # Positional encoding + hidden_state = self.positional_encoder(patch_input) encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None - for encoder_layer in self.layers: if output_hidden_states: encoder_states = encoder_states + (hidden_state,) - 
layer_outputs = encoder_layer(hidden_state=hidden_state, - output_attentions=output_attentions) - # get hidden state - hidden_state = layer_outputs[0] # hidden_states: [bs x num_channels x num_patches x d_model] + layer_outputs = encoder_layer(hidden_state=hidden_state, output_attentions=output_attentions) + # get hidden state. hidden_state shape is [bs x num_channels x num_patches x d_model] # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token - # append layer attention + hidden_state = layer_outputs[0] + # append attention matrix at each layer if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) - # return past_values, hidden_states return BaseModelOutput(last_hidden_state=hidden_state, hidden_states=encoder_states, attentions=all_attentions) From 32f11dc63375bf22a280d630d3c1726fef20a2fc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 10 Nov 2023 19:29:36 +0100 Subject: [PATCH 185/189] formatting --- .../models/patchtst/modeling_patchtst.py | 76 ++++++++++--------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 78d59ca20602f4..2a09e5a4dfadc4 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -296,12 +296,10 @@ def random_masking( noise = noise.repeat(1, num_channels, 1) # bs x num_channels x time else: # noise in [0, 1], bs x num_channels x L - noise = torch.rand( - batch_size, num_channels, sequence_length, device=device) + noise = torch.rand(batch_size, num_channels, sequence_length, device=device) # mask: [bs x num_channels x num_patch] - mask = torch.ones( - batch_size, num_channels, sequence_length, device=device) + mask = torch.ones(batch_size, num_channels, sequence_length, device=device) mask[:, :, :len_keep] = 0 # sort noise for each sample @@ -435,8 +433,7 @@ def forward(self, past_values: torch.Tensor): # output: [bs x new_sequence_length x num_channels] output = past_values[:, self.sequence_start :, :] # output: [bs x num_patches x num_input_channels x patch_length] - output = output.unfold( - dimension=-2, size=self.patch_length, step=self.patch_stride) + output = output.unfold(dimension=-2, size=self.patch_length, step=self.patch_stride) # output: [bs x num_input_channels x num_patches x patch_length] output = output.transpose(-2, -3).contiguous() return output @@ -577,13 +574,15 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] if self.pre_norm: ## Norm and Multi-Head attention and Add residual connection attn_output, attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions) + hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions + ) # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path1(attn_output) else: ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT attn_output, attn_weights, _ = self.self_attn( - hidden_states=hidden_state, output_attentions=output_attentions) + hidden_states=hidden_state, output_attentions=output_attentions + ) # hidden_states: [(bs*num_channels) x sequence_length x d_model] hidden_state = self.norm_sublayer1(hidden_state + self.dropout_path1(attn_output)) @@ -599,13 +598,15 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] if self.pre_norm: ## Norm 
and Multi-Head attention and Add residual connection attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions) + hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions + ) # Add: residual connection with residual dropout hidden_state = hidden_state + self.dropout_path2(attn_output) else: ## Multi-Head attention and Add residual connection and Norm attn_output, channel_attn_weights, _ = self.self_attn( - hidden_states=hidden_state, output_attentions=output_attentions) + hidden_states=hidden_state, output_attentions=output_attentions + ) # hidden_states: [(bs*sequence_length) x num_channels x d_model] hidden_state = self.norm_sublayer2(hidden_state + self.dropout_path2(attn_output)) @@ -621,13 +622,11 @@ def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] if self.pre_norm: ## Norm and Position-wise Feed-Forward and Add residual connection # Add: residual connection with residual dropout - hidden_state = hidden_state + self.dropout_path3( - self.ff(self.norm_sublayer3(hidden_state))) + hidden_state = hidden_state + self.dropout_path3(self.ff(self.norm_sublayer3(hidden_state))) else: ## Position-wise Feed-Forward and Add residual connection and Norm # Add: residual connection with residual dropout - hidden_state = self.norm_sublayer3( - hidden_state + self.dropout_path3(self.ff(hidden_state))) + hidden_state = self.norm_sublayer3(hidden_state + self.dropout_path3(self.ff(hidden_state))) # [bs x num_channels x sequence_length x d_model] hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model) @@ -695,6 +694,7 @@ class PatchTSTPositionalEncoding(nn.Module): """ Class for positional encoding """ + def __init__(self, config: PatchTSTConfig): super().__init__() self.use_cls_token = config.use_cls_token @@ -705,10 +705,12 @@ def __init__(self, config: PatchTSTConfig): num_patches = config.num_patches # postional encoding self.position_enc = positional_encoding( - config.positional_encoding_type, config.learn_pe, num_patches, config.d_model) + config.positional_encoding_type, config.learn_pe, num_patches, config.d_model + ) # Positional dropout self.positional_dropout = ( - nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()) + nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity() + ) def forward(self, patch_input: torch.Tensor): if self.use_cls_token: @@ -730,6 +732,7 @@ class PatchTSTEncoder(PatchTSTPreTrainedModel): """ PatchTST Encoder """ + def __init__(self, config: PatchTSTConfig): super().__init__(config) self.num_input_channels = config.num_input_channels @@ -767,7 +770,9 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) # Input embedding patch_input = self.embedder(patch_input) @@ -1053,18 +1058,19 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ - Standardize features by calculating the mean and 
scaling along the first dimension, and then normalizes it - by subtracting from the mean and dividing by the standard deviation. + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by + subtracting from the mean and dividing by the standard deviation. """ def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 - self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True - self.minimum_scale = config.minimum_scale if hasattr(config, 'minimum_scale') else 1e-10 + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 - def forward(self, data: torch.Tensor, observed_indicator: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters: data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): @@ -1091,13 +1097,13 @@ class PatchTSTMeanScaler(nn.Module): Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. """ + def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 - self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True - self.minimum_scale = config.minimum_scale if hasattr(config, 'minimum_scale') else 1e-10 - self.default_scale = config.default_scale if hasattr(config, 'default_scale') else None - + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 + self.default_scale = config.default_scale if hasattr(config, "default_scale") else None def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor @@ -1148,11 +1154,11 @@ class PatchTSTNOPScaler(nn.Module): def __init__(self, config: PatchTSTConfig): super().__init__() - self.dim = config.scaling_dim if hasattr(config, 'scaling_dim') else 1 - self.keepdim = config.keepdim if hasattr(config, 'keepdim') else True + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor=None + self, data: torch.Tensor, observed_indicator: torch.Tensor = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters: @@ -1243,8 +1249,7 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple. 
Returns: - `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or - `config.return_dict`=False) + `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False) """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1650,7 +1655,8 @@ def forward( if future_values is not None: if self.distribution_output: distribution = self.distribution_output.distribution( - y_hat, loc=model_output.loc, scale=model_output.scale) + y_hat, loc=model_output.loc, scale=model_output.scale + ) loss_val = nll(distribution, future_values) # take average of the loss loss_val = weighted_average(loss_val) From dcfd2015b56af0a1b76eeebfecdeddb9791e8138 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 10 Nov 2023 19:48:29 +0100 Subject: [PATCH 186/189] initialize scalers with configs --- .../models/autoformer/modeling_autoformer.py | 118 +++++++++--------- .../models/informer/modeling_informer.py | 118 +++++++++--------- .../models/patchtst/configuration_patchtst.py | 2 +- .../models/patchtst/modeling_patchtst.py | 6 +- .../modeling_time_series_transformer.py | 112 +++++++++-------- 5 files changed, 181 insertions(+), 175 deletions(-) diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index 92e9df2c7e5b1b..8f26274b44bcdb 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -208,71 +208,70 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: ) -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->Autoformer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer class AutoformerStdScaler(nn.Module): """ - Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it - by subtracting from the mean and dividing by the standard deviation. - - Args: - dim (`int`): - Dimension along which to calculate the mean and standard deviation. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - minimum_scale (`float`, *optional*, defaults to 1e-5): - Default scale that is used for elements that are constantly zero along dimension `dim`. + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by + subtracting from the mean and dividing by the standard deviation. 
""" - def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + def __init__(self, config: AutoformerConfig): super().__init__() - if not dim > 0: - raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 - @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - denominator = weights.sum(self.dim, keepdim=self.keepdim) + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->Autoformer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer class AutoformerMeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - default_scale (`float`, *optional*, defaults to `None`): - Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. - minimum_scale (`float`, *optional*, defaults to 1e-10): - Default minimum possible scale that is used for any item. 
""" - def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 - ): + def __init__(self, config: AutoformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale - self.default_scale = default_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 + self.default_scale = config.default_scale if hasattr(config, "default_scale") else None - @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # shape: (N, [C], T=1) + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -300,26 +299,29 @@ def forward( return scaled_data, torch.zeros_like(scale), scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->Autoformer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer class AutoformerNOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. 
""" - def __init__(self, dim: int, keepdim: bool = False): + def __init__(self, config: AutoformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor + self, data: torch.Tensor, observed_indicator: torch.Tensor = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale @@ -1433,11 +1435,11 @@ def __init__(self, config: AutoformerConfig): super().__init__(config) if config.scaling == "mean" or config.scaling is True: - self.scaler = AutoformerMeanScaler(dim=1, keepdim=True) + self.scaler = AutoformerMeanScaler(config) elif config.scaling == "std": - self.scaler = AutoformerStdScaler(dim=1, keepdim=True) + self.scaler = AutoformerStdScaler(config) else: - self.scaler = AutoformerNOPScaler(dim=1, keepdim=True) + self.scaler = AutoformerNOPScaler(config) if config.num_static_categorical_features > 0: self.embedder = AutoformerFeatureEmbedder( diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index c0a5a205950285..205c8ba22f743e 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -81,71 +81,70 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: ) -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->Informer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer class InformerStdScaler(nn.Module): """ - Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it - by subtracting from the mean and dividing by the standard deviation. - - Args: - dim (`int`): - Dimension along which to calculate the mean and standard deviation. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - minimum_scale (`float`, *optional*, defaults to 1e-5): - Default scale that is used for elements that are constantly zero along dimension `dim`. + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by + subtracting from the mean and dividing by the standard deviation. 
""" - def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + def __init__(self, config: InformerConfig): super().__init__() - if not dim > 0: - raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 - @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - denominator = weights.sum(self.dim, keepdim=self.keepdim) + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->Informer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer class InformerMeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - default_scale (`float`, *optional*, defaults to `None`): - Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. - minimum_scale (`float`, *optional*, defaults to 1e-10): - Default minimum possible scale that is used for any item. 
""" - def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 - ): + def __init__(self, config: InformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale - self.default_scale = default_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 + self.default_scale = config.default_scale if hasattr(config, "default_scale") else None - @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # shape: (N, [C], T=1) + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -173,26 +172,29 @@ def forward( return scaled_data, torch.zeros_like(scale), scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->Informer +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer class InformerNOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. 
""" - def __init__(self, dim: int, keepdim: bool = False): + def __init__(self, config: InformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor + self, data: torch.Tensor, observed_indicator: torch.Tensor = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale @@ -1446,11 +1448,11 @@ def __init__(self, config: InformerConfig): super().__init__(config) if config.scaling == "mean" or config.scaling is True: - self.scaler = InformerMeanScaler(dim=1, keepdim=True) + self.scaler = InformerMeanScaler(config) elif config.scaling == "std": - self.scaler = InformerStdScaler(dim=1, keepdim=True) + self.scaler = InformerStdScaler(config) else: - self.scaler = InformerNOPScaler(dim=1, keepdim=True) + self.scaler = InformerNOPScaler(config) if config.num_static_categorical_features > 0: self.embedder = InformerFeatureEmbedder( diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py index 65711f2c599437..4ced00c3604600 100644 --- a/src/transformers/models/patchtst/configuration_patchtst.py +++ b/src/transformers/models/patchtst/configuration_patchtst.py @@ -68,7 +68,7 @@ class PatchTSTConfig(PretrainedConfig): Dimension of the "intermediate" (often named feed-forward) layer in encoder. norm (`str` , *optional*, defaults to `"BatchNorm"`): Normalization at each Transformer layer. Can be `"BatchNorm"` or `"LayerNorm"`. - norm_eps (`float`, *optional*, defaults to 1e-5): + norm_eps (`float`, *optional*, defaults to 1e-05): A value added to the denominator for numerical stability of normalization. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for the attention probabilities. 
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 2a09e5a4dfadc4..658140fc1c087a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -1055,7 +1055,7 @@ def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] return input_tensor.mean(dim=dim) -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->PatchTST +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->PatchTST,TimeSeries->PatchTST class PatchTSTStdScaler(nn.Module): """ Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by @@ -1091,7 +1091,7 @@ def forward( return (data - loc) / scale, loc, scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->PatchTST +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->PatchTST,TimeSeries->PatchTST class PatchTSTMeanScaler(nn.Module): """ Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data @@ -1146,7 +1146,7 @@ def forward( return scaled_data, torch.zeros_like(scale), scale -# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->PatchTST +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->PatchTST,TimeSeries->PatchTST class PatchTSTNOPScaler(nn.Module): """ Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index 904c02b4f04308..2c875dd56e1b08 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -83,67 +83,66 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: class TimeSeriesStdScaler(nn.Module): """ - Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it - by subtracting from the mean and dividing by the standard deviation. - - Args: - dim (`int`): - Dimension along which to calculate the mean and standard deviation. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - minimum_scale (`float`, *optional*, defaults to 1e-5): - Default scale that is used for elements that are constantly zero along dimension `dim`. + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by + subtracting from the mean and dividing by the standard deviation. 
""" - def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5): + def __init__(self, config: TimeSeriesTransformerConfig): super().__init__() - if not dim > 0: - raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0") - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 - @torch.no_grad() - def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - denominator = weights.sum(self.dim, keepdim=self.keepdim) + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim) denominator = denominator.clamp_min(1.0) - loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator + loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator - variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator scale = torch.sqrt(variance + self.minimum_scale) return (data - loc) / scale, loc, scale class TimeSeriesMeanScaler(nn.Module): """ - Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data + Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data accordingly. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. - default_scale (`float`, *optional*, defaults to `None`): - Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch. - minimum_scale (`float`, *optional*, defaults to 1e-10): - Default minimum possible scale that is used for any item. 
""" - def __init__( - self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10 - ): + def __init__(self, config: TimeSeriesTransformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim - self.minimum_scale = minimum_scale - self.default_scale = default_scale + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 + self.default_scale = config.default_scale if hasattr(config, "default_scale") else None - @torch.no_grad() def forward( self, data: torch.Tensor, observed_indicator: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # shape: (N, [C], T=1) + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) num_observed = observed_indicator.sum(self.dim, keepdim=True) @@ -173,23 +172,26 @@ def forward( class TimeSeriesNOPScaler(nn.Module): """ - Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data. - - Args: - dim (`int`): - Dimension along which to compute the scale. - keepdim (`bool`, *optional*, defaults to `False`): - Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it. + Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. 
""" - def __init__(self, dim: int, keepdim: bool = False): + def __init__(self, config: TimeSeriesTransformerConfig): super().__init__() - self.dim = dim - self.keepdim = keepdim + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True def forward( - self, data: torch.Tensor, observed_indicator: torch.Tensor + self, data: torch.Tensor, observed_indicator: torch.Tensor = None ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) return data, loc, scale @@ -1180,11 +1182,11 @@ def __init__(self, config: TimeSeriesTransformerConfig): super().__init__(config) if config.scaling == "mean" or config.scaling is True: - self.scaler = TimeSeriesMeanScaler(dim=1, keepdim=True) + self.scaler = TimeSeriesMeanScaler(config) elif config.scaling == "std": - self.scaler = TimeSeriesStdScaler(dim=1, keepdim=True) + self.scaler = TimeSeriesStdScaler(config) else: - self.scaler = TimeSeriesNOPScaler(dim=1, keepdim=True) + self.scaler = TimeSeriesNOPScaler(config) if config.num_static_categorical_features > 0: self.embedder = TimeSeriesFeatureEmbedder( From 5ed7a9fb86a677a32472f14c37b3718b30d7425d Mon Sep 17 00:00:00 2001 From: nnguyen Date: Sun, 12 Nov 2023 21:43:54 -0500 Subject: [PATCH 187/189] edit output_hidden_states --- src/transformers/models/patchtst/modeling_patchtst.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 2a09e5a4dfadc4..6c48c686e5c163 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -770,9 +770,7 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states # Input embedding patch_input = self.embedder(patch_input) From 01294fd0e9302e467e8e928fb5ec3aae9d89806b Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 13 Nov 2023 11:21:13 +0100 Subject: [PATCH 188/189] style --- src/transformers/models/patchtst/modeling_patchtst.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 640a7786c9411f..658140fc1c087a 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -770,7 +770,9 @@ def forward( `BaseModelOutput` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states 
= ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) # Input embedding patch_input = self.embedder(patch_input) From 9bf4074b7d93e544b8daff316e04ad5004cbbb4e Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 13 Nov 2023 12:23:55 +0100 Subject: [PATCH 189/189] fix forecast_mask_patches doc string --- src/transformers/models/patchtst/modeling_patchtst.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 658140fc1c087a..30522a048f024d 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -331,10 +331,10 @@ def forecast_masking( inputs (`torch.Tensor`): Input of shape `(bs, num_channels, num_patch, patch_len)` or `(bs, tsg1, tag2, num_channels, num_patch, patch_len)` - forecast_mask_patches (`list`): [2, 4] - List of patch lengths to mask in the end of the data. - forecast_mask_ratios (`list`, *optional*): [0.7, 0.3] - List of weights to use for each patch length. For Ex. if forecast_mask_patches is [5,4] and + forecast_mask_patches (`list`): + List of patch lengths to mask at the end of the data e.g. [2, 4]. + forecast_mask_ratios (`list`, *optional*): + List of weights to use for each patch length. For example if forecast_mask_patches is [5,4] and forecast_mask_ratios is [1,1], then equal weights to both patch lengths. unmasked_channel_indices (`list`, *optional*): Control Variable channel indices. These channels will not be masked.
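To make the corrected docstring concrete, here is a toy sketch of the behaviour it describes: each sample draws one of the listed trailing patch lengths with probability proportional to `forecast_mask_ratios`, and that many patches at the end of the series are masked. This illustrates the documented semantics only and is not the library's `forecast_masking` implementation, which may partition the batch differently.

import torch

def toy_forecast_mask(inputs, forecast_mask_patches=(2, 4), forecast_mask_ratios=(1, 1)):
    # inputs: (bs, num_channels, num_patch, patch_length)
    bs, num_channels, num_patch, patch_length = inputs.shape
    weights = torch.tensor(forecast_mask_ratios, dtype=torch.float)
    # pick one of the patch lengths per sample, weighted by the ratios
    choice = torch.multinomial(weights, bs, replacement=True)
    mask = torch.zeros(bs, num_channels, num_patch, dtype=torch.bool)
    for i, idx in enumerate(choice):
        num_masked = forecast_mask_patches[int(idx)]
        mask[i, :, num_patch - num_masked:] = True  # mask the last `num_masked` patches
    return inputs.masked_fill(mask.unsqueeze(-1), 0.0), mask

inputs = torch.randn(3, 2, 8, 16)  # (bs, num_channels, num_patch, patch_length)
masked, mask = toy_forecast_mask(inputs)
print(mask[:, 0].sum(dim=-1))      # each sample has its last 2 or last 4 patches masked

With `forecast_mask_patches=[5, 4]` and `forecast_mask_ratios=[1, 1]`, as in the docstring example, each sample is equally likely to have its last five or its last four patches masked.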