fix style

2025-08-03 03:31:05 +06:00 · 2023-01-30 12:04:13 +01:00 · 2023-01-30 12:04:13 +01:00 · fdffeb819c
commit fdffeb819c
parent 83d39df0b1
8 changed files with 122 additions and 161 deletions
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@ -290,6 +290,10 @@ _import_structure = {
    "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"],
    "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"],
    "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"],
    "models.informer": [
        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "InformerConfig",
    ],
    "models.jukebox": [
        "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "JukeboxConfig",
@ -414,10 +418,6 @@ _import_structure = {
        "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "TimeSeriesTransformerConfig",
    ],
    "models.informer": [
        "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "InformerConfig",
    ],
    "models.timesformer": ["TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimesformerConfig"],
    "models.trajectory_transformer": [
        "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
@ -1621,6 +1621,14 @@ else:
            "load_tf_weights_in_imagegpt",
        ]
    )
    _import_structure["models.informer"].extend(
        [
            "INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
            "InformerForPrediction",
            "InformerModel",
            "InformerPreTrainedModel",
        ]
    )
    _import_structure["models.jukebox"].extend(
        [
            "JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST",
@ -2275,14 +2283,6 @@ else:
            "TimeSeriesTransformerPreTrainedModel",
        ]
    )
    _import_structure["models.informer"].extend(
        [
            "INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
            "InformerForPrediction",
            "InformerModel",
            "InformerPreTrainedModel",
        ]
    )
    _import_structure["models.timesformer"].extend(
        [
            "TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
@ -3741,6 +3741,7 @@ if TYPE_CHECKING:
    from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig
    from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig
    from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig
    from .models.informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig
    from .models.jukebox import (
        JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP,
        JukeboxConfig,
@ -3855,10 +3856,6 @@ if TYPE_CHECKING:
        TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        TimeSeriesTransformerConfig,
    )
    from .models.informer import (
        INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        InformerConfig,
    )
    from .models.timesformer import TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimesformerConfig
    from .models.trajectory_transformer import (
        TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
@ -4865,6 +4862,12 @@ if TYPE_CHECKING:
            ImageGPTPreTrainedModel,
            load_tf_weights_in_imagegpt,
        )
        from .models.informer import (
            INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            InformerForPrediction,
            InformerModel,
            InformerPreTrainedModel,
        )
        from .models.jukebox import (
            JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
            JukeboxModel,
@ -5393,12 +5396,6 @@ if TYPE_CHECKING:
            TimeSeriesTransformerModel,
            TimeSeriesTransformerPreTrainedModel,
        )
        from .models.informer import (
            INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            InformerForPrediction,
            InformerModel,
            InformerPreTrainedModel,
        )
        from .models.timesformer import (
            TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TimesformerForVideoClassification,
--- a/src/transformers/models/init.py
+++ b/src/transformers/models/init.py
@ -90,6 +90,7 @@ from . import (
    hubert,
    ibert,
    imagegpt,
    informer,
    jukebox,
    layoutlm,
    layoutlmv2,
@ -165,7 +166,6 @@ from . import (
    tapas,
    tapex,
    time_series_transformer,
    informer,
    timesformer,
    trajectory_transformer,
    transfo_xl,
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@ -93,6 +93,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
        ("hubert", "HubertConfig"),
        ("ibert", "IBertConfig"),
        ("imagegpt", "ImageGPTConfig"),
        ("informer", "InformerConfig"),
        ("jukebox", "JukeboxConfig"),
        ("layoutlm", "LayoutLMConfig"),
        ("layoutlmv2", "LayoutLMv2Config"),
@ -161,7 +162,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
        ("table-transformer", "TableTransformerConfig"),
        ("tapas", "TapasConfig"),
        ("time_series_transformer", "TimeSeriesTransformerConfig"),
        ("informer", "InformerConfig"),
        ("timesformer", "TimesformerConfig"),
        ("trajectory_transformer", "TrajectoryTransformerConfig"),
        ("transfo-xl", "TransfoXLConfig"),
@ -258,6 +258,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
        ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("jukebox", "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("layoutlmv2", "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@ -319,7 +320,6 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
        ("table-transformer", "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("informer", "INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@ -424,6 +424,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("hubert", "Hubert"),
        ("ibert", "I-BERT"),
        ("imagegpt", "ImageGPT"),
        ("informer", "Informer"),
        ("jukebox", "Jukebox"),
        ("layoutlm", "LayoutLM"),
        ("layoutlmv2", "LayoutLMv2"),
@ -500,7 +501,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("tapas", "TAPAS"),
        ("tapex", "TAPEX"),
        ("time_series_transformer", "Time Series Transformer"),
        ("informer", "Informer"),
        ("timesformer", "TimeSformer"),
        ("trajectory_transformer", "Trajectory Transformer"),
        ("transfo-xl", "Transformer-XL"),
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@ -92,6 +92,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("hubert", "HubertModel"),
        ("ibert", "IBertModel"),
        ("imagegpt", "ImageGPTModel"),
        ("informer", "InformerModel"),
        ("jukebox", "JukeboxModel"),
        ("layoutlm", "LayoutLMModel"),
        ("layoutlmv2", "LayoutLMv2Model"),
@ -157,7 +158,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("table-transformer", "TableTransformerModel"),
        ("tapas", "TapasModel"),
        ("time_series_transformer", "TimeSeriesTransformerModel"),
        ("informer", "InformerModel"),
        ("timesformer", "TimesformerModel"),
        ("trajectory_transformer", "TrajectoryTransformerModel"),
        ("transfo-xl", "TransfoXLModel"),
--- a/src/transformers/models/informer/init.py
+++ b/src/transformers/models/informer/init.py
@ -43,10 +43,7 @@ else:
 if TYPE_CHECKING:
-    from .configuration_informer import (
+    from .configuration_informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig
        INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        InformerConfig,
    )
    try:
        if not is_torch_available():
--- a/src/transformers/models/informer/configuration_informer.py
+++ b/src/transformers/models/informer/configuration_informer.py
@ -27,13 +27,11 @@ INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }
 class InformerConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a [`InformerModel`]. It is used to
+    This is the configuration class to store the configuration of a [`InformerModel`]. It is used to instantiate an
-    instantiate an Informer model according to the specified arguments, defining the model architecture.
+    Informer model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    Instantiating a configuration with the defaults will yield a similar configuration to that of the Time Series
+    with the defaults will yield a similar configuration to that of the Time Series Transformer
    Transformer
    [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
    architecture.
@ -136,47 +134,47 @@ class InformerConfig(PretrainedConfig):
    }
    def __init__(
-            self,
+        self,
-            input_size: int = 1,
+        input_size: int = 1,
-            prediction_length: Optional[int] = None,
+        prediction_length: Optional[int] = None,
-            context_length: Optional[int] = None,
+        context_length: Optional[int] = None,
-            distribution_output: str = "student_t",
+        distribution_output: str = "student_t",
-            loss: str = "nll",
+        loss: str = "nll",
-            lags_sequence: List[int] = None,
+        lags_sequence: List[int] = None,
-            scaling: bool = True,
+        scaling: bool = True,
-            num_dynamic_real_features: int = 0,
+        num_dynamic_real_features: int = 0,
-            num_static_real_features: int = 0,
+        num_static_real_features: int = 0,
-            num_static_categorical_features: int = 0,
+        num_static_categorical_features: int = 0,
-            num_time_features: int = 0,
+        num_time_features: int = 0,
-            cardinality: Optional[List[int]] = None,
+        cardinality: Optional[List[int]] = None,
-            embedding_dimension: Optional[List[int]] = None,
+        embedding_dimension: Optional[List[int]] = None,
-            encoder_ffn_dim: int = 32,
+        encoder_ffn_dim: int = 32,
-            decoder_ffn_dim: int = 32,
+        decoder_ffn_dim: int = 32,
-            encoder_attention_heads: int = 2,
+        encoder_attention_heads: int = 2,
-            decoder_attention_heads: int = 2,
+        decoder_attention_heads: int = 2,
-            encoder_layers: int = 2,
+        encoder_layers: int = 2,
-            decoder_layers: int = 2,
+        decoder_layers: int = 2,
-            is_encoder_decoder: bool = True,
+        is_encoder_decoder: bool = True,
-            activation_function: str = "gelu",
+        activation_function: str = "gelu",
-            dropout: float = 0.05,
+        dropout: float = 0.05,
-            encoder_layerdrop: float = 0.1,
+        encoder_layerdrop: float = 0.1,
-            decoder_layerdrop: float = 0.1,
+        decoder_layerdrop: float = 0.1,
-            attention_dropout: float = 0.1,
+        attention_dropout: float = 0.1,
-            activation_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
-            num_parallel_samples: int = 100,
+        num_parallel_samples: int = 100,
-            init_std: float = 0.02,
+        init_std: float = 0.02,
-            use_cache=True,
+        use_cache=True,
-            # Informer arguments
+        # Informer arguments
-            attn: str = "prob",
+        attn: str = "prob",
-            factor: int = 5,
+        factor: int = 5,
-            distil: bool = True,
+        distil: bool = True,
-            **kwargs
+        **kwargs
    ):
        # time series specific configuration
        self.prediction_length = prediction_length
        self.context_length = context_length or prediction_length
        self.distribution_output = distribution_output
-        self.loss = loss # Eli: From vanilla ts transformer
+        self.loss = loss  # Eli: From vanilla ts transformer
        self.input_size = input_size
        self.num_time_features = num_time_features
        self.lags_sequence = lags_sequence
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@ -17,9 +17,12 @@
 import random
 from dataclasses import dataclass
 from math import sqrt
 from typing import Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn
 from torch.distributions import (
    AffineTransform,
@ -37,11 +40,6 @@ from ...modeling_utils import PreTrainedModel
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_informer import InformerConfig
 from math import sqrt
 from typing import List, Optional
 import numpy as np
 import torch.nn.functional as F
 logger = logging.get_logger(__name__)
@ -54,7 +52,6 @@ INFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 class AffineTransformed(TransformedDistribution):
    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
        self.scale = 1.0 if scale is None else scale
@ -472,6 +469,7 @@ class Seq2SeqTimeSeriesModelOutput(ModelOutput):
    scale: Optional[torch.FloatTensor] = None
    static_features: Optional[torch.FloatTensor] = None
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
@dataclass
 class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
@ -540,6 +538,7 @@ class Seq2SeqTimeSeriesPredictionOutput(ModelOutput):
    scale: Optional[torch.FloatTensor] = None
    static_features: Optional[torch.FloatTensor] = None
 # Copied from transformers.models.time_series_transformer.modeling_time_series_transformer
@dataclass
 class SampleTimeSeriesPredictionOutput(ModelOutput):
@ -554,9 +553,7 @@ class TriangularCausalMask:
    def __init__(self, B, L, device="cpu"):
        mask_shape = [B, 1, L, L]
        with torch.no_grad():
-            self._mask = torch.triu(
+            self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)
                torch.ones(mask_shape, dtype=torch.bool), diagonal=1
            ).to(device)
    @property
    def mask(self):
@ -568,9 +565,7 @@ class ProbMask:
    def __init__(self, B, H, L, index, scores, device="cpu"):
        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
-        indicator = _mask_ex[
+        indicator = _mask_ex[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :].to(device)
            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
        ].to(device)
        self._mask = indicator.view(scores.shape).to(device)
    @property
@ -597,7 +592,7 @@ class FullAttention(nn.Module):
    def forward(self, queries, keys, values, attn_mask):
        B, L, H, E = queries.shape
        _, S, _, D = values.shape
-        scale = self.scale or 1. / sqrt(E)
+        scale = self.scale or 1.0 / sqrt(E)
        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
        if self.mask_flag:
@ -673,14 +668,12 @@ class ProbAttention(nn.Module):
        attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)
-        context_in[
+        context_in[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = torch.matmul(
-            torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
+            attn, V
-        ] = torch.matmul(attn, V).type_as(context_in)
+        ).type_as(context_in)
        if self.output_attention:
            attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device)
-            attns[
+            attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = attn
                torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :
            ] = attn
            return (context_in, attns)
        else:
            return (context_in, None)
@ -708,18 +701,14 @@ class ProbAttention(nn.Module):
        # get the context
        context = self._get_initial_context(values, L_Q)
        # update the context with selected top_k queries
-        context, attn = self._update_context(
+        context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask)
            context, values, scores_top, index, L_Q, attn_mask
        )
        return context.transpose(2, 1).contiguous(), attn
 # source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/attn.py
 class AttentionLayer(nn.Module):
-    def __init__(
+    def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False):
        self, attention, d_model, n_heads, d_keys=None, d_values=None, mix=False
    ):
        super(AttentionLayer, self).__init__()
        d_keys = d_keys or (d_model // n_heads)
@ -761,13 +750,13 @@ class ConvLayer(nn.Module):
            padding=1,
            padding_mode="circular",
        )
-        self.norm = nn.BatchNorm1d(c_in) # Eli question: why batchnorm here?
+        self.norm = nn.BatchNorm1d(c_in)  # Eli question: why batchnorm here?
        self.activation = nn.ELU()
        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
    def forward(self, x):
        x = self.downConv(x.permute(0, 2, 1))
-        x = self.norm(x) # Eli: why? maybe because the impl...
+        x = self.norm(x)  # Eli: why? maybe because the impl...
        x = self.activation(x)
        x = self.maxPool(x)
        x = x.transpose(1, 2)
@ -830,9 +819,7 @@ class DecoderLayer(nn.Module):
        x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0])
        x = self.norm1(x)
-        x = x + self.dropout(
+        x = x + self.dropout(self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0])
            self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
        )
        y = x = self.norm2(x)
        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
@ -847,8 +834,9 @@ class InformerEncoder(nn.Module):
        self.activation_fn = ACT2FN[config.activation_function]
        Attn = ProbAttention if config.attn == "prob" else FullAttention
-        self.attn_layers = nn.ModuleList([
+        self.attn_layers = nn.ModuleList(
-            EncoderLayer(
+            [
                EncoderLayer(
                    AttentionLayer(
                        Attn(
                            mask_flag=False,
@ -864,8 +852,10 @@ class InformerEncoder(nn.Module):
                    d_ff=config.encoder_ffn_dim,
                    dropout=config.attention_dropout,
                    activation=self.activation_fn,
-                ) for _ in range(config.encoder_layers)
+                )
-        ])
+                for _ in range(config.encoder_layers)
            ]
        )
        if config.distil is not None:
            self.conv_layers = nn.ModuleList([ConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)])
@ -1000,22 +990,15 @@ class InformerModel(InformerPreTrainedModel):
        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
    ) -> torch.Tensor:
        """
-        Returns lagged subsequences of a given sequence.
+        Returns lagged subsequences of a given sequence. Parameters ---------- sequence : Tensor
-        Parameters
+            the sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
        ----------
        sequence : Tensor
            the sequence from which lagged subsequences should be extracted.
            Shape: (N, T, C).
        subsequences_length : int
            length of the subsequences to be extracted.
        shift: int
            shift the lags by this amount back.
-        Returns
+        Returns -------- lagged : Tensor
-        --------
+            a tensor of shape (N, S, C, I), where S = subsequences_length and I = len(indices), containing lagged
-        lagged : Tensor
+            subsequences. Specifically, lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
            a tensor of shape (N, S, C, I), where S = subsequences_length and
            I = len(indices), containing lagged subsequences. Specifically,
            lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :].
        """
        sequence_length = sequence.shape[1]
        indices = [lag - shift for lag in self.config.lags_sequence]
@ -1125,24 +1108,24 @@ class InformerModel(InformerPreTrainedModel):
        return self.decoder
    def forward(
-            self,
+        self,
-            past_values: torch.Tensor,
+        past_values: torch.Tensor,
-            past_time_features: torch.Tensor,
+        past_time_features: torch.Tensor,
-            past_observed_mask: torch.Tensor,
+        past_observed_mask: torch.Tensor,
-            static_categorical_features: torch.Tensor,
+        static_categorical_features: torch.Tensor,
-            static_real_features: torch.Tensor,
+        static_real_features: torch.Tensor,
-            future_values: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
-            future_time_features: Optional[torch.Tensor] = None,
+        future_time_features: Optional[torch.Tensor] = None,
-            decoder_attention_mask: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
-            head_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
-            decoder_head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
-            cross_attn_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
-            encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-            past_key_values: Optional[List[torch.FloatTensor]] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
-            output_hidden_states: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
-            output_attentions: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
-            use_cache: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
    ) -> Union[Seq2SeqTimeSeriesModelOutput, Tuple]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@ -1178,7 +1161,7 @@ class InformerModel(InformerPreTrainedModel):
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )
-        dec_input = transformer_inputs[:, self.config.context_length:, ...]
+        dec_input = transformer_inputs[:, self.config.context_length :, ...]
        decoder_outputs = self.decoder(
            inputs_embeds=dec_input,
            attention_mask=decoder_attention_mask,
@ -1462,6 +1445,3 @@ class InformerForPrediction(InformerPreTrainedModel):
                (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape,
            )
        )
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@ -31,15 +31,8 @@ TOLERANCE = 1e-4
 if is_torch_available():
    import torch
-    from transformers import (
+    from transformers import InformerConfig, InformerForPrediction, InformerModel
-        InformerConfig,
+    from transformers.models.informer.modeling_informer import InformerDecoder, InformerEncoder
        InformerForPrediction,
        InformerModel,
    )
    from transformers.models.informer.modeling_informer import (
        InformerDecoder,
        InformerEncoder,
    )
@require_torch
@ -171,9 +164,7 @@ class InformerModelTester:
@require_torch
 class InformerModelTest(ModelTesterMixin, unittest.TestCase):
-    all_model_classes = (
+    all_model_classes = (InformerModel, InformerForPrediction) if is_torch_available() else ()
        (InformerModel, InformerForPrediction) if is_torch_available() else ()
    )
    all_generative_model_classes = (InformerForPrediction,) if is_torch_available() else ()
    is_encoder_decoder = True
    test_pruning = False
@ -374,9 +365,7 @@ def prepare_batch(filename="train-batch.pt"):
@slow
 class InformerModelIntegrationTests(unittest.TestCase):
    def test_inference_no_head(self):
-        model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
+        model = InformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(torch_device)
            torch_device
        )
        batch = prepare_batch()
        with torch.no_grad():
@ -399,9 +388,9 @@ class InformerModelIntegrationTests(unittest.TestCase):
        self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
    def test_inference_head(self):
-        model = InformerForPrediction.from_pretrained(
+        model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
-            "huggingface/time-series-transformer-tourism-monthly"
+            torch_device
-        ).to(torch_device)
+        )
        batch = prepare_batch("val-batch.pt")
        with torch.no_grad():
            output = model(
@ -421,9 +410,9 @@ class InformerModelIntegrationTests(unittest.TestCase):
        self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
    def test_seq_to_seq_generation(self):
-        model = InformerForPrediction.from_pretrained(
+        model = InformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly").to(
-            "huggingface/time-series-transformer-tourism-monthly"
+            torch_device
-        ).to(torch_device)
+        )
        batch = prepare_batch("val-batch.pt")
        with torch.no_grad():
            outputs = model.generate(