Docs: formatting nits (#32247)

* doc formatting nits

* ignore non-autodocs

* Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* make fixup

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Joao Gante 2024-07-30 15:49:14 +01:00 committed by GitHub
parent 2fbbcf5007
commit e68ec18ce2
88 changed files with 172 additions and 170 deletions
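
Most of the hunks below enforce one docstring convention: type names in backticks and *optional* set off with asterisks (so it renders italic), followed by the spelled-out default. A hypothetical parameter block in the style these diffs converge on (the function and its arguments are illustrative, not taken from the commit):

```python
def resize(image, size, resample=None):
    """
    Args:
        image (`np.ndarray`):
            The image to resize.
        size (`Tuple[int, int]`):
            Target size as (height, width).
        resample (`int`, *optional*, defaults to `None`):
            Resampling filter to use, if any.
    """
    ...
```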


@@ -77,7 +77,7 @@ Then use `notebook_login` to sign-in to the Hub, and follow the link [here](http
To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly.
Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework.
<frameworkcontent>
<pt>
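
The passage above recommends shipping both PyTorch and TensorFlow weights; a minimal conversion sketch, assuming a PyTorch-only checkpoint saved locally under `./my-model` (the path is illustrative; `from_pt=True` is the flag the library exposes for cross-loading):

```python
from transformers import TFAutoModel

# Load the PyTorch weights into the TF architecture, then save a TF checkpoint
tf_model = TFAutoModel.from_pretrained("./my-model", from_pt=True)
tf_model.save_pretrained("./my-model")  # the directory now holds both checkpoints
```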


@@ -147,7 +147,7 @@ def get_original_command(max_width=80, full_python_path=False):
Return the original command line string that can be replayed nicely and wrapped for 80 char width.
Args:
-max_width (`int`, `optional`, defaults to 80):
+max_width (`int`, *optional*, defaults to 80):
The width to wrap for.
full_python_path (`bool`, `optional`, defaults to `False`):
Whether to replicate the full path or just the last segment (i.e. `python`).


@@ -113,7 +113,7 @@ class Problem:
The inputs that will be fed to the tools. For this testing environment, only strings are accepted as
values. Pass along a dictionary when you want to specify the values of each inputs, or just the list of
inputs expected (the value used will be `<<input_name>>` in this case).
-answer (`str` or `list[str`]):
+answer (`str` or `list[str]`):
The theoretical answer (or list of possible valid answers) to the problem, as code.
"""


@@ -663,7 +663,7 @@ def spectrogram_batch(
Specifies log scaling strategy; options are None, "log", "log10", "dB".
reference (`float`, *optional*, defaults to 1.0):
Reference value for dB conversion in log_mel.
-min_value (`float`, °optional*, defaults to 1e-10):
+min_value (`float`, *optional*, defaults to 1e-10):
Minimum floor value for log scale conversions.
db_range (`float`, *optional*):
Dynamic range for dB scale spectrograms.


@@ -542,7 +542,7 @@ class QuantoQuantizedCache(QuantizedCache):
Quantized Cache class that uses `quanto` as a backend to perform quantization. Current implementation supports `int2` and `int4` dtypes only.
Parameters:
-cache_config (`QuantizedCacheConfig`,):
+cache_config (`QuantizedCacheConfig`):
A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
"""
@@ -583,7 +583,7 @@ class HQQQuantizedCache(QuantizedCache):
Quantized Cache class that uses `HQQ` as a backend to perform quantization. Current implementation supports `int2`, `int4`, `int8` dtypes.
Parameters:
-cache_config (`QuantizedCacheConfig`,):
+cache_config (`QuantizedCacheConfig`):
A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
"""
@@ -794,7 +794,7 @@ class StaticCache(Cache):
Static Cache class to be used with `torch.compile(model)` and `torch.export()`.
Parameters:
-config (`PretrainedConfig):
+config (`PretrainedConfig`):
The configuration file defining the shape-related attributes required to initialize the static cache.
max_batch_size (`int`):
The maximum batch size with which the model will be used.
@@ -924,7 +924,7 @@ class SlidingWindowCache(StaticCache):
We overwrite the cache using these, then we always write at cache_position (clamped to `sliding_window`)
Parameters:
-config (`PretrainedConfig):
+config (`PretrainedConfig`):
The configuration file defining the shape-related attributes required to initialize the static cache.
max_batch_size (`int`):
The maximum batch size with which the model will be used.
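
Since `StaticCache` is documented as the companion of `torch.compile(model)`, a short usage sketch may help; the checkpoint name is illustrative, and this assumes the `cache_implementation` argument of `generate()`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("Static caches keep key/value tensors at a fixed shape", return_tensors="pt")

# A fixed-shape cache avoids recompilation when the model is compiled
out = model.generate(**inputs, max_new_tokens=8, cache_implementation="static")
print(tok.decode(out[0], skip_special_tokens=True))
```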


@@ -225,7 +225,7 @@ def get_resize_output_image_size(
Args:
input_image (`np.ndarray`):
The image to resize.
-size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]):
+size (`int` or `Tuple[int, int]` or List[int] or `Tuple[int]`):
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to
this.


@@ -1389,7 +1389,7 @@ class NeptuneCallback(TrainerCallback):
You can find and copy the name in Neptune from the project settings -> Properties. If None (default), the
value of the `NEPTUNE_PROJECT` environment variable is used.
name (`str`, *optional*): Custom name for the run.
-base_namespace (`str`, optional, defaults to "finetuning"): In the Neptune run, the root namespace
+base_namespace (`str`, *optional*, defaults to "finetuning"): In the Neptune run, the root namespace
that will contain all of the metadata logged by the callback.
log_parameters (`bool`, *optional*, defaults to `True`):
If True, logs all Trainer arguments and model parameters provided by the Trainer.


@@ -266,7 +266,7 @@ class AttentionMaskConverter:
# or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True` which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108).
# Thus, we only set `ignore_causal_mask = True` if the model is set to training.
#
-# Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` (`TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor`).
+# Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` ("TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor").
if (
(is_training or not is_tracing)
and (query_length == 1 or key_value_length == query_length)
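
The comment above turns on `is_causal` having to be a plain Python bool; a minimal reproduction of the call it refers to (shapes arbitrary):

```python
import torch
import torch.nn.functional as F

q = k = v = torch.randn(1, 4, 8, 16)  # (batch, heads, seq_len, head_dim)
# A Python bool is fine; under jit.trace a data-dependent condition such as
# `q_len > 1` becomes a Tensor, which raises the TypeError quoted above.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
```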


@@ -39,7 +39,7 @@ def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.T
Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
Return:
-indices (`torch.Tensor):
+indices (`torch.Tensor`):
The indices of non-masked tokens from the flattened input sequence.
cu_seqlens (`torch.Tensor`):
The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
@@ -83,7 +83,7 @@ def _upad_input(
Target length.
Return:
-query_layer (`torch.Tensor):
+query_layer (`torch.Tensor`):
Query state without padding. Shape: (total_target_length, num_heads, head_dim).
key_layer (`torch.Tensor`):
Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
@@ -149,7 +149,7 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids):
Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
Return:
-query (`torch.Tensor):
+query (`torch.Tensor`):
Query state without padding. Shape: (total_target_length, num_heads, head_dim).
key (`torch.Tensor`):
Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
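
The `indices`/`cu_seqlens` contract these docstrings describe fits in a few lines; a sketch mirroring the documented return values (my own illustration, not the library function):

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])                 # (batch, seq_len)
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)      # valid tokens per row
indices = torch.nonzero(attention_mask.flatten()).flatten()  # non-masked flat positions
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))

print(indices)     # tensor([0, 1, 2, 4, 5])
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32) -> shape (batch_size + 1,)
```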


@@ -1444,7 +1444,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
Args:
dataset (`Any`):
A [~`datasets.Dataset`] to be wrapped as a `tf.data.Dataset`.
-batch_size (`int`, defaults to 8):
+batch_size (`int`, *optional*, defaults to 8):
The size of batches to return.
shuffle (`bool`, defaults to `True`):
Whether to return samples from the dataset in random order. Usually `True` for training datasets and
@@ -3442,7 +3442,7 @@ class TFSequenceSummary(keras.layers.Layer):
- **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
- **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
-initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
+initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation to use to initialize the weights.
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
"""


@@ -105,10 +105,10 @@ class AutoformerConfig(PretrainedConfig):
label_length (`int`, *optional*, defaults to 10):
Start token length of the Autoformer decoder, which is used for direct multi-step prediction (i.e.
non-autoregressive generation).
-moving_average (`int`, defaults to 25):
+moving_average (`int`, *optional*, defaults to 25):
The window size of the moving average. In practice, it's the kernel size in AvgPool1d of the Decomposition
Layer.
-autocorrelation_factor (`int`, defaults to 3):
+autocorrelation_factor (`int`, *optional*, defaults to 3):
"Attention" (i.e. AutoCorrelation mechanism) factor which is used to find top k autocorrelations delays.
It's recommended in the paper to set it to a number between 1 and 5.
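
`moving_average` is described as the AvgPool1d kernel size of the decomposition layer, so the decomposition itself is easy to sketch (window 25 as in the default; the padding here is chosen to keep the sequence length and may differ from the model's exact padding scheme):

```python
import torch
import torch.nn as nn

x = torch.randn(1, 1, 96)  # (batch, channels, time), an illustrative series
pool = nn.AvgPool1d(kernel_size=25, stride=1, padding=12)
trend = pool(x)            # smooth trend component, same length as x
seasonal = x - trend       # residual seasonal component
```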


@@ -1219,7 +1219,7 @@ class BertForPreTraining(BertPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@@ -1291,7 +1291,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Return:


@@ -2290,7 +2290,7 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@@ -57,7 +57,7 @@ def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torc
Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
attention_mask (`torch.Tensor`):
Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
-num_heads (`int`, *required*):
+num_heads (`int`):
number of heads
dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
dtype of the output tensor
@@ -94,13 +94,13 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
Dropout add function
Args:
-x (`torch.tensor`, *required*):
+x (`torch.tensor`):
input tensor
-residual (`torch.tensor`, *required*):
+residual (`torch.tensor`):
residual tensor
-prob (`float`, *required*):
+prob (`float`):
dropout probability
-training (`bool`, *required*):
+training (`bool`):
training mode
"""
out = F.dropout(x, p=prob, training=training)
@@ -114,7 +114,7 @@ def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor:
make the model jitable.
Args:
-x (`torch.tensor`, *required*):
+x (`torch.tensor`):
input hidden states
"""
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
@@ -126,9 +126,9 @@ def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
0.3989423 * x * torch.exp(-0.5 * x * x)
Args:
-g (`torch.tensor`, *required*):
+g (`torch.tensor`):
gradient output tensor
-x (`torch.tensor`, *required*):
+x (`torch.tensor`):
input tensor
"""
x = x[0] # x is a tuple of 1 element, needs to unpack it first
@@ -210,7 +210,7 @@ class BloomAttention(nn.Module):
without making any copies, results share same memory storage as `fused_qkv`
Args:
-fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
+fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns:
query: [batch_size, num_heads, seq_length, head_dim]
@@ -229,7 +229,7 @@ class BloomAttention(nn.Module):
Merge heads together over the last dimension
Args:
-x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
+x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]
Returns:
torch.tensor: [batch_size, seq_length, num_heads * head_dim]
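
The tanh approximation in `bloom_gelu_forward` has a closed-form derivative, which is what lets the backward pass stay jitable; a quick check of a hand-derived gradient (my own derivation, not a quote of `bloom_gelu_back`) against autograd:

```python
import torch

def gelu_tanh(x):
    # same tanh approximation as bloom_gelu_forward above
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

x = torch.randn(16, dtype=torch.float64, requires_grad=True)
(autograd_grad,) = torch.autograd.grad(gelu_tanh(x).sum(), x)

# hand-derived d/dx of the same expression
u = 0.79788456 * x * (1 + 0.044715 * x * x)
t = torch.tanh(u)
manual = 0.5 * (1 + t) + 0.5 * x * (1 - t * t) * (0.79788456 + 3 * 0.79788456 * 0.044715 * x * x)
assert torch.allclose(autograd_grad, manual.detach())
```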


@@ -247,7 +247,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
Image to resize.
size (`Dict[str, int]`):
Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
-size_divisor (`int`, defaults to 32):
+size_divisor (`int`, *optional*, defaults to 32):
The image is resized to a size that is a multiple of this value.
resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resiizing the image.


@@ -972,7 +972,7 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@@ -173,7 +173,7 @@ class ClvpFeatureExtractor(SequenceFeatureExtractor):
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
-padding_value (`float`, defaults to 0.0):
+padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values / vectors.
max_length (`int`, *optional*):
The maximum input length of the inputs.


@@ -41,9 +41,9 @@ class ConvNextConfig(BackboneConfigMixin, PretrainedConfig):
Args:
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
-patch_size (`int`, optional, defaults to 4):
+patch_size (`int`, *optional*, defaults to 4):
Patch size to use in the patch embedding layer.
-num_stages (`int`, optional, defaults to 4):
+num_stages (`int`, *optional*, defaults to 4):
The number of stages in the model.
hidden_sizes (`List[int]`, *optional*, defaults to [96, 192, 384, 768]):
Dimensionality (hidden size) at each stage.


@@ -35,9 +35,9 @@ class ConvNextV2Config(BackboneConfigMixin, PretrainedConfig):
Args:
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
-patch_size (`int`, optional, defaults to 4):
+patch_size (`int`, *optional*, defaults to 4):
Patch size to use in the patch embedding layer.
-num_stages (`int`, optional, defaults to 4):
+num_stages (`int`, *optional*, defaults to 4):
The number of stages in the model.
hidden_sizes (`List[int]`, *optional*, defaults to `[96, 192, 384, 768]`):
Dimensionality (hidden size) at each stage.


@@ -175,7 +175,7 @@ class TFConvNextV2Layer(keras.layers.Layer):
Model configuration class.
dim (`int`):
Number of input channels.
-drop_path (`float`, defaults to 0.0):
+drop_path (`float`, *optional*, defaults to 0.0):
Stochastic depth rate.
"""


@@ -1077,7 +1077,7 @@ class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@@ -37,8 +37,8 @@ class DbrxAttentionConfig(PretrainedConfig):
The dropout probability for the attention layers.
clip_qkv (`float`, *optional*):
If set, clip the queries, keys, and values in the attention layer to this value.
-kv_n_heads (`Optional[int]`, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
+kv_n_heads (`int`, *optional*, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
-rope_theta (`float`, defaults to 10000.0): The base frequency for rope.
+rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
"""
def __init__(
@@ -92,11 +92,11 @@ class DbrxFFNConfig(PretrainedConfig):
ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN.
The dict should have a key 'name' with the value being the name of the activation function along with
any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
-ffn_hidden_size (`int`, defaults to 3584): The hidden size of the feedforward network.
+ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
-moe_num_experts (`int`, defaults to 4): The number of experts in the mixture of experts layer.
+moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture of experts layer.
-moe_top_k (`int`, defaults to 1): The number of experts to use in the mixture of experts layer.
+moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture of experts layer.
moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer.
-moe_loss_weight (`float`, defaults to 0.01): The loss weight for the mixture of experts layer.
+moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture of experts layer.
moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
"""


@@ -144,7 +144,7 @@ def load_balancing_loss_func(
Number of experts.
top_k (`int`):
The number of experts each token is routed to.
-attention_mask (`torch.Tensor`, None):
+attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function
shape [batch_size X sequence_length] if not None.
@@ -757,16 +757,16 @@ class DbrxBlock(nn.Module):
Args:
hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
position_ids (`torch.LongTensor`): position ids of shape `(batch, seq_len)`
-attention_mask (`torch.Tensor`, optional): attention mask of size (batch_size, sequence_length)
+attention_mask (`torch.Tensor`, *optional*): attention mask of size (batch_size, sequence_length)
if flash attention is used or (batch_size, 1, query_sequence_length, key_sequence_length)
if default attention is used.
-past_key_value (`Tuple(torch.Tensor)`, optional): cached past key and value projection states
+past_key_value (`Tuple(torch.Tensor)`, *optional*): cached past key and value projection states
-output_attentions (`bool`, optional): Whether or not to return the attentions tensors of all
+output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all
attention layers. See `attentions` under returned tensors for more detail.
-output_router_logits (`bool`, optional): Whether or not to return the router logits.
+output_router_logits (`bool`, *optional*): Whether or not to return the router logits.
-use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are
+use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are
returned and can be used to speed up decoding (see `past_key_values`).
-cache_position (`torch.LongTensor`, optional): position ids of the cache
+cache_position (`torch.LongTensor`, *optional*): position ids of the cache
"""
# Norm + Attention + Norm


@@ -80,7 +80,7 @@ class DebertaConfig(PretrainedConfig):
pos_att_type (`List[str]`, *optional*):
The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
`["p2c", "c2p"]`.
-layer_norm_eps (`float`, optional, defaults to 1e-12):
+layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example:


@@ -602,10 +602,10 @@ class DisentangledSelfAttention(nn.Module):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
-output_attentions (`bool`, optional):
+output_attentions (`bool`, *optional*):
Whether return the attention matrix.
-query_states (`torch.FloatTensor`, optional):
+query_states (`torch.FloatTensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*.
relative_pos (`torch.LongTensor`):


@@ -669,10 +669,10 @@ class TFDebertaDisentangledSelfAttention(keras.layers.Layer):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
-return_att (`bool`, optional):
+return_att (`bool`, *optional*):
Whether return the attention matrix.
-query_states (`tf.Tensor`, optional):
+query_states (`tf.Tensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*.
relative_pos (`tf.Tensor`):


@@ -80,7 +80,7 @@ class DebertaV2Config(PretrainedConfig):
pos_att_type (`List[str]`, *optional*):
The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
`["p2c", "c2p"]`, `["p2c", "c2p"]`.
-layer_norm_eps (`float`, optional, defaults to 1e-12):
+layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example:


@@ -678,10 +678,10 @@ class DisentangledSelfAttention(nn.Module):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
-output_attentions (`bool`, optional):
+output_attentions (`bool`, *optional*):
Whether return the attention matrix.
-query_states (`torch.FloatTensor`, optional):
+query_states (`torch.FloatTensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*.
relative_pos (`torch.LongTensor`):


@@ -738,10 +738,10 @@ class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
-return_att (`bool`, optional):
+return_att (`bool`, *optional*):
Whether return the attention matrix.
-query_states (`tf.Tensor`, optional):
+query_states (`tf.Tensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*.
relative_pos (`tf.Tensor`):


@@ -1019,7 +1019,7 @@ class ErnieForPreTraining(ErniePreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@@ -993,7 +993,7 @@ class EsmForMaskedLM(EsmPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@@ -1232,7 +1232,7 @@ class TFEsmForMaskedLM(TFEsmPreTrainedModel, TFMaskedLanguageModelingLoss):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@@ -229,13 +229,13 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
Dropout add function
Args:
-x (`torch.tensor`, *required*):
+x (`torch.tensor`):
input tensor
-residual (`torch.tensor`, *required*):
+residual (`torch.tensor`):
residual tensor
-prob (`float`, *required*):
+prob (`float`):
dropout probability
-training (`bool`, *required*):
+training (`bool`):
training mode
"""
out = F.dropout(x, p=prob, training=training)
@@ -315,7 +315,7 @@ class FalconAttention(nn.Module):
Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv`
Args:
-fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
+fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
@@ -347,7 +347,7 @@ class FalconAttention(nn.Module):
Merge heads together over the last dimension
Args:
-x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
+x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]
Returns:
torch.tensor: [batch_size, seq_length, num_heads * head_dim]
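
The copy-free split these `_split_heads`-style docstrings describe is just a view plus integer indexing; a sketch for the plain multi-head case (Falcon's real code also handles multi-query/grouped layouts):

```python
import torch

batch, seq, num_heads, head_dim = 2, 5, 4, 8
fused_qkv = torch.randn(batch, seq, num_heads * 3 * head_dim)

qkv = fused_qkv.view(batch, seq, num_heads, 3, head_dim)  # a view: no copy
query, key, value = qkv[..., 0, :], qkv[..., 1, :], qkv[..., 2, :]
# each is (batch, seq, num_heads, head_dim) and shares storage with fused_qkv
```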


@@ -389,16 +389,16 @@ class FlavaImageCodebookConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.
Args:
-num_groups (`int`, defaults to 4):
+num_groups (`int`, *optional*, defaults to 4):
Number of groups to be created. This parameter as of now doesn't affect the model and is used for some
internal calculation and estimations.
-input_channels (`int`, defaults to 3):
+input_channels (`int`, *optional*, defaults to 3):
Number of channels in the image to be passed.
-num_blocks_per_group (`int`, defaults to 2):
+num_blocks_per_group (`int`, *optional*, defaults to 2):
Number of conv-based blocks per group.
-hidden_size (`int`, defaults to 256):
+hidden_size (`int`, *optional*, defaults to 256):
Size of hidden dim for the blocks.
-vocab_size (`int`, defaults to 8192):
+vocab_size (`int`, *optional*, defaults to 8192):
Size of the output vocabulary for the codebook.
freeze (`bool`, defaults to `True`):
Whether to freeze the weights of the model.


@@ -176,7 +176,7 @@ class FlavaForPreTrainingOutput(ModelOutput):
The output of the [`FlavaTextModel`].
multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
-multimodal_masked_output (`BaseModelOutputWithPooling`, returned when `input_ids_masked` and `pixel_values` are present):
+multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
The output of the [`FlavaMultimodalModel`].
mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):


@@ -651,7 +651,7 @@ class FNetForPreTraining(FNetPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@@ -501,9 +501,9 @@ class FSMTEncoder(nn.Module):
BaseModelOutput or Tuple comprised of:
- **x** (`torch.Tensor`): the last encoder layer's output of shape *(src_len, batch, embed_dim)*
-- **encoder_states** (`Tuple(torch.FloatTensor`)): all intermediate hidden states of shape *(src_len,
+- **encoder_states** (`Tuple(torch.FloatTensor)`): all intermediate hidden states of shape *(src_len,
batch, embed_dim)*. Only populated if *output_hidden_states:* is True.
-- **all_attentions** (`Tuple(torch.FloatTensor`)): Attention weights for each layer.
+- **all_attentions** (`Tuple(torch.FloatTensor)`): Attention weights for each layer.
During training might not be of length n_layers because of layer dropout.
"""
# check attention mask and invert


@@ -839,7 +839,7 @@ PARALLELIZE_DOCSTRING = r"""
it will evenly distribute blocks across all devices.
Args:
-device_map (`Dict[int, list]`, optional, defaults to None):
+device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
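
A sketch of what such a `device_map` looks like; the block indices are hypothetical for a 12-block model on two GPUs, with device 0 deliberately given fewer blocks since it also hosts the embeddings and LM head, as the docstring advises:

```python
device_map = {
    0: [0, 1, 2, 3, 4],
    1: [5, 6, 7, 8, 9, 10, 11],
}
model.parallelize(device_map)  # `model` assumed to be one of the models documented here
```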


@@ -587,7 +587,7 @@ PARALLELIZE_DOCSTRING = r"""
across all devices.
Args:
-device_map (`Dict[int, list]`, optional, defaults to None):
+device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the GPT-J models have the


@@ -892,7 +892,7 @@ class IBertForMaskedLM(IBertPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@@ -165,7 +165,7 @@ class IdeficsConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.
Args:
-additional_vocab_size (`int`, *optional`, defaults to 0):
+additional_vocab_size (`int`, *optional*, defaults to 0):
Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
are always trainable whereas regular vocab tokens can be frozen or not.
vocab_size (`int`, *optional*, defaults to 32000):


@@ -97,7 +97,7 @@ def load_balancing_loss_func(
router_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts].
-attention_mask (`torch.Tensor`, None):
+attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function
shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*):


@@ -69,7 +69,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts].
-attention_mask (`torch.Tensor`, None):
+attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function
shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*):
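
These `load_balancing_loss_func` docstrings all describe the same Switch-Transformers-style auxiliary loss; a minimal sketch of that computation, assuming the router/gate logits are already concatenated to shape `(batch * seq_len, num_experts)` and ignoring the optional attention mask (not the exact library implementation):

```python
import torch
import torch.nn.functional as F

def load_balancing_loss(router_logits: torch.Tensor, num_experts: int, top_k: int) -> torch.Tensor:
    probs = F.softmax(router_logits, dim=-1)                # (tokens, experts)
    _, selected = torch.topk(probs, top_k, dim=-1)          # chosen expert ids per token
    expert_mask = F.one_hot(selected, num_experts).float()  # (tokens, top_k, experts)
    tokens_per_expert = expert_mask.mean(dim=(0, 1))        # fraction routed to each expert
    router_prob_per_expert = probs.mean(dim=0)              # mean router probability
    return num_experts * torch.sum(tokens_per_expert * router_prob_per_expert)
```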


@@ -133,7 +133,7 @@ class Kosmos2Processor(ProcessorMixin):
Args:
bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
The bounding bboxes associated to `texts`.
-num_image_tokens (`int`, defaults to 64):
+num_image_tokens (`int`, *optional* defaults to 64):
The number of (consecutive) places that are used to mark the placeholders to store image information.
This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
first_image_token_id (`int`, *optional*):


@@ -79,7 +79,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
Calculate the number of patches after the preprocessing for images of any resolution.
Args:
-image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]):
+image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
The size of the input image in the format (height, width). ?
grid_pinpoints (`List`):
A list containing possible resolutions. Each item in the list should be a tuple or list


@@ -85,7 +85,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
Calculate the number of patches after the preprocessing for images of any resolution.
Args:
-image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]):
+image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
The size of the input image in the format (height, width). ?
grid_pinpoints (`List`):
A list containing possible resolutions. Each item in the list should be a tuple or list


@@ -1790,7 +1790,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@@ -1810,7 +1810,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross(masked)-attention of the decoder.
-feature_size_list (`List[torch.Size]` ):
+feature_size_list (`List[torch.Size]`):
This is a list containing shapes (height & width) of multi-scale features from the Pixel Decoder.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under


@@ -1049,7 +1049,7 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@@ -84,7 +84,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts].
-attention_mask (`torch.Tensor`, None):
+attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function
shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*):


@@ -67,7 +67,7 @@ PARALLELIZE_DOCSTRING = r"""
it will evenly distribute blocks across all devices.
Args:
-device_map (`Dict[int, list]`, optional, defaults to None):
+device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the


@@ -1160,7 +1160,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
Args:
outputs ([`OneFormerForUniversalSegmentationOutput`]):
The outputs from [`OneFormerForUniversalSegmentationOutput`].
-task_type (`str`, *optional)*, defaults to "instance"):
+task_type (`str`, *optional*, defaults to "instance"):
The post processing depends on the task token input. If the `task_type` is "panoptic", we need to
ignore the stuff predictions.
is_demo (`bool`, *optional)*, defaults to `True`):


@@ -117,7 +117,7 @@ def _preprocess_resize_output_shape(image, output_shape):
channels is preserved.
Returns
-image (`np.ndarray):
+image (`np.ndarray`):
The input image, but with additional singleton dimensions appended in the case where `len(output_shape) >
input.ndim`.
output_shape (`Tuple`):

View File

@ -162,7 +162,7 @@ class PatchTSMixerNormLayer(nn.Module):
"""Normalization block """Normalization block
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
@ -234,7 +234,7 @@ class PatchTSMixerChannelFeatureMixerBlock(nn.Module):
"""This module mixes the features in the channel dimension. """This module mixes the features in the channel dimension.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
@ -441,7 +441,7 @@ class PatchMixerBlock(nn.Module):
"""This module mixes the patch dimension. """This module mixes the patch dimension.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
@ -510,7 +510,7 @@ class FeatureMixerBlock(nn.Module):
"""This module mixes the hidden feature dimension. """This module mixes the hidden feature dimension.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
@ -556,7 +556,7 @@ class PatchTSMixerLayer(nn.Module):
The `PatchTSMixer` layer that does all three kinds of mixing. The `PatchTSMixer` layer that does all three kinds of mixing.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
@ -593,7 +593,7 @@ class PatchTSMixerBlock(nn.Module):
"""The main computing framework of the `PatchTSMixer` model. """The main computing framework of the `PatchTSMixer` model.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
@ -634,7 +634,8 @@ class PatchTSMixerForPredictionHead(nn.Module):
"""Prediction Head for Forecasting """Prediction Head for Forecasting
Args: Args:
config (`PatchTSMixerConfig`, *required*): Configuration. config (`PatchTSMixerConfig`):
Configuration.
""" """
def __init__(self, config: PatchTSMixerConfig, distribution_output=None): def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
@ -689,8 +690,8 @@ class PatchTSMixerLinearHead(nn.Module):
"""Linear head for Classification and Regression. """Linear head for Classification and Regression.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration.
""" """
def __init__(self, config: PatchTSMixerConfig, distribution_output=None): def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
@ -785,7 +786,7 @@ class PatchTSMixerPretrainHead(nn.Module):
"""Pretraining head. """Pretraining head.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
@ -1189,7 +1190,7 @@ class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel):
Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings. Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
@ -1411,7 +1412,7 @@ class PatchTSMixerForPretraining(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for mask pretraining. `PatchTSMixer` for mask pretraining.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
@ -1593,7 +1594,7 @@ class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for forecasting application. `PatchTSMixer` for forecasting application.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
@ -1826,7 +1827,7 @@ class PatchTSMixerForTimeSeriesClassification(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for classification application. `PatchTSMixer` for classification application.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
@ -1997,7 +1998,7 @@ class PatchTSMixerForRegression(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for regression application. `PatchTSMixer` for regression application.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
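Since every module above takes the same configuration object, one instantiation covers them all (argument values are illustrative):

```python
from transformers import PatchTSMixerConfig

# A single config parameterizes the encoder, the mixer blocks, and the task heads.
config = PatchTSMixerConfig(context_length=32, patch_length=8, num_input_channels=3)
```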

View File

@ -258,7 +258,7 @@ class PersimmonAttention(nn.Module):
storage as `fused_qkv` storage as `fused_qkv`
Args: Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns: Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
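A minimal sketch of the split described above, using only the documented shapes (the real logic lives in the Persimmon modeling file):

```python
import torch

batch_size, seq_length, num_heads, head_dim = 2, 5, 4, 8
fused_qkv = torch.randn(batch_size, seq_length, num_heads * 3 * head_dim)

# View as [batch_size, seq_length, num_heads, 3, head_dim], then peel off q/k/v.
query, key, value = fused_qkv.view(batch_size, seq_length, num_heads, 3, head_dim).unbind(dim=3)
print(query.shape)  # torch.Size([2, 5, 4, 8])
```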

View File

@ -75,7 +75,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]]): gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts]. shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None): attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function The attention_mask used in forward function
shape [batch_size X sequence_length] if not None. shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*): num_experts (`int`, *optional*):

View File

@ -792,7 +792,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
@ -1261,7 +1261,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:

View File

@ -886,7 +886,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
@ -1400,7 +1400,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:

View File

@ -1073,7 +1073,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@ -1075,7 +1075,7 @@ class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@ -1151,7 +1151,7 @@ class RoCBertForPreTraining(RoCBertPreTrainedModel):
ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
config.vocab_size]` config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
Returns: Returns:

View File

@ -59,7 +59,7 @@ class SegGptEncoderOutput(ModelOutput):
attentions (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.output_attentions=True`): attentions (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.output_attentions=True`):
Tuple of *torch.FloatTensor* (one for each layer) of shape Tuple of *torch.FloatTensor* (one for each layer) of shape
`(batch_size, num_heads, seq_len, seq_len)`. `(batch_size, num_heads, seq_len, seq_len)`.
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.intermediate_hidden_state_indices` is set): intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`. Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`. Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
Additionally, each feature passes through a LayerNorm. Additionally, each feature passes through a LayerNorm.
@ -77,7 +77,7 @@ class SegGptImageSegmentationOutput(ModelOutput):
Output type of [`SegGptImageSegmentationOutput`]. Output type of [`SegGptImageSegmentationOutput`].
Args: Args:
loss (`torch.FloatTensor`, `optional`, returned when `labels` is provided): loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
The loss value. The loss value.
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
The predicted masks. The predicted masks.

View File

@ -745,10 +745,10 @@ class DisentangledSelfAttention(nn.Module):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j* sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token. th token.
output_attentions (`bool`, optional): output_attentions (`bool`, *optional*):
Whether to return the attention matrix. Whether to return the attention matrix.
query_states (`torch.FloatTensor`, optional): query_states (`torch.FloatTensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*. The *Q* state in *Attention(Q,K,V)*.
relative_pos (`torch.LongTensor`): relative_pos (`torch.LongTensor`):

View File

@ -220,7 +220,7 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
sampling_rate (`int`, *optional*): sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors. `sampling_rate` at the forward call to prevent silent errors.
padding_value (`float`, defaults to 0.0): padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values / vectors. The value that is used to fill the padding values / vectors.
""" """

View File

@ -181,7 +181,7 @@ PARALLELIZE_DOCSTRING = r"""
it will evenly distribute blocks across all devices. it will evenly distribute blocks across all devices.
Args: Args:
device_map (`Dict[int, list]`, optional, defaults to None): device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the t5 models have the have fewer attention modules mapped to it than other devices. For reference, the t5 models have the

View File

@ -1249,7 +1249,7 @@ class TapasTokenizer(PreTrainedTokenizer):
Total number of table columns Total number of table columns
max_length (`int`): max_length (`int`):
Total maximum length. Total maximum length.
truncation_strategy (`str` or [`TapasTruncationStrategy]`): truncation_strategy (`str` or [`TapasTruncationStrategy`]):
Truncation strategy to use. Seeing as this method should only be called when truncating, the only Truncation strategy to use. Seeing as this method should only be called when truncating, the only
available strategy is the `"drop_rows_to_fit"` strategy. available strategy is the `"drop_rows_to_fit"` strategy.

View File

@ -833,7 +833,7 @@ class UdopTokenizer(PreTrainedTokenizer):
</Tip> </Tip>
Args: Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)): text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method). method).

View File

@ -814,7 +814,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
</Tip> </Tip>
Args: Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)): text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method). method).

View File

@ -243,7 +243,7 @@ class ViltImageProcessor(BaseImageProcessor):
Image to resize. Image to resize.
size (`Dict[str, int]`): size (`Dict[str, int]`):
Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
size_divisor (`int`, defaults to 32): size_divisor (`int`, *optional*, defaults to 32):
The image is resized to a size that is a multiple of this value. The image is resized to a size that is a multiple of this value.
resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image. Resampling filter to use when resizing the image.
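A rough sketch of the `size_divisor` constraint, assuming simple floor-rounding (check the processor source for the exact rule):

```python
# Round resized dimensions down to the nearest multiple of size_divisor.
size_divisor = 32
height, width = 500, 613
height, width = (height // size_divisor) * size_divisor, (width // size_divisor) * size_divisor
print(height, width)  # 480 608 -- both multiples of 32
```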

View File

@ -182,7 +182,7 @@ def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_siz
Relative position embeddings (Lw, num_channels) for width axis. Relative position embeddings (Lw, num_channels) for width axis.
q_size (`Tuple[int]`): q_size (`Tuple[int]`):
Spatial sequence size of query q with (queries_height, queries_width). Spatial sequence size of query q with (queries_height, queries_width).
k_size (`Tuple[int]`]): k_size (`Tuple[int]`):
Spatial sequence size of key k with (keys_height, keys_width). Spatial sequence size of key k with (keys_height, keys_width).
Returns: Returns:

View File

@ -36,11 +36,11 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
most of the main methods. Users should refer to this superclass for more information regarding those methods. most of the main methods. Users should refer to this superclass for more information regarding those methods.
Args: Args:
feature_size (`int`, defaults to 1): feature_size (`int`, *optional*, defaults to 1):
The feature dimension of the extracted features. The feature dimension of the extracted features.
sampling_rate (`int`, defaults to 16000): sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
padding_value (`float`, defaults to 0.0): padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values. The value that is used to fill the padding values.
do_normalize (`bool`, *optional*, defaults to `True`): do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
@ -166,7 +166,7 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
sampling_rate (`int`, *optional*): sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors. `sampling_rate` at the forward call to prevent silent errors.
padding_value (`float`, defaults to 0.0): padding_value (`float`, *optional*, defaults to 0.0):
""" """
if sampling_rate is not None: if sampling_rate is not None:
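Spelled out, the defaults documented above correspond to this instantiation:

```python
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,       # one float per sample for raw waveforms
    sampling_rate=16000,  # 16 kHz audio
    padding_value=0.0,
    do_normalize=True,
)
```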

View File

@ -184,9 +184,9 @@ class Wav2Vec2ConformerConfig(PretrainedConfig):
If `"rotary"` position embeddings are used, defines the size of the embedding base. If `"rotary"` position embeddings are used, defines the size of the embedding base.
max_source_positions (`int`, *optional*, defaults to 5000): max_source_positions (`int`, *optional*, defaults to 5000):
if `"relative"` position embeddings are used, defines the maximum source input positions. if `"relative"` position embeddings are used, defines the maximum source input positions.
conv_depthwise_kernel_size (`int`, defaults to 31): conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
Kernel size of convolutional depthwise 1D layer in Conformer blocks. Kernel size of convolutional depthwise 1D layer in Conformer blocks.
conformer_conv_dropout (`float`, defaults to 0.1): conformer_conv_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all convolutional layers in Conformer blocks. The dropout probability for all convolutional layers in Conformer blocks.
Example: Example:
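For the two arguments touched in this hunk, a minimal instantiation might look like:

```python
from transformers import Wav2Vec2ConformerConfig

config = Wav2Vec2ConformerConfig(
    conv_depthwise_kernel_size=31,  # documented default
    conformer_conv_dropout=0.1,     # documented default
)
```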

View File

@ -44,16 +44,16 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
Fourier Transform` which should match pytorch's `torch.stft` equivalent. Fourier Transform` which should match pytorch's `torch.stft` equivalent.
Args: Args:
feature_size (`int`, defaults to 80): feature_size (`int`, *optional*, defaults to 80):
The feature dimension of the extracted features. The feature dimension of the extracted features.
sampling_rate (`int`, defaults to 16000): sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
hop_length (`int`, defaults to 160): hop_length (`int`, *optional*, defaults to 160):
Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients. Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
chunk_length (`int`, defaults to 30): chunk_length (`int`, *optional*, defaults to 30):
The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
sequences. sequences.
n_fft (`int`, defaults to 400): n_fft (`int`, *optional*, defaults to 400):
Size of the Fourier transform. Size of the Fourier transform.
padding_value (`float`, *optional*, defaults to 0.0): padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio. Should correspond to silences. Padding value used to pad the audio. Should correspond to silences.
@ -231,7 +231,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
pipeline. pipeline.
padding_value (`float`, defaults to 0.0): padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values / vectors. The value that is used to fill the padding values / vectors.
do_normalize (`bool`, *optional*, defaults to `False`): do_normalize (`bool`, *optional*, defaults to `False`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
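The documented defaults fit together: 30-second chunks of 16 kHz audio with a hop of 160 samples yield 3000 log-mel frames, matching the `num_segment_frames` default in the generation code below.

```python
from transformers import WhisperFeatureExtractor

fe = WhisperFeatureExtractor(
    feature_size=80, sampling_rate=16000, hop_length=160, chunk_length=30, n_fft=400
)
# Frames per chunk: chunk_length * sampling_rate / hop_length
print(30 * 16000 // 160)  # 3000
```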

View File

@ -1368,7 +1368,7 @@ class WhisperGenerationMixin:
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation. default values, whose documentation should be checked to parameterize generation.
num_segment_frames (`int`, defaults to 3000): num_segment_frames (`int`, *optional*, defaults to 3000):
The number of log-mel frames the model expects The number of log-mel frames the model expects
Return: Return:

View File

@ -565,7 +565,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
Args: Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method. List of tokenized input ids. Can be obtained using the `__call__` method.
time_precision (`float`, `optional`, defaults to 0.02): time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time. The time ratio to convert from token to time.
""" """
offsets = [] offsets = []
@ -615,7 +615,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache. Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.
Args: Args:
time_precision (`float`, `optional`, defaults to 0.02): time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time. The time ratio to convert from token to time.
""" """
return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)]) return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
@ -671,7 +671,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
output_offsets (`bool`, *optional*, defaults to `False`): output_offsets (`bool`, *optional*, defaults to `False`):
Whether or not to output the offsets of the tokens. This should only be set if the model predicted Whether or not to output the offsets of the tokens. This should only be set if the model predicted
timestamps. timestamps.
time_precision (`float`, `optional`, defaults to 0.02): time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time. The time ratio to convert from token to time.
decode_with_timestamps (`bool`, *optional*, defaults to `False`): decode_with_timestamps (`bool`, *optional*, defaults to `False`):
Whether or not to decode with timestamps included in the raw text. Whether or not to decode with timestamps included in the raw text.
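The `time_precision` arithmetic is visible in the cached-token line above: with the 0.02 default, the 1501 timestamp tokens span `<|0.00|>` through `<|30.00|>`.

```python
time_precision = 0.02
timestamp_tokens = ["<|%.2f|>" % (i * time_precision) for i in range(1500 + 1)]
print(timestamp_tokens[0], timestamp_tokens[1], timestamp_tokens[-1])  # <|0.00|> <|0.02|> <|30.00|>
```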

View File

@ -207,7 +207,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
Args: Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method. List of tokenized input ids. Can be obtained using the `__call__` method.
time_precision (`float`, `optional`, defaults to 0.02): time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time. The time ratio to convert from token to time.
""" """
offsets = [] offsets = []
@ -258,7 +258,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache. Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.
Args: Args:
time_precision (`float`, `optional`, defaults to 0.02): time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time. The time ratio to convert from token to time.
""" """
return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)]) return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
@ -317,7 +317,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
output_offsets (`bool`, *optional*, defaults to `False`): output_offsets (`bool`, *optional*, defaults to `False`):
Whether or not to output the offsets of the tokens. This should only be set if the model predicted Whether or not to output the offsets of the tokens. This should only be set if the model predicted
timestamps. timestamps.
time_precision (`float`, `optional`, defaults to 0.02): time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time. The time ratio to convert from token to time.
decode_with_timestamps (`bool`, *optional*, defaults to `False`): decode_with_timestamps (`bool`, *optional*, defaults to `False`):
Whether or not to decode with timestamps included in the raw text. Whether or not to decode with timestamps included in the raw text.

View File

@ -1081,7 +1081,7 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@ -1039,7 +1039,7 @@ class XLMRobertaXLForMaskedLM(XLMRobertaXLPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@ -1173,7 +1173,7 @@ class XmodForMaskedLM(XmodPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@ -647,8 +647,9 @@ class YolosModel(YolosPreTrainedModel):
Prunes heads of the model. Prunes heads of the model.
Args: Args:
heads_to_prune (`dict` of {layer_num: list of heads to prune in this layer}): heads_to_prune (`dict`):
See base class `PreTrainedModel`. See base class `PreTrainedModel`. The input dictionary must have the following format: {layer_num:
list of heads to prune in this layer}
""" """
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads) self.encoder.layer[layer].attention.prune_heads(heads)
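For illustration, the expected dictionary shape (layer and head indices here are made up):

```python
# Prune heads 0 and 2 in layer 1, and head 3 in layer 4.
heads_to_prune = {1: [0, 2], 4: [3]}
# model.prune_heads(heads_to_prune)  # on a YolosModel, or any PreTrainedModel
```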

View File

@ -218,7 +218,7 @@ def infer_framework_load_model(
If both frameworks are installed and available for `model`, PyTorch is selected. If both frameworks are installed and available for `model`, PyTorch is selected.
Args: Args:
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`): model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
The model to infer the framework from. If `str`, a checkpoint name. The model to infer the framework from. If `str`, a checkpoint name.
config ([`AutoConfig`]): config ([`AutoConfig`]):
The config associated with the model to help using the correct class The config associated with the model to help using the correct class
@ -322,7 +322,7 @@ def infer_framework_from_model(
If both frameworks are installed and available for `model`, PyTorch is selected. If both frameworks are installed and available for `model`, PyTorch is selected.
Args: Args:
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`): model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
The model to infer the framework from. If `str`, a checkpoint name. The model to infer the framework from. If `str`, a checkpoint name.
model_classes (dictionary `str` to `type`, *optional*): model_classes (dictionary `str` to `type`, *optional*):
A mapping framework to class. A mapping framework to class.
@ -349,7 +349,7 @@ def get_framework(model, revision: Optional[str] = None):
Select framework (TensorFlow or PyTorch) to use. Select framework (TensorFlow or PyTorch) to use.
Args: Args:
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`): model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
If both frameworks are installed, picks the one corresponding to the model passed (either a model class or If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
the model name). If no specific model is provided, defaults to using PyTorch. the model name). If no specific model is provided, defaults to using PyTorch.
""" """
@ -385,7 +385,7 @@ def get_default_model_and_revision(
Select a default model to use for a given task. Defaults to pytorch if ambiguous. Select a default model to use for a given task. Defaults to pytorch if ambiguous.
Args: Args:
targeted_task (`Dict` ): targeted_task (`Dict`):
Dictionary representing the given task, that should contain default models Dictionary representing the given task, that should contain default models
framework (`str`, None) framework (`str`, None)

View File

@ -22,7 +22,7 @@ logger = logging.get_logger(__name__)
@add_end_docstrings( @add_end_docstrings(
build_pipeline_init_args(has_tokenizer=True), build_pipeline_init_args(has_tokenizer=True),
r""" r"""
top_k (`int`, defaults to 5): top_k (`int`, *optional*, defaults to 5):
The number of predictions to return. The number of predictions to return.
targets (`str` or `List[str]`, *optional*): targets (`str` or `List[str]`, *optional*):
When passed, the model will limit the scores to the passed targets instead of looking up in the whole When passed, the model will limit the scores to the passed targets instead of looking up in the whole
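Usage of the argument above, assuming a standard fill-mask checkpoint (the model name is only an example):

```python
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="distilroberta-base")
# Return the 3 best predictions instead of the default 5:
outputs = fill_mask("Paris is the <mask> of France.", top_k=3)
print([o["token_str"] for o in outputs])
```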

View File

@ -31,7 +31,7 @@ class PipelineIterator(IterableDataset):
``` ```
Arguments: Arguments:
loader (`torch.utils.data.DataLoader` or any iterator): loader (`torch.utils.data.DataLoader` or `Iterable`):
The iterator that will be used to apply `infer` on. The iterator that will be used to apply `infer` on.
infer (any function): infer (any function):
The function to apply to each element of `loader`. The function to apply to each element of `loader`.
@ -163,7 +163,7 @@ class PipelineChunkIterator(PipelineIterator):
``` ```
Arguments: Arguments:
loader (`torch.utils.data.DataLoader` or any iterator): loader (`torch.utils.data.DataLoader` or `Iterable`):
The iterator that will be used to apply `infer` on. The iterator that will be used to apply `infer` on.
infer (any function): infer (any function):
The function to apply to each element of `loader`. The function to apply to each element of `loader`.
@ -224,7 +224,7 @@ class PipelinePackIterator(PipelineIterator):
``` ```
Arguments: Arguments:
loader (`torch.utils.data.DataLoader` or any iterator): loader (`torch.utils.data.DataLoader` or `Iterable`):
The iterator that will be used to apply `infer` on. The iterator that will be used to apply `infer` on.
infer (any function): infer (any function):
The function to apply to each element of `loader`. The function to apply to each element of `loader`.
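A minimal usage sketch of the iterator described above, using a plain list as the loader (which the docstring permits):

```python
from transformers.pipelines.pt_utils import PipelineIterator

def infer(item, multiplier=1):
    return item * multiplier

# `params` is forwarded to `infer` on every element.
iterator = PipelineIterator([1, 2, 3], infer, {"multiplier": 10})
print(list(iterator))  # [10, 20, 30]
```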

View File

@ -3200,7 +3200,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
</Tip> </Tip>
Args: Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)): text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method). method).

View File

@ -745,7 +745,7 @@ class Trainer:
Add a callback to the current list of [`~transformers.TrainerCallback`]. Add a callback to the current list of [`~transformers.TrainerCallback`].
Args: Args:
callback (`type` or [`~transformers.TrainerCallback]`): callback (`type` or [`~transformers.TrainerCallback`]):
A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
first case, will instantiate a member of that class. first case, will instantiate a member of that class.
""" """
@ -758,7 +758,7 @@ class Trainer:
If the callback is not found, returns `None` (and no error is raised). If the callback is not found, returns `None` (and no error is raised).
Args: Args:
callback (`type` or [`~transformers.TrainerCallback]`): callback (`type` or [`~transformers.TrainerCallback`]):
A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
first case, will pop the first member of that class found in the list of callbacks. first case, will pop the first member of that class found in the list of callbacks.
@ -772,7 +772,7 @@ class Trainer:
Remove a callback from the current list of [`~transformers.TrainerCallback`]. Remove a callback from the current list of [`~transformers.TrainerCallback`].
Args: Args:
callback (`type` or [`~transformers.TrainerCallback]`): callback (`type` or [`~transformers.TrainerCallback`]):
A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
first case, will remove the first member of that class found in the list of callbacks. first case, will remove the first member of that class found in the list of callbacks.
""" """

View File

@ -80,7 +80,7 @@ class Seq2SeqTrainer(Trainer):
Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments. Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments.
Args: Args:
gen_config_arg (`str` or [`~generation.GenerationConfig]`): gen_config_arg (`str` or [`~generation.GenerationConfig`]):
`Seq2SeqTrainingArguments.generation_config` argument. `Seq2SeqTrainingArguments.generation_config` argument.
Returns: Returns:

View File

@ -1605,7 +1605,7 @@ def direct_transformers_import(path: str, file="__init__.py") -> ModuleType:
Args: Args:
path (`str`): The path to the source file path (`str`): The path to the source file
file (`str`, optional): The file to join with the path. Defaults to "__init__.py". file (`str`, *optional*): The file to join with the path. Defaults to "__init__.py".
Returns: Returns:
`ModuleType`: The resulting imported module `ModuleType`: The resulting imported module