Cyril Vallez 2025-07-02 22:59:10 +02:00
parent fb8b32ef35
commit 98882f1353
21 changed files with 116 additions and 209 deletions

@@ -1674,21 +1674,7 @@ class DFineForObjectDetection(DFinePreTrainedModel):
return_dict: Optional[bool] = None,
**loss_kwargs,
) -> Union[tuple[torch.FloatTensor], DFineObjectDetectionOutput]:
r"""
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
can choose to directly pass a flattened representation of an image.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
embedded representation.
labels (`list[Dict]` of len `(batch_size,)`, *optional*):
Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
Examples:
"""
```python
>>> import torch
>>> from transformers.image_utils import load_image
@@ -1729,7 +1715,8 @@ class DFineForObjectDetection(DFinePreTrainedModel):
Detected cat with confidence 0.956 at location [11.71, 53.52, 316.64, 472.33]
Detected remote with confidence 0.947 at location [40.46, 73.7, 175.62, 117.57]
Detected sofa with confidence 0.918 at location [0.59, 1.88, 640.25, 474.74]
```"""
```
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
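
A minimal sketch of the `labels` format described in the docstring above; the tensor values and the box convention are illustrative assumptions.

```python
import torch

# One dict per image in the batch, each with "class_labels" (a LongTensor with one
# entry per box) and "boxes" (a FloatTensor of shape (num_boxes, 4)).
labels = [
    {
        "class_labels": torch.tensor([17, 75], dtype=torch.long),
        "boxes": torch.tensor(
            [[0.50, 0.50, 0.40, 0.60], [0.20, 0.30, 0.10, 0.10]], dtype=torch.float
        ),  # assumed normalized (center_x, center_y, width, height) boxes
    }
]
# When `labels` is passed together with `pixel_values`, the forward pass returns the
# bipartite matching loss alongside the usual detection outputs.
# outputs = model(pixel_values=pixel_values, labels=labels)
```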

@@ -797,11 +797,6 @@ class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python
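
A minimal sketch of the `-100` label convention described above (the same convention appears in the other `*ForCausalLM` docstrings touched below); the token ids and mask are illustrative assumptions.

```python
import torch

# Labels are usually the input ids themselves, with every position that should not
# contribute to the loss set to -100 (here: right-padding positions).
input_ids = torch.tensor([[2, 9544, 603, 671, 2872, 0, 0]])
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 0, 0]])

labels = input_ids.clone()
labels[attention_mask == 0] = -100  # ignored (masked) by the loss

# outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
# outputs.loss is then computed only over positions whose label lies in [0, vocab_size).
```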

@@ -746,7 +746,7 @@ class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMi
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
r"""
"""
Examples:
```python

@@ -1365,11 +1365,6 @@ class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -1530,11 +1530,6 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
**kwargs,
) -> Union[tuple, CausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -523,11 +523,6 @@ class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -548,11 +548,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -646,11 +646,6 @@ class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -1825,11 +1825,6 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -39,13 +39,13 @@ class Glm4vImagesKwargs(ImagesKwargs):
class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Glm4vImagesKwargs
videos_kwargs: Glm4vVideosProcessorKwargs
_defaults = {
"text_kwargs": {
"padding": False,
},
}
images_kwargs: Glm4vImagesKwargs
videos_kwargs: Glm4vVideosProcessorKwargs
class Glm4vProcessor(ProcessorMixin):
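
A minimal sketch of how the `_defaults` above interact with call-time arguments; the checkpoint id is an assumption and may differ from the one you use.

```python
from transformers import AutoProcessor

# Assumed checkpoint id -- substitute the GLM-4.1V checkpoint you actually use.
processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")

texts = ["a short prompt", "a somewhat longer prompt than the first one"]

# With nothing passed, `_defaults["text_kwargs"]["padding"] = False` applies.
unpadded = processor(text=texts)

# A keyword passed at call time overrides the class-level default.
padded = processor(text=texts, padding=True, return_tensors="pt")
```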

@@ -522,11 +522,6 @@ class HeliumForCausalLM(HeliumPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -1504,41 +1504,6 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
r"""
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
to serve as text prompt, which the Q-Former model will encode.
Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
details.
[What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
provided to serve as text prompt, which the language model can continue.
Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
details.
[What are input IDs?](../glossary#input-ids)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
Only relevant in case an encoder-decoder language model (like T5) is used.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size]`
Examples:
```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
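
A minimal sketch of how the two sets of prompt ids described above are produced and consumed; the checkpoint id, the dummy clip, and the exact keyword for passing the video are assumptions that may vary by version.

```python
import numpy as np
from transformers import InstructBlipVideoProcessor

# Assumed checkpoint id -- substitute the InstructBlipVideo checkpoint you use.
processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

clip = np.random.randint(0, 255, size=(4, 224, 224, 3), dtype=np.uint8)  # dummy 4-frame video
prompt = "What is happening in this video?"

# Depending on the transformers version the clip is passed as `images=` or `videos=`.
inputs = processor(images=clip, text=prompt, return_tensors="pt")

# The prompt is tokenized once per tower: `input_ids`/`attention_mask` feed the language
# model, while `qformer_input_ids`/`qformer_attention_mask` feed the Q-Former, and
# `labels` can reuse `input_ids` with -100 on positions to ignore.
# outputs = model(**inputs, labels=inputs["input_ids"])
```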

@@ -904,11 +904,6 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, InternVLCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -223,12 +223,11 @@ def apply_rotary_pos_emb_vision(
class MLCDAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper
Multi-headed attention with RoPE. Refer to papers:
- Attention is all you need:
https://huggingface.co/papers/1706.03762
- RoFormer: Enhanced Transformer with Rotary Position Embedding:
https://huggingface.co/papers/2104.09864
"""Multi-headed attention with RoPE. Refer to papers:
- Attention is all you need:
https://huggingface.co/papers/1706.03762
- RoFormer: Enhanced Transformer with Rotary Position Embedding:
https://huggingface.co/papers/2104.09864
"""
def __init__(self, config: MLCDVisionConfig):
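
A minimal sketch of the rotary position embedding the docstring refers to, using the usual `rotate_half` formulation; this is an illustration, not the module's exact implementation.

```python
import torch

def rotate_half(x):
    # Split the last dimension into two halves and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin):
    # RoPE mixes each vector with its rotated half using position-dependent angles.
    q_rot = (q * cos) + (rotate_half(q) * sin)
    k_rot = (k * cos) + (rotate_half(k) * sin)
    return q_rot, k_rot
```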

@@ -221,8 +221,6 @@ class SamHQMaskDecoderConfig(PretrainedConfig):
The dimensionality of the hidden states in the IoU head module.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
vit_dim (`int`, *optional*, defaults to 768):
Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
"""

@@ -71,8 +71,37 @@ class SamHQVisionConfig(SamVisionConfig):
class SamHQMaskDecoderConfig(SamMaskDecoderConfig):
r"""
vit_dim (`int`, *optional*, defaults to 768):
Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
This is the configuration class to store the configuration of a [`SamHQMaskDecoder`]. It is used to instantiate a SAM_HQ
mask decoder to the specified arguments, defining the model architecture. Instantiating a configuration defaults
will yield a similar configuration to that of the SAM_HQ-vit-h
[facebook/sam_hq-vit-huge](https://huggingface.co/facebook/sam_hq-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden states.
hidden_act (`str`, *optional*, defaults to `"relu"`):
The non-linear activation function used inside the `SamHQMaskDecoder` module.
mlp_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 2):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
attention_downsample_rate (`int`, *optional*, defaults to 2):
The downsampling rate of the attention layer.
num_multimask_outputs (`int`, *optional*, defaults to 3):
The number of outputs from the `SamHQMaskDecoder` module. In the Segment Anything paper, this is set to 3.
iou_head_depth (`int`, *optional*, defaults to 3):
The number of layers in the IoU head module.
iou_head_hidden_dim (`int`, *optional*, defaults to 256):
The dimensionality of the hidden states in the IoU head module.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
vit_dim (`int`, *optional*, defaults to 768):
Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
"""
def __init__(
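
A minimal sketch of instantiating this config with a non-default `vit_dim`; the top-level import is an assumption and may need adjusting to your transformers version.

```python
from transformers import SamHQMaskDecoderConfig  # assumed export location

# All other arguments keep the defaults listed in the docstring above.
decoder_config = SamHQMaskDecoderConfig(vit_dim=1024)
print(decoder_config.hidden_size, decoder_config.num_multimask_outputs, decoder_config.vit_dim)
# expected: 256 3 1024
```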

@@ -877,16 +877,6 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, SmolVLMCausalLMOutputWithPast]:
r"""
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
Mask to avoid performing attention on padding pixel indices.
image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
The hidden states of the image encoder after modality projection.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `SmolVLMForConditionalGeneration`).
Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python
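
A minimal sketch of the label convention described above, assuming `inputs` is a batch returned by the SmolVLM processor and `model` is a `SmolVLMForConditionalGeneration` instance.

```python
# Labels are usually the input ids themselves; positions holding `model.image_token_id`
# are ignored by the loss, as the docstring above describes, so they need no manual masking.
labels = inputs["input_ids"].clone()

# Optionally hide the prompt from the loss as well (prompt_length is a hypothetical value).
# labels[:, :prompt_length] = -100

outputs = model(**inputs, labels=labels)
loss = outputs.loss
```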

@@ -26,81 +26,80 @@ from ...configuration_utils import PretrainedConfig, layer_type_validation
class T5GemmaModuleConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the T5GemmaModule-7B.
e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`T5GemmaModuleModel`]
hidden_size (`int`, *optional*, defaults to 2304):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 9216):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 26):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*, defaults to 4):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
head_dim (`int`, *optional*, defaults to 256):
The attention head dimension.
hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
max_position_embeddings (`int`, *optional*, defaults to 8192):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
eos_token_id (`int`, *optional*, defaults to 1):
End of stream token id.
bos_token_id (`int`, *optional*, defaults to 2):
Beginning of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
scaling factor used on the attention scores
sliding_window (`int`, *optional*, defaults to 4096):
in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
layer_types (`list`, *optional*):
Attention pattern for each layer.
final_logit_softcapping (`float`, *optional*, defaults to 30.0):
scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
scaling factor when applying tanh softcapping on the attention scores.
This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the T5GemmaModule-7B.
e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`T5GemmaModuleModel`]
hidden_size (`int`, *optional*, defaults to 2304):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 9216):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 26):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*, defaults to 4):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
head_dim (`int`, *optional*, defaults to 256):
The attention head dimension.
hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
max_position_embeddings (`int`, *optional*, defaults to 8192):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
eos_token_id (`int`, *optional*, defaults to 1):
End of stream token id.
bos_token_id (`int`, *optional*, defaults to 2):
Beginning of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
scaling factor used on the attention scores
sliding_window (`int`, *optional*, defaults to 4096):
in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
layer_types (`list`, *optional*):
Attention pattern for each layer.
final_logit_softcapping (`float`, *optional*, defaults to 30.0):
scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
scaling factor when applying tanh softcapping on the attention scores.
```python
>>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
>>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
>>> configuration = T5GemmaModuleConfig()
>>> # Initializing a model from the t5_gemma_module-7b style configuration
>>> model = T5GemmaModuleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
Module config (encoder or decoder): the same as Gemma2Config."""
```python
>>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
>>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
>>> configuration = T5GemmaModuleConfig()
>>> # Initializing a model from the t5_gemma_module-7b style configuration
>>> model = T5GemmaModuleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "t5_gemma_module"
keys_to_ignore_at_inference = ["past_key_values"]
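
A small worked example of the Grouped Query Attention arithmetic that the `num_key_value_heads` description spells out, using the defaults listed above.

```python
# Defaults taken from the docstring above.
num_attention_heads = 8
num_key_value_heads = 4

# Each key/value head serves a group of query heads.
queries_per_kv_head = num_attention_heads // num_key_value_heads  # -> 2 (GQA)

# num_key_value_heads == num_attention_heads would give 1 query per KV head (MHA);
# num_key_value_heads == 1 would give 8 queries per KV head (MQA).
```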

@@ -65,10 +65,7 @@ logger = logging.get_logger(__name__)
class T5GemmaModuleConfig(Gemma2Config):
"""Module config (encoder or decoder): the same as Gemma2Config."""
def __init__(self, **super_kwargs):
super().__init__(**super_kwargs)
pass
class T5GemmaConfig(PretrainedConfig):
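
A minimal sketch of what the simplified class amounts to: it keeps its own name and docstring but adds nothing on top of `Gemma2Config` (the standalone definition below is illustrative).

```python
from transformers import Gemma2Config

class T5GemmaModuleConfig(Gemma2Config):
    """Module config (encoder or decoder): the same as Gemma2Config."""
    pass

# Every Gemma2 default carries over unchanged.
assert T5GemmaModuleConfig().hidden_size == Gemma2Config().hidden_size
```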

@@ -319,17 +319,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
class Zamba2Attention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
and "Generating Long Sequences with Sparse Transformers".
Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
(see fig. 2 in https://huggingface.co/papers/2405.16712).
Additionally, replaced
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
Multi-headed attention from 'Attention Is All You Need' paper.
Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
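
A minimal sketch of the head sizing and the adjusted scaling that the removed docstring spells out; the tensor shapes and names are illustrative.

```python
import math
import torch

batch, num_heads, seq_len, hidden_size = 1, 8, 16, 512

# The attention input concatenates the hidden states with the previous mamba layer's
# output, hence attention_hidden_size = 2 * hidden_size.
attention_hidden_size = 2 * hidden_size
head_dim = attention_hidden_size // num_heads

query_states = torch.randn(batch, num_heads, seq_len, head_dim)
key_states = torch.randn(batch, num_heads, seq_len, head_dim)

# Scaling uses sqrt(head_dim / 2) instead of the usual sqrt(head_dim).
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(head_dim / 2)
```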

@@ -23,7 +23,6 @@ FILES_TO_PARSE = [
os.path.join(MODEL_ROOT, "rt_detr", "modular_rt_detr.py"),
os.path.join(MODEL_ROOT, "qwen2", "modular_qwen2.py"),
os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3.py"),
os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3_moe.py"),
os.path.join(MODEL_ROOT, "llava_next_video", "modular_llava_next_video.py"),
os.path.join(MODEL_ROOT, "cohere2", "modular_cohere2.py"),
os.path.join(MODEL_ROOT, "modernbert", "modular_modernbert.py"),