fix stupid kosmos2

Arthur 2025-07-03 13:17:18 +02:00
parent 0f3c368384
commit 3cba8ac3f3
3 changed files with 6 additions and 18 deletions

@@ -1011,7 +1011,6 @@ class Kosmos2TextTransformer(nn.Module):
         return hidden_states

     @can_return_tuple
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -1028,7 +1027,6 @@ class Kosmos2TextTransformer(nn.Module):
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1036,7 +1034,6 @@ class Kosmos2TextTransformer(nn.Module):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -1307,7 +1304,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
@@ -1340,7 +1336,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
             **kwargs,
         )
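Taken together, the Kosmos-2 hunks are pure plumbing removal: `return_dict` leaves the `forward` signatures, the config fallback, and the inner call, because the `@can_return_tuple` decorator already owns the ModelOutput-to-tuple conversion. Below is a minimal sketch of that decorator pattern, assuming a simplified version that takes `return_dict` as a wrapper-level kwarg (the real transformers decorator also falls back to the model config, which this toy omits):

```python
from dataclasses import astuple, dataclass
from functools import wraps


@dataclass
class DummyOutput:
    """Stand-in for a transformers ModelOutput."""
    hidden_states: int
    attentions: int


def can_return_tuple(forward):
    # Toy version of the pattern: intercept `return_dict` in the wrapper so
    # that forward() itself no longer needs the parameter at all.
    @wraps(forward)
    def wrapper(*args, return_dict=True, **kwargs):
        output = forward(*args, **kwargs)
        return output if return_dict else astuple(output)

    return wrapper


@can_return_tuple
def forward():
    return DummyOutput(hidden_states=1, attentions=2)


print(forward())                   # DummyOutput(hidden_states=1, attentions=2)
print(forward(return_dict=False))  # (1, 2)
```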

@@ -941,6 +941,9 @@ class MoonshineModel(MoonshinePreTrainedModel):
                 `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                 and conversion into a tensor of type `torch.FloatTensor`.
+            decoder_position_ids (<fill_type>):
+                <fill_docstring>
+
         Example:

         ```python
@@ -1058,6 +1061,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixin):
                 `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                 and conversion into a tensor of type `torch.FloatTensor`.
+            decoder_position_ids (<fill_type>):
+                <fill_docstring>

         Example:
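The Moonshine hunks only add docstring stubs: `<fill_type>` and `<fill_docstring>` are placeholder markers for an entry that still has to be written, left exactly as shown in the committed file. Purely as an illustration of what such an entry usually becomes, here is a hypothetical filled-in version modeled on how other transformers encoder-decoder models document `decoder_position_ids`; the shape and wording are assumptions, not part of this commit:

```python
# Hypothetical completed entry -- an assumption based on the library's usual
# wording for decoder position ids, NOT the text committed here.
DECODER_POSITION_IDS_DOC = r"""
    decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Indices of positions of each decoder input token in the position embeddings.
        Selected in the range `[0, config.max_position_embeddings - 1]`.
"""
```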

@@ -885,8 +885,6 @@ class SamHQMaskDecoder(nn.Module):
                 Whether to use only the high-quality token output or combine with SAM output.
             intermediate_embeddings (`torch.Tensor`):
                 Intermediate embeddings from the vision encoder for feature fusion.
-            output_attentions (bool, *optional*):
-                Whether or not to return the attentions tensors of all attention layers.
             attention_similarity (`torch.Tensor`, *optional*):
                 Optional tensor for attention similarity computation.
             target_embedding (`torch.Tensor`, *optional*):
@@ -1286,20 +1284,10 @@ class SamHQModel(SamHQPreTrainedModel):
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                 Input pixel values
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
         """
-        vision_output = self.vision_encoder(
-            pixel_values=pixel_values,
-        )
+        vision_output = self.vision_encoder(pixel_values=pixel_values)
         image_embeddings = vision_output[0]
         intermediate_embeddings = vision_output[1]
         return image_embeddings, intermediate_embeddings

     @torch.no_grad()
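The SAM-HQ hunks drop docstring entries for flags the code never forwards (`output_attentions`, `output_hidden_states`, `return_dict`) and collapse the vision-encoder call to a single line; the method's contract, returning a plain `(image_embeddings, intermediate_embeddings)` pair, is unchanged. A self-contained sketch of that contract with a toy encoder (every name except `vision_encoder` is illustrative, not the SAM-HQ API):

```python
import torch
import torch.nn as nn


class ToyVisionEncoder(nn.Module):
    """Stand-in for SAM-HQ's vision encoder: yields embeddings plus intermediates."""

    def forward(self, pixel_values: torch.Tensor):
        embeddings = pixel_values.mean(dim=(2, 3))  # fake image embeddings
        intermediates = [embeddings * 0.5]          # fake intermediate features
        return embeddings, intermediates


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.vision_encoder = ToyVisionEncoder()

    def get_image_embeddings(self, pixel_values):
        # Mirrors the simplified call site: pixel values only, no output flags.
        vision_output = self.vision_encoder(pixel_values=pixel_values)
        image_embeddings = vision_output[0]
        intermediate_embeddings = vision_output[1]
        return image_embeddings, intermediate_embeddings


model = ToyModel()
emb, inter = model.get_image_embeddings(torch.randn(1, 3, 8, 8))
print(emb.shape, len(inter))  # torch.Size([1, 3]) 1
```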