fix stupid kosmos2

Arthur 2025-07-03 13:17:18 +02:00
parent 0f3c368384
commit 3cba8ac3f3
3 changed files with 6 additions and 18 deletions

@@ -1011,7 +1011,6 @@ class Kosmos2TextTransformer(nn.Module):
         return hidden_states

     @can_return_tuple
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -1028,7 +1027,6 @@ class Kosmos2TextTransformer(nn.Module):
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1036,7 +1034,6 @@ class Kosmos2TextTransformer(nn.Module):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -1307,7 +1304,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
@@ -1340,7 +1336,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
             **kwargs,
         )
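Taken together, the Kosmos-2 hunks are pure plumbing removal: `return_dict` leaves the `forward` signatures, the config fallback, and the inner call, because the `@can_return_tuple` decorator already owns the ModelOutput-to-tuple conversion. Below is a minimal sketch of that decorator pattern, assuming a simplified version that takes `return_dict` as a wrapper-level kwarg (the real transformers decorator also falls back to the model config, which this toy omits):

```python
from dataclasses import astuple, dataclass
from functools import wraps


@dataclass
class DummyOutput:
    """Stand-in for a transformers ModelOutput."""
    hidden_states: int
    attentions: int


def can_return_tuple(forward):
    # Toy version of the pattern: intercept `return_dict` in the wrapper so
    # that forward() itself no longer needs the parameter at all.
    @wraps(forward)
    def wrapper(*args, return_dict=True, **kwargs):
        output = forward(*args, **kwargs)
        return output if return_dict else astuple(output)

    return wrapper


@can_return_tuple
def forward():
    return DummyOutput(hidden_states=1, attentions=2)


print(forward())                   # DummyOutput(hidden_states=1, attentions=2)
print(forward(return_dict=False))  # (1, 2)
```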

@@ -941,6 +941,9 @@ class MoonshineModel(MoonshinePreTrainedModel):
                 `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                 and conversion into a tensor of type `torch.FloatTensor`.
+            decoder_position_ids (<fill_type>):
+                <fill_docstring>
+
         Example:

         ```python
@@ -1058,6 +1061,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixin):
                 `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                 and conversion into a tensor of type `torch.FloatTensor`.
+            decoder_position_ids (<fill_type>):
+                <fill_docstring>

         Example:
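The Moonshine hunks only add docstring stubs: `<fill_type>` and `<fill_docstring>` are placeholder markers for an entry that still has to be written, left exactly as shown in the committed file. Purely as an illustration of what such an entry usually becomes, here is a hypothetical filled-in version modeled on how other transformers encoder-decoder models document `decoder_position_ids`; the shape and wording are assumptions, not part of this commit:

```python
# Hypothetical completed entry -- an assumption based on the library's usual
# wording for decoder position ids, NOT the text committed here.
DECODER_POSITION_IDS_DOC = r"""
    decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Indices of positions of each decoder input token in the position embeddings.
        Selected in the range `[0, config.max_position_embeddings - 1]`.
"""
```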

@@ -885,8 +885,6 @@ class SamHQMaskDecoder(nn.Module):
                 Whether to use only the high-quality token output or combine with SAM output.
             intermediate_embeddings (`torch.Tensor`):
                 Intermediate embeddings from the vision encoder for feature fusion.
-            output_attentions (bool, *optional*):
-                Whether or not to return the attentions tensors of all attention layers.
             attention_similarity (`torch.Tensor`, *optional*):
                 Optional tensor for attention similarity computation.
             target_embedding (`torch.Tensor`, *optional*):
@@ -1286,20 +1284,10 @@ class SamHQModel(SamHQPreTrainedModel):
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                 Input pixel values
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
         """
-        vision_output = self.vision_encoder(
-            pixel_values=pixel_values,
-        )
+        vision_output = self.vision_encoder(pixel_values=pixel_values)
         image_embeddings = vision_output[0]
         intermediate_embeddings = vision_output[1]
         return image_embeddings, intermediate_embeddings

     @torch.no_grad()
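The SAM-HQ hunks drop docstring entries for flags the code never forwards (`output_attentions`, `output_hidden_states`, `return_dict`) and collapse the vision-encoder call to a single line; the method's contract, returning a plain `(image_embeddings, intermediate_embeddings)` pair, is unchanged. A self-contained sketch of that contract with a toy encoder (every name except `vision_encoder` is illustrative, not the SAM-HQ API):

```python
import torch
import torch.nn as nn


class ToyVisionEncoder(nn.Module):
    """Stand-in for SAM-HQ's vision encoder: yields embeddings plus intermediates."""

    def forward(self, pixel_values: torch.Tensor):
        embeddings = pixel_values.mean(dim=(2, 3))  # fake image embeddings
        intermediates = [embeddings * 0.5]          # fake intermediate features
        return embeddings, intermediates


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.vision_encoder = ToyVisionEncoder()

    def get_image_embeddings(self, pixel_values):
        # Mirrors the simplified call site: pixel values only, no output flags.
        vision_output = self.vision_encoder(pixel_values=pixel_values)
        image_embeddings = vision_output[0]
        intermediate_embeddings = vision_output[1]
        return image_embeddings, intermediate_embeddings


model = ToyModel()
emb, inter = model.get_image_embeddings(torch.randn(1, 3, 8, 8))
print(emb.shape, len(inter))  # torch.Size([1, 3]) 1
```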