Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 10:12:23 +06:00)
fix stupid kosmos2
This commit is contained in:
parent
0f3c368384
commit
3cba8ac3f3
@@ -1011,7 +1011,6 @@ class Kosmos2TextTransformer(nn.Module):
        return hidden_states

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
@@ -1028,7 +1027,6 @@ class Kosmos2TextTransformer(nn.Module):
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1036,7 +1034,6 @@ class Kosmos2TextTransformer(nn.Module):
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
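The hunks above show the idiom these forwards share: every per-call flag falls back to the model config when the caller leaves it as `None`, and mutually exclusive inputs fail fast. A minimal self-contained sketch of the same pattern (class and field names here are illustrative, not from this commit):

```python
from typing import Optional

import torch
from torch import nn


class TinyTextTransformer(nn.Module):
    # Illustrative stand-in for the Kosmos2TextTransformer pattern above.
    def __init__(self, config):
        super().__init__()
        self.config = config

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
    ):
        # None means "use the config default", so callers can override per call.
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        # Exactly one of input_ids / inputs_embeds may be provided.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        ...
```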
@@ -1307,7 +1304,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
@@ -1340,7 +1336,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs,
        )
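`@can_return_tuple` is the decorator transformers uses so that `return_dict` handling does not have to be duplicated inside each `forward` body. A toy re-implementation of the mechanism, assuming only that the decorated `forward` returns a `ModelOutput`-style object with a `to_tuple()` method; this is a sketch of the idea, not the library's actual code:

```python
import functools


def can_return_tuple(forward):
    # Toy sketch: default return_dict from the config, then convert the
    # ModelOutput-style result to a plain tuple when return_dict is False.
    @functools.wraps(forward)
    def wrapper(self, *args, return_dict=None, **kwargs):
        if return_dict is None:
            return_dict = getattr(self.config, "use_return_dict", True)
        output = forward(self, *args, **kwargs)
        return output if return_dict else output.to_tuple()

    return wrapper
```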
@@ -941,6 +941,9 @@ class MoonshineModel(MoonshinePreTrainedModel):
             `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
             `input_values`, the [`AutoFeatureExtractor`] should be used for padding
             and conversion into a tensor of type `torch.FloatTensor`.
+        decoder_position_ids (<fill_type>):
+            <fill_docstring>
+
         Example:
 
         ```python
@@ -1058,6 +1061,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixin):
             `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
             `input_values`, the [`AutoFeatureExtractor`] should be used for padding
             and conversion into a tensor of type `torch.FloatTensor`.
+        decoder_position_ids (<fill_type>):
+            <fill_docstring>
 
         Example:
 
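Both Moonshine hunks add a `decoder_position_ids` entry whose `<fill_type>` / `<fill_docstring>` placeholders are left for the docstring tooling to complete. For the part that is already documented, preparing `input_values` goes roughly like this (the checkpoint id is an assumption, not taken from this commit):

```python
import numpy as np
from transformers import AutoFeatureExtractor

# Load the waveform as a float numpy array, e.g. with soundfile:
#   import soundfile as sf
#   speech, sampling_rate = sf.read("sample.wav")
speech = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz

# Assumed checkpoint id; any Moonshine checkpoint on the Hub should behave the same.
feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")

# Pads the array and converts it into a torch.FloatTensor, as the docstring describes.
inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt")
input_values = inputs.input_values
```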
@@ -885,8 +885,6 @@ class SamHQMaskDecoder(nn.Module):
                 Whether to use only the high-quality token output or combine with SAM output.
             intermediate_embeddings (`torch.Tensor`):
                 Intermediate embeddings from the vision encoder for feature fusion.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers.
             attention_similarity (`torch.Tensor`, *optional*):
                 Optional tensor for attention similarity computation.
             target_embedding (`torch.Tensor`, *optional*):
@@ -1286,20 +1284,10 @@ class SamHQModel(SamHQPreTrainedModel):
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                 Input pixel values
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-
         """
-        vision_output = self.vision_encoder(
-            pixel_values=pixel_values,
-        )
+        vision_output = self.vision_encoder(pixel_values=pixel_values)
         image_embeddings = vision_output[0]
         intermediate_embeddings = vision_output[1]

         return image_embeddings, intermediate_embeddings

     @torch.no_grad()
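The simplified `get_image_embeddings` keeps the SAM-style two-stage flow: run the vision encoder once per image, then reuse both returned tensors for any number of prompts. A hedged usage sketch; the checkpoint id and the keyword names in the second call are assumptions based on the SAM-family API, not shown in this diff:

```python
import torch
from transformers import SamHQModel

# Assumed checkpoint id; substitute any SAM-HQ checkpoint from the Hub.
model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base")
pixel_values = torch.randn(1, 3, 1024, 1024)  # illustrative shape

# Encode once; both tensors come straight from the vision encoder.
with torch.no_grad():
    image_embeddings, intermediate_embeddings = model.get_image_embeddings(pixel_values)

# Reuse the cached embeddings instead of re-encoding the image per prompt
# (argument names assumed from the SAM-family forward signature).
outputs = model(
    input_points=torch.tensor([[[[450.0, 600.0]]]]),
    image_embeddings=image_embeddings,
    intermediate_embeddings=intermediate_embeddings,
)
```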