Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-02 04:10:06 +06:00)
Add support for auto_docstring with model outputs (#38242)
* experiment auto_docstring model outputs
* Fix PatchTSMixer
* Add check model output docstring to check_auto_docstring and fix all model outputs docstring
* add reordering of docstring in check_docstrings
* add check for redundant docstring in check_docstrings, remove redundant docstrings
* refactor check_auto_docstring
* make style
* fix copies
* remove commented code
* change List-> list Tuple-> tuple in docstrings
* fix modular
* make style
* Fix modular vipllava

---------

Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co>
This commit is contained in:
parent 0c98f24889
commit b6b4d43d6d
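Every hunk below follows the same pattern: the hand-written intro and `Args:` header in each `ModelOutput` docstring are replaced by an `@auto_docstring` decorator (optionally carrying a `custom_intro`), and the class keeps an `r"""` docstring only for its model-specific fields, with the standard ones (`hidden_states`, `attentions`, ...) documented automatically. The sketch below shows that before/after shape on a hypothetical `ExampleModelOutput`; the `transformers.utils` import path is an assumption, and whether the decorator is meant to be applied outside the library's own modeling files is not verified here.

```python
from dataclasses import dataclass
from typing import Optional

import torch

# Assumed import location; inside the transformers source tree these come from
# the package-internal `...utils` module instead.
from transformers.utils import ModelOutput, auto_docstring


# Before: the intro and every argument are documented by hand in the class docstring.
@dataclass
class ExampleModelOutputBefore(ModelOutput):
    """
    Output type of a hypothetical `ExampleModel`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
    """

    loss: Optional[torch.FloatTensor] = None


# After: the intro moves into the decorator and only the class-specific field keeps an
# explicit entry; the common output arguments are filled in by auto_docstring.
@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `ExampleModel`.
    """
)
class ExampleModelOutputAfter(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    """

    loss: Optional[torch.FloatTensor] = None
```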
@@ -570,11 +570,13 @@ class AlbertPreTrainedModel(PreTrainedModel):


@dataclass
class AlbertForPreTrainingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`AlbertForPreTraining`].

Args:
"""
)
class AlbertForPreTrainingOutput(ModelOutput):
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
@@ -583,17 +585,6 @@ class AlbertForPreTrainingOutput(ModelOutput):
sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@@ -40,20 +40,15 @@ logger = logging.get_logger(__name__)


@dataclass
class AlignVisionModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

Args:
"""
)
class AlignVisionModelOutput(ModelOutput):
r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""

image_embeds: Optional[torch.FloatTensor] = None
@@ -62,26 +57,15 @@ class AlignVisionModelOutput(ModelOutput):


@dataclass
class AlignTextModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for text model's outputs that also contains a pooling of the last hidden states.

Args:
"""
)
class AlignTextModelOutput(ModelOutput):
r"""
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The text embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

text_embeds: Optional[torch.FloatTensor] = None
@@ -91,15 +75,15 @@ class AlignTextModelOutput(ModelOutput):


@dataclass
@auto_docstring
class AlignOutput(ModelOutput):
"""
Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
@@ -53,10 +53,10 @@ def clip_loss(similarity: torch.Tensor) -> torch.Tensor:


@dataclass
@auto_docstring
# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->AltCLIP
class AltCLIPOutput(ModelOutput):
"""
Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
@@ -963,11 +963,13 @@ class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin):


@dataclass
class AriaCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Aria causal language model (or autoregressive) outputs.

Args:
"""
)
class AriaCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@@ -978,17 +980,6 @@ class AriaCausalLMOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@@ -1003,30 +994,19 @@ class AriaCausalLMOutputWithPast(ModelOutput):


@dataclass
class AriaModelOutputWithPast(BaseModelOutputWithPast):
"""
@auto_docstring(
custom_intro="""
Base class for Aria outputs, with hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class AriaModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@@ -46,11 +46,13 @@ logger = logging.get_logger(__name__)


@dataclass
class AutoFormerDecoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

Args:
"""
)
class AutoFormerDecoderOutput(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.

@@ -67,17 +69,6 @@ class AutoFormerDecoderOutput(ModelOutput):
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
@@ -95,11 +86,13 @@ class AutoFormerDecoderOutput(ModelOutput):


@dataclass
class AutoformerModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Autoformer model output that contains the additional trend output.

Args:
"""
)
class AutoformerModelOutput(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.

@@ -114,36 +107,6 @@ class AutoformerModelOutput(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
Shift values of each time series' context window which is used to give the model inputs of the same
magnitude and then used to shift back to the original magnitude.
@@ -1795,6 +1758,14 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
Transformer requires to provide additional features.

The Autoformer only learns additional embeddings for `static_categorical_features`.
future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
in `[0, 1]`:

- 1 for values that are **observed**,
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

This mask is used to filter out missing values for the final loss calculation.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

@@ -1804,14 +1775,6 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
in `[0, 1]`:

- 1 for values that are **observed**,
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

This mask is used to filter out missing values for the final loss calculation.

Examples:
@@ -117,11 +117,13 @@ class AyaVisionPreTrainedModel(PreTrainedModel):


@dataclass
class AyaVisionCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for AyaVision causal language model (or autoregressive) outputs.

Args:
"""
)
class AyaVisionCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@@ -132,17 +134,6 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@@ -157,30 +148,19 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):


@dataclass
class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
"""
@auto_docstring(
custom_intro="""
Base class for AyaVision outputs, with hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@@ -44,39 +44,19 @@ from .configuration_beit import BeitConfig

logger = logging.get_logger(__name__)

# General docstring

# Base docstring
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "microsoft/beit-base-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"


@dataclass
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
"""
@auto_docstring(
custom_intro="""
Class for outputs of [`BeitModel`].

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
will be returned.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
@@ -805,11 +805,13 @@ class BertPreTrainedModel(PreTrainedModel):


@dataclass
class BertForPreTrainingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`BertForPreTraining`].

Args:
"""
)
class BertForPreTrainingOutput(ModelOutput):
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
@@ -818,17 +820,6 @@ class BertForPreTrainingOutput(ModelOutput):
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@@ -1744,11 +1744,13 @@ class BigBirdPreTrainedModel(PreTrainedModel):


@dataclass
class BigBirdForPreTrainingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`BigBirdForPreTraining`].

Args:
"""
)
class BigBirdForPreTrainingOutput(ModelOutput):
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
@@ -1757,17 +1759,6 @@ class BigBirdForPreTrainingOutput(ModelOutput):
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@@ -1778,30 +1769,17 @@ class BigBirdForPreTrainingOutput(ModelOutput):


@dataclass
class BigBirdForQuestionAnsweringModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of question answering models.

Args:
"""
)
class BigBirdForQuestionAnsweringModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`):
pooler output from BigBigModel
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@@ -49,20 +49,20 @@ def blip_loss(similarity: torch.Tensor) -> torch.Tensor:


@dataclass
class BlipForConditionalGenerationModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
last hidden states. This class also adds the loss term from the text decoder.

Args:
"""
)
class BlipForConditionalGenerationModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Language modeling loss from the text decoder.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
Prediction scores of the language modeling head of the text decoder model.
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
The image embeddings obtained after applying the Vision Transformer model to the input image.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
@@ -94,29 +94,18 @@ class BlipForConditionalGenerationModelOutput(ModelOutput):


@dataclass
class BlipTextVisionModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
last hidden states. This class also adds the loss term from the text decoder.

Args:
"""
)
class BlipTextVisionModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss from the text decoder.
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@@ -127,34 +116,23 @@ class BlipTextVisionModelOutput(ModelOutput):


@dataclass
class BlipImageTextMatchingModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
scores.

Args:
"""
)
class BlipImageTextMatchingModelOutput(ModelOutput):
r"""
itm_score (`torch.FloatTensor`):
The image-text similarity scores.
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss from the text decoder.
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
Last layer hidden-state of the vision of the vision-only branch of the model.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
question_embeds (`torch.FloatTensor`):
The question embeddings obtained by the text projection layer.
"""
@@ -170,15 +148,15 @@ class BlipImageTextMatchingModelOutput(ModelOutput):


@dataclass
@auto_docstring
class BlipOutput(ModelOutput):
"""
Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
@ -45,11 +45,13 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Blip2ForConditionalGenerationModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class defining the outputs of [`Blip2ForConditionalGeneration`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class Blip2ForConditionalGenerationModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Language modeling loss from the language model.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -78,9 +80,9 @@ class Blip2ForConditionalGenerationModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class Blip2ImageTextMatchingModelOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
@ -115,27 +117,16 @@ class Blip2ImageTextMatchingModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Blip2
|
||||
class Blip2TextModelOutput(ModelOutput):
|
||||
"""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
r"""
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
text_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -145,27 +136,16 @@ class Blip2TextModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Blip2
|
||||
class Blip2VisionModelOutput(ModelOutput):
|
||||
"""
|
||||
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
r"""
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
image_embeds: Optional[torch.FloatTensor] = None
|
||||
|
@@ -45,11 +45,13 @@ _TOKENIZER_FOR_DOC = "RobertaTokenizer"


@dataclass
class BridgeTowerModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`BridgeTowerModel`].

Args:
"""
)
class BridgeTowerModelOutput(ModelOutput):
r"""
text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
Sequence of hidden-states at the text output of the last layer of the model.
image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
@@ -57,16 +59,6 @@ class BridgeTowerModelOutput(ModelOutput):
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
token), respectively, after further processing through layers used for auxiliary pretraining tasks.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

text_features: Optional[torch.FloatTensor] = None
@@ -77,12 +69,14 @@ class BridgeTowerModelOutput(ModelOutput):


@dataclass
class BridgeTowerContrastiveOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of ['BridgeTowerForContrastiveLearning']

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`:
"""
)
class BridgeTowerContrastiveOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Image-text contrastive loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -92,10 +86,6 @@ class BridgeTowerContrastiveOutput(ModelOutput):
The image embeddings obtained by applying the projection layer to the pooler_output.
cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
@@ -40,28 +40,19 @@ logger = logging.get_logger(__name__)


@dataclass
class BrosSpadeOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of token classification models.

Args:
"""
)
class BrosSpadeOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
initial_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores for entity initial tokens (before SoftMax).
subsequent_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, sequence_length+1)`):
Classification scores for entity sequence tokens (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@@ -49,13 +49,15 @@ _PRIMES = [31, 43, 59, 61, 73, 97, 103, 113, 137, 149, 157, 173, 181, 193, 211,


@dataclass
class CanineModelOutputWithPooling(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
Transformer encoders.

Args:
"""
)
class CanineModelOutputWithPooling(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
shallow Transformer encoder).
@@ -52,15 +52,15 @@ def chinese_clip_loss(similarity: torch.Tensor) -> torch.Tensor:


@dataclass
@auto_docstring
class ChineseCLIPOutput(ModelOutput):
"""
Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
@ -122,27 +122,16 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Clap
|
||||
class ClapTextModelOutput(ModelOutput):
|
||||
"""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
r"""
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

text_embeds: Optional[torch.FloatTensor] = None
@ -152,26 +141,15 @@ class ClapTextModelOutput(ModelOutput):


@dataclass
class ClapAudioModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
ClapAudio model output to mimic the output of the original implementation.

Args:
"""
)
class ClapAudioModelOutput(ModelOutput):
r"""
audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
The Audio embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""

audio_embeds: Optional[torch.FloatTensor] = None
@ -181,10 +159,10 @@ class ClapAudioModelOutput(ModelOutput):


@dataclass
@auto_docstring
# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Clap, vision->audio, Vision->Audio, image->audio
class ClapOutput(ModelOutput):
"""
Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for audio-text similarity.
logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
@ -1931,11 +1909,11 @@ class ClapModel(ClapPreTrainedModel):
input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
the features.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.

Examples:
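Every hunk in this commit applies the same mechanical change: the hand-written `Args:` block on a `ModelOutput` subclass is replaced by the `@auto_docstring` decorator, any model-specific introduction moves into `custom_intro`, and only the non-standard fields stay in the raw `r"""` docstring. A minimal sketch of the resulting pattern, assuming a transformers build that includes this change; the `MyModelOutput` class, its fields, and their descriptions are invented for illustration and are not part of this diff:

from dataclasses import dataclass
from typing import Optional

import torch
from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `MyModel`.
    """
)
class MyModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Training loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification scores (before SoftMax).
    """

    # Only the model-specific fields are documented by hand above; judging by the
    # removals in this diff, the standard `hidden_states`/`attentions` entries are
    # supplied by the decorator.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None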
@ -57,26 +57,15 @@ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:


@dataclass
class CLIPVisionModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

Args:
"""
)
class CLIPVisionModelOutput(ModelOutput):
r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

image_embeds: Optional[torch.FloatTensor] = None
@ -86,26 +75,15 @@ class CLIPVisionModelOutput(ModelOutput):


@dataclass
class CLIPTextModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for text model's outputs that also contains a pooling of the last hidden states.

Args:
"""
)
class CLIPTextModelOutput(ModelOutput):
r"""
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
The text embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

text_embeds: Optional[torch.FloatTensor] = None
@ -115,9 +93,9 @@ class CLIPTextModelOutput(ModelOutput):


@dataclass
@auto_docstring
class CLIPOutput(ModelOutput):
"""
Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
@ -49,10 +49,10 @@ def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:


@dataclass
@auto_docstring
# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLIPSeg
class CLIPSegOutput(ModelOutput):
"""
Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
@ -87,18 +87,11 @@ class CLIPSegOutput(ModelOutput):


@dataclass
@auto_docstring
class CLIPSegDecoderOutput(ModelOutput):
"""
Args:
r"""
logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
Classification scores for each pixel.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""

logits: Optional[torch.FloatTensor] = None
@ -107,14 +100,21 @@ class CLIPSegDecoderOutput(ModelOutput):


@dataclass
@auto_docstring
class CLIPSegImageSegmentationOutput(ModelOutput):
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
...
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Binary cross entropy loss for segmentation.
logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
Classification scores for each pixel.
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
Conditional embeddings used for segmentation.
pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
Pooled output of the [`CLIPSegVisionModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`CLIPSegVisionModel`].
decoder_output (`CLIPSegDecoderOutput`):
The output of the [`CLIPSegDecoder`].
"""

loss: Optional[torch.FloatTensor] = None
@ -1260,15 +1260,15 @@ class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[tuple, CLIPSegOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
conditional_pixel_values (`torch.FloatTensor`, *optional*):
The pixel values of the conditional images.
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
The conditional embeddings for the query images. If provided, the model will use this instead of computing
the embeddings from the conditional_pixel_values.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:
@ -144,26 +144,20 @@ def _pad_extra_bos_eos_tokens(


@dataclass
class ClvpEncoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for CLVP encoder's outputs that contains a pooling of the last hidden states as well as a projection
output (a linear layer on top of the pooled output).

Args:
"""
)
class ClvpEncoderOutput(ModelOutput):
r"""
embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The hidden state of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Pooled output of the `last_hidden_state`.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""

embeds: Optional[torch.FloatTensor] = None
@ -174,9 +168,9 @@ class ClvpEncoderOutput(ModelOutput):


@dataclass
@auto_docstring
class ClvpOutput(ModelOutput):
"""
Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for speech-text similarity.
speech_ids (`torch.LongTensor`, *optional*):
@ -28,6 +28,7 @@ from ...utils import ModelOutput, auto_docstring, can_return_tuple
from .configuration_colpali import ColPaliConfig


@auto_docstring
class ColPaliPreTrainedModel(PreTrainedModel):
config_class = ColPaliConfig
base_model_prefix = "model"
@ -51,11 +52,13 @@ class ColPaliPreTrainedModel(PreTrainedModel):


@dataclass
class ColPaliForRetrievalOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for ColPali embeddings output.

Args:
"""
)
class ColPaliForRetrievalOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@ -66,17 +69,6 @@ class ColPaliForRetrievalOutput(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
@ -36,6 +36,7 @@ if is_torch_available():
import torch


@auto_docstring
class ColQwen2PreTrainedModel(PreTrainedModel):
config_class = ColQwen2Config
base_model_prefix = "model"
@ -62,11 +63,13 @@ class ColQwen2PreTrainedModel(PreTrainedModel):


@dataclass
class ColQwen2ForRetrievalOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for ColQwen2 embeddings output.

Args:
"""
)
class ColQwen2ForRetrievalOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@ -77,17 +80,6 @@ class ColQwen2ForRetrievalOutput(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@ -231,11 +231,13 @@ class ColQwen2PreTrainedModel(ColPaliPreTrainedModel):


@dataclass
class ColQwen2ForRetrievalOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for ColQwen2 embeddings output.

Args:
"""
)
class ColQwen2ForRetrievalOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@ -246,17 +248,6 @@ class ColQwen2ForRetrievalOutput(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@ -39,24 +39,16 @@ logger = logging.get_logger(__name__)


@dataclass
class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the Conditional DETR decoder. This class adds one attribute to
BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output
of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary
decoding losses.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
)
class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
r"""
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
@ -73,38 +65,18 @@ class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions):


@dataclass
class ConditionalDetrModelOutput(Seq2SeqModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to
Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder
layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding
losses.

Args:
"""
)
class ConditionalDetrModelOutput(Seq2SeqModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
@ -117,12 +89,14 @@ class ConditionalDetrModelOutput(Seq2SeqModelOutput):


@dataclass
@auto_docstring(
custom_intro="""
Output type of [`ConditionalDetrForObjectDetection`].
"""
)
# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDetr
class ConditionalDetrObjectDetectionOutput(ModelOutput):
"""
Output type of [`ConditionalDetrForObjectDetection`].

Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
@ -142,28 +116,6 @@ class ConditionalDetrObjectDetectionOutput(ModelOutput):
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""

loss: Optional[torch.FloatTensor] = None
@ -181,12 +133,14 @@ class ConditionalDetrObjectDetectionOutput(ModelOutput):


@dataclass
@auto_docstring(
custom_intro="""
Output type of [`ConditionalDetrForSegmentation`].
"""
)
# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDetr
class ConditionalDetrSegmentationOutput(ModelOutput):
"""
Output type of [`ConditionalDetrForSegmentation`].

Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
@ -212,28 +166,6 @@ class ConditionalDetrSegmentationOutput(ModelOutput):
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""

loss: Optional[torch.FloatTensor] = None
@ -1022,7 +954,6 @@ class MLP(nn.Module):


@auto_docstring

# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDetr
class ConditionalDetrPreTrainedModel(PreTrainedModel):
config_class = ConditionalDetrConfig
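Output classes that subclass an existing output type (as in the DETR-family hunks above) keep only the fields that are new or whose description differs from the base class in the raw docstring; the inherited entries are dropped. A rough sketch under the same assumptions as the earlier example, with an invented `MyDecoderOutput` class and field:

from dataclasses import dataclass
from typing import Optional

import torch
from transformers.modeling_outputs import BaseModelOutputWithCrossAttentions
from transformers.utils import auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Hypothetical decoder output that adds a stack of intermediate decoder activations to
    `BaseModelOutputWithCrossAttentions`.
    """
)
class MyDecoderOutput(BaseModelOutputWithCrossAttentions):
    r"""
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*):
        Intermediate decoder activations, i.e. the output of each decoder layer.
    """

    # The inherited fields (last_hidden_state, hidden_states, attentions, cross_attentions)
    # are not re-documented here; only the added attribute is.
    intermediate_hidden_states: Optional[torch.FloatTensor] = None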
@ -46,11 +46,13 @@ logger = logging.get_logger(__name__)


@dataclass
class CsmOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for the model autoregressive outputs.

Args:
"""
)
class CsmOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -61,17 +63,6 @@ class CsmOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -46,11 +46,13 @@ logger = logging.get_logger(__name__)


@dataclass
class CsmOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for the model autoregressive outputs.

Args:
"""
)
class CsmOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -61,17 +63,6 @@ class CsmOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -33,19 +33,15 @@ logger = logging.get_logger(__name__)


@dataclass
class BaseModelOutputWithCLSToken(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for model's outputs, with potential hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class BaseModelOutputWithCLSToken(ModelOutput):
r"""
cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
Classification token at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -433,11 +433,13 @@ class DFineDecoderLayer(nn.Module):


@dataclass
class DFineModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the RT-DETR encoder-decoder model.

Args:
"""
)
class DFineModelOutput(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
@ -446,28 +448,10 @@ class DFineModelOutput(ModelOutput):
Stacked intermediate logits (logits of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
initial_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points used for the first decoder layer.
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points sent through the Transformer decoder.
enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
@ -483,7 +467,7 @@ class DFineModelOutput(ModelOutput):
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
Logits of predicted bounding boxes coordinates in the first stage.
denoising_meta_values (`dict`):
Extra dictionary for the denoising related values
Extra dictionary for the denoising related values.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -507,11 +491,13 @@ class DFineModelOutput(ModelOutput):


@dataclass
class DFineObjectDetectionOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`DFineForObjectDetection`].

Args:
"""
)
class DFineObjectDetectionOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
@ -541,28 +527,6 @@ class DFineObjectDetectionOutput(ModelOutput):
Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked initial reference points (initial reference points of each layer of the decoder).
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points sent through the Transformer decoder.
enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
@ -1008,16 +972,16 @@ class DFineIntegral(nn.Module):


@dataclass
class DFineDecoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the DFineDecoder. This class adds two attributes to
BaseModelOutputWithCrossAttentions, namely:
- a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
- a stacked tensor of intermediate reference points.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class DFineDecoderOutput(ModelOutput):
r"""
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
@ -1028,14 +992,6 @@ class DFineDecoderOutput(ModelOutput):
Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked initial reference points (initial reference points of each layer of the decoder).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
@ -39,25 +39,17 @@ logger = logging.get_logger(__name__)


@dataclass
# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrDecoderOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
class DabDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the Conditional DETR decoder. This class adds one attribute to
BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output
of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary
decoding losses.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
)
# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrDecoderOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
class DabDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
r"""
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
@ -74,39 +66,19 @@ class DabDetrDecoderOutput(BaseModelOutputWithCrossAttentions):


@dataclass
# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrModelOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
class DabDetrModelOutput(Seq2SeqModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to
Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder
layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding
losses.

Args:
"""
)
# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrModelOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
class DabDetrModelOutput(Seq2SeqModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
@ -119,12 +91,14 @@ class DabDetrModelOutput(Seq2SeqModelOutput):


@dataclass
@auto_docstring(
custom_intro="""
Output type of [`DabDetrForObjectDetection`].
"""
)
# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->DabDetr
class DabDetrObjectDetectionOutput(ModelOutput):
"""
Output type of [`DabDetrForObjectDetection`].

Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
@ -144,28 +118,6 @@ class DabDetrObjectDetectionOutput(ModelOutput):
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
"""

loss: Optional[torch.FloatTensor] = None
@ -29,9 +29,9 @@ from .configuration_dac import DacConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class DacOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.Tensor`):
|
||||
Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
|
||||
audio_values (`torch.Tensor` of shape `(batch_size, input_length)`):
|
||||
@ -52,9 +52,9 @@ class DacOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class DacEncoderOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.Tensor`):
|
||||
Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
|
||||
quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*):
|
||||
@ -72,10 +72,10 @@ class DacEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
# Copied from transformers.models.encodec.modeling_encodec.EncodecDecoderOutput with Encodec->Dac, segment_length->input_length
|
||||
class DacDecoderOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
r"""
|
||||
audio_values (`torch.FloatTensor` of shape `(batch_size, input_length)`, *optional*):
|
||||
Decoded audio values, obtained using the decoder part of Dac.
|
||||
"""
|
||||
|
@ -43,29 +43,18 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`Data2VecVisionModel`].
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.beit.modeling_beit.BeitModelOutputWithPooling with Beit->Data2VecVision
|
||||
class Data2VecVisionModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
"""
|
||||
Class for outputs of [`Data2VecVisionModel`].
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
|
||||
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
|
||||
will be returned.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
|
||||
|
@ -711,30 +711,19 @@ class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DecisionTransformerOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class DecisionTransformerOutput(ModelOutput):
|
||||
r"""
|
||||
state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`):
|
||||
Environment state predictions
|
||||
action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`):
|
||||
Model action predictions
|
||||
return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`):
|
||||
Predicted returns for each state
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
state_preds: Optional[torch.FloatTensor] = None
|
||||
|
@ -108,28 +108,20 @@ class MultiScaleDeformableAttention(nn.Module):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeformableDetrDecoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the DeformableDetrDecoder. This class adds two attributes to
|
||||
BaseModelOutputWithCrossAttentions, namely:
|
||||
- a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
|
||||
- a stacked tensor of intermediate reference points.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class DeformableDetrDecoderOutput(ModelOutput):
|
||||
r"""
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
|
||||
@ -145,11 +137,13 @@ class DeformableDetrDecoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeformableDetrModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the Deformable DETR encoder-decoder model.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DeformableDetrModelOutput(ModelOutput):
|
||||
r"""
|
||||
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||
Initial reference points sent through the Transformer decoder.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
|
||||
@ -158,28 +152,6 @@ class DeformableDetrModelOutput(ModelOutput):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
|
||||
num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
|
||||
average in the self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
|
||||
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
|
||||
weighted average in the cross-attention heads.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
|
||||
layer plus the initial embedding outputs.
|
||||
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
|
||||
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
||||
self-attention heads.
|
||||
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
|
||||
Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
|
||||
picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
|
||||
@ -203,11 +175,13 @@ class DeformableDetrModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeformableDetrObjectDetectionOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`DeformableDetrForObjectDetection`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DeformableDetrObjectDetectionOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
|
||||
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
|
||||
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
|
||||
@ -225,36 +199,14 @@ class DeformableDetrObjectDetectionOutput(ModelOutput):
|
||||
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
|
||||
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
|
||||
`pred_boxes`) for each decoder layer.
|
||||
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||
Initial reference points sent through the Transformer decoder.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
|
||||
num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
|
||||
average in the self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
|
||||
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
|
||||
weighted average in the cross-attention heads.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
|
||||
layer plus the initial embedding outputs.
|
||||
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
|
||||
4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
|
||||
in the self-attention heads.
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||
Initial reference points sent through the Transformer decoder.
|
||||
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
|
||||
Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
|
||||
picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
|
||||
|
@ -807,11 +807,13 @@ class DeiTForImageClassification(DeiTPreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`DeiTForImageClassificationWithTeacher`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
|
||||
r"""
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Prediction scores as the average of the cls_logits and distillation logits.
|
||||
cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
@ -820,14 +822,6 @@ class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
|
||||
distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
|
||||
distillation token).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
"""
|
||||
|
||||
logits: Optional[torch.FloatTensor] = None
|
||||
|
@ -32,26 +32,17 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DepthProOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for DepthPro's outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DepthProOutput(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
features (`Union[torch.FloatTensor, list[torch.FloatTensor]]`, *optional*):
|
||||
features (`Union[torch.FloatTensor, List[torch.FloatTensor]]`, *optional*):
|
||||
Features from encoders. Can be a single feature or a list of features.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer and the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, n_patches_per_batch, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -61,28 +52,17 @@ class DepthProOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DepthProDepthEstimatorOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for DepthProForDepthEstimation's output.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DepthProDepthEstimatorOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
|
||||
Predicted depth for each pixel.
|
||||
field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided):
|
||||
Field of View Scaler.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer and the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, n_patches_per_batch, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -45,23 +45,15 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
|
||||
namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
|
||||
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
"""
|
||||
)
|
||||
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
|
||||
r"""
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
|
||||
@ -75,37 +67,17 @@ class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DetrModelOutput(Seq2SeqModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
|
||||
namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
|
||||
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DetrModelOutput(Seq2SeqModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
|
||||
layer plus the initial embedding outputs.
|
||||
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
|
||||
weighted average in the self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
|
||||
used to compute the weighted average in the cross-attention heads.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
|
||||
layer plus the initial embedding outputs.
|
||||
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
|
||||
weighted average in the self-attention heads.
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
|
||||
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
|
||||
layernorm.
|
||||
@ -115,11 +87,13 @@ class DetrModelOutput(Seq2SeqModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DetrObjectDetectionOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`DetrForObjectDetection`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DetrObjectDetectionOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
|
||||
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
|
||||
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
|
||||
@ -139,28 +113,6 @@ class DetrObjectDetectionOutput(ModelOutput):
|
||||
`pred_boxes`) for each decoder layer.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
|
||||
layer plus the initial embedding outputs.
|
||||
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
|
||||
weighted average in the self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
|
||||
used to compute the weighted average in the cross-attention heads.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
|
||||
layer plus the initial embedding outputs.
|
||||
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
|
||||
weighted average in the self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -178,11 +130,13 @@ class DetrObjectDetectionOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DetrSegmentationOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`DetrForSegmentation`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DetrSegmentationOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
|
||||
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
|
||||
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
|
||||
@ -208,28 +162,6 @@ class DetrSegmentationOutput(ModelOutput):
|
||||
`pred_boxes`) for each decoder layer.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
|
||||
layer plus the initial embedding outputs.
|
||||
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
|
||||
weighted average in the self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
|
||||
used to compute the weighted average in the cross-attention heads.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
|
||||
layer plus the initial embedding outputs.
|
||||
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
|
||||
weighted average in the self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -57,24 +57,13 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DinatEncoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Dinat encoder's outputs, with potential hidden states and attentions.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class DinatEncoderOutput(ModelOutput):
|
||||
r"""
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
@ -90,26 +79,15 @@ class DinatEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DinatModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Dinat model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class DinatModelOutput(ModelOutput):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
|
||||
Average pooling of the last layer hidden-state.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
@ -126,26 +104,17 @@ class DinatModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DinatImageClassifierOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Dinat outputs for image classification.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DinatImageClassifierOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
|
@ -38,25 +38,14 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
DonutSwin encoder's outputs, with potential hidden states and attentions.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->DonutSwin
|
||||
class DonutSwinEncoderOutput(ModelOutput):
|
||||
"""
|
||||
DonutSwin encoder's outputs, with potential hidden states and attentions.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
r"""
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
@ -72,27 +61,16 @@ class DonutSwinEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
DonutSwin model's outputs that also contains a pooling of the last hidden states.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->DonutSwin
|
||||
class DonutSwinModelOutput(ModelOutput):
|
||||
"""
|
||||
DonutSwin model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
|
||||
Average pooling of the last layer hidden-state.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
@ -109,27 +87,18 @@ class DonutSwinModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
DonutSwin outputs for image classification.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.swin.modeling_swin.SwinImageClassifierOutput with Swin->DonutSwin
|
||||
class DonutSwinImageClassifierOutput(ModelOutput):
|
||||
"""
|
||||
DonutSwin outputs for image classification.
|
||||
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
|
@ -40,26 +40,17 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DPRContextEncoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`DPRQuestionEncoder`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DPRContextEncoderOutput(ModelOutput):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
|
||||
The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
|
||||
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
|
||||
This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
pooler_output: torch.FloatTensor
|
||||
@ -68,26 +59,17 @@ class DPRContextEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DPRQuestionEncoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`DPRQuestionEncoder`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DPRQuestionEncoderOutput(ModelOutput):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
|
||||
The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
|
||||
hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
|
||||
This output is to be used to embed questions for nearest neighbors queries with context embeddings.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
pooler_output: torch.FloatTensor
|
||||
@ -96,11 +78,13 @@ class DPRQuestionEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class DPRReaderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`DPRQuestionEncoder`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class DPRReaderOutput(ModelOutput):
|
||||
r"""
|
||||
start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
|
||||
Logits of the start index of the span for each passage.
|
||||
end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
|
||||
@ -108,17 +92,6 @@ class DPRReaderOutput(ModelOutput):
|
||||
relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
|
||||
Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
|
||||
question, compared to all the other passages.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
start_logits: torch.FloatTensor
|
||||
|
@ -42,13 +42,15 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BaseModelOutputWithIntermediateActivations(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
|
||||
in the context of Vision models.:
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
"""
|
||||
)
|
||||
class BaseModelOutputWithIntermediateActivations(ModelOutput):
|
||||
r"""
|
||||
last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Intermediate activations that can be used to compute hidden states of the model at various layers.
|
||||
@ -59,30 +61,19 @@ class BaseModelOutputWithIntermediateActivations(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
|
||||
activations that can be used by the model at later stages.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Last layer hidden-state of the first token of the sequence (classification token) after further processing
|
||||
through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
|
||||
the classification token after processing through a linear layer and a tanh activation function. The linear
|
||||
layer weights are trained from the next sentence prediction (classification) objective during pretraining.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Intermediate activations that can be used to compute hidden states of the model at various layers.
|
||||
"""
|
||||
|
@ -667,26 +667,17 @@ class ElectraPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class ElectraForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`ElectraForPreTraining`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class ElectraForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss of the ELECTRA objective.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Prediction scores of the head (scores for each token before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -38,12 +38,12 @@ logger = logging.get_logger(__name__)


@dataclass
@auto_docstring
class EncodecOutput(ModelOutput):
"""
Args:
r"""
audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
Discret code embeddings computed using `model.encode`.
audio_values (`torch.FlaotTensor` of shape `(batch_size, sequence_length)`, *optional*)
audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
Decoded audio values, obtained using the decoder part of Encodec.
"""

@ -52,9 +52,9 @@ class EncodecOutput(ModelOutput):


@dataclass
@auto_docstring
class EncodecEncoderOutput(ModelOutput):
"""
Args:
r"""
audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
Discret code embeddings computed using `model.encode`.
audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
@ -66,9 +66,9 @@ class EncodecEncoderOutput(ModelOutput):


@dataclass
@auto_docstring
class EncodecDecoderOutput(ModelOutput):
"""
Args:
r"""
audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
Decoded audio values, obtained using the decoder part of Encodec.
"""
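
The Encodec hunks use the bare form of the decorator, with no `custom_intro`, since the class docstring needs nothing beyond its own fields. A minimal usage sketch follows; the class and field names are made up, the import path is an assumption based on how model files in this diff use the decorator, and applying `auto_docstring` to a class defined outside the library is assumed to behave the same way.

# Usage sketch only -- mirrors the pattern in this diff, with placeholder names.
from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring  # import path assumed


@dataclass
@auto_docstring
class MyCodecDecoderOutput(ModelOutput):
    r"""
    audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
        Decoded audio values, obtained using the decoder part of the codec.
    """

    audio_values: Optional[torch.FloatTensor] = None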
|
@ -647,12 +647,14 @@ class ErniePreTrainedModel(PreTrainedModel):


@dataclass
@auto_docstring(
custom_intro="""
Output type of [`ErnieForPreTraining`].
"""
)
# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->Ernie
class ErnieForPreTrainingOutput(ModelOutput):
"""
Output type of [`ErnieForPreTraining`].

Args:
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
@ -661,17 +663,6 @@ class ErnieForPreTrainingOutput(ModelOutput):
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
|
@ -53,11 +53,13 @@ logger = logging.get_logger(__name__)


@dataclass
class EsmForProteinFoldingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`EsmForProteinFoldingOutput`].

Args:
"""
)
class EsmForProteinFoldingOutput(ModelOutput):
r"""
frames (`torch.FloatTensor`):
Output frames.
sidechain_frames (`torch.FloatTensor`):
|
@ -492,24 +492,19 @@ class FalconMambaPreTrainedModel(PreTrainedModel):


@dataclass
@auto_docstring(
custom_intro="""
Class for the FALCONMAMBA model outputs.
"""
)
# Copied from transformers.models.mamba.modeling_mamba.MambaOutput with MAMBA->FALCONMAMBA,Mamba->FalconMamba,FalconMambaCache->MambaCache
class FalconMambaOutput(ModelOutput):
"""
Class for the FALCONMAMBA model outputs.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
r"""
cache_params (`MambaCache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.

Includes both the State space model state matrices after the selective scan, and the Convolutional states
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -518,12 +513,14 @@ class FalconMambaOutput(ModelOutput):


@dataclass
@auto_docstring(
custom_intro="""
Base class for causal language model (or autoregressive) outputs.
"""
)
# Copied from transformers.models.mamba.modeling_mamba.MambaCausalLMOutput with Mamba->FalconMamba,FalconMambaCache->MambaCache
class FalconMambaCausalLMOutput(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.

Args:
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -533,11 +530,6 @@ class FalconMambaCausalLMOutput(ModelOutput):
avoid providing the old `input_ids`.

Includes both the State space model state matrices after the selective scan, and the Convolutional states
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
"""

loss: Optional[torch.FloatTensor] = None
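
The FalconMamba hunks above also show how the decorator coexists with `# Copied from` classes: the intro moves into `custom_intro`, the copy marker stays directly above the class, and standard entries such as `hidden_states` disappear from the class docstring because the decorator supplies them. Sketched below with placeholder names; `MyCache` and the `# Copied from` target are invented for the example, and the import path is assumed as in the previous sketch.

# Usage sketch only -- placeholder names, mirroring the ordering used in this diff.
from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring  # import path assumed


@dataclass
@auto_docstring(
    custom_intro="""
    Class for the MYMODEL model outputs.
    """
)
# Copied from transformers.models.someothermodel.modeling_someothermodel.SomeOtherModelOutput with SomeOtherModel->MyModel
class MyModelOutput(ModelOutput):
    r"""
    cache_params (`MyCache`):
        The state of the model at the last time step. Can be used in a forward method with the next
        `input_ids` to avoid providing the old `input_ids`.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    cache_params: Optional["MyCache"] = None  # `MyCache` is a placeholder type for this sketch
    hidden_states: Optional[tuple] = None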
|
@ -35,46 +35,21 @@ logger = logging.get_logger(__name__)


@dataclass
class FastSpeech2ConformerModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`FastSpeech2ConformerModel`].

Args:
"""
)
class FastSpeech2ConformerModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Spectrogram generation loss.
spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
The predicted spectrogram.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
Outputs of the duration predictor.
pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the pitch predictor.
energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the energy predictor.

"""

loss: Optional[torch.FloatTensor] = None
@ -90,47 +65,23 @@ class FastSpeech2ConformerModelOutput(ModelOutput):


@dataclass
class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`FastSpeech2ConformerWithHifiGan`].

Args:
waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Speech output as a result of passing the predicted mel spectrogram through the vocoder.
"""
)
class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Spectrogram generation loss.
spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
The predicted spectrogram.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
Outputs of the duration predictor.
pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the pitch predictor.
energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
Outputs of the energy predictor.
waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Speech output as a result of passing the predicted mel spectrogram through the vocoder.
"""

waveform: Optional[torch.FloatTensor] = None
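
The HiFi-GAN hunk above illustrates the subclass case: `FastSpeech2ConformerWithHifiGanOutput` keeps the parent's field entries in its own `r"""` block and appends the one field it adds (`waveform`), rather than relying on docstring inheritance. Reduced to placeholder names, the shape is roughly as sketched below; all classes here are invented for illustration and the import path is assumed as in the earlier sketches.

# Usage sketch only -- placeholder names, not real library classes.
from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring  # import path assumed


@dataclass
@auto_docstring
class MySpectrogramOutput(ModelOutput):
    r"""
    spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
        The predicted spectrogram.
    """

    spectrogram: Optional[torch.FloatTensor] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical model that also runs a vocoder on the predicted spectrogram.
    """
)
class MySpectrogramWithVocoderOutput(MySpectrogramOutput):
    r"""
    spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
        The predicted spectrogram.
    waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
        Speech output as a result of passing the predicted spectrogram through the vocoder.
    """

    waveform: Optional[torch.FloatTensor] = None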
|
@ -246,12 +246,14 @@ class FlaubertPredLayer(nn.Module):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of question answering models using a [`~modeling_utils.FlaubertSQuADHead`].
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.xlm.modeling_xlm.XLMSquadHeadOutput with XLM->Flaubert
|
||||
class FlaubertSquadHeadOutput(ModelOutput):
|
||||
"""
|
||||
Base class for outputs of question answering models using a [`~modeling_utils.FlaubertSQuADHead`].
|
||||
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
|
||||
Classification loss as the sum of start token, end token (and is_impossible if provided) classification
|
||||
losses.
|
||||
@ -266,7 +268,6 @@ class FlaubertSquadHeadOutput(ModelOutput):
|
||||
Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
|
||||
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
|
||||
Log probabilities for the `is_impossible` label of the answers.
|
||||
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -815,6 +816,14 @@ class FlaubertModel(FlaubertPreTrainedModel):
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple, BaseModelOutput]:
|
||||
r"""
|
||||
langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
|
||||
languages ids which can be obtained from the language names by using two conversion mappings provided in
|
||||
the configuration of the model (only provided for multilingual models). More precisely, the *language name
|
||||
to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
|
||||
*language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
|
||||
|
||||
See usage examples detailed in the [multilingual documentation](../multilingual).
|
||||
lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
|
||||
also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
|
||||
@ -824,14 +833,6 @@ class FlaubertModel(FlaubertPreTrainedModel):
|
||||
attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
|
||||
decoding. The dictionary object will be modified in-place during the forward pass to add newly computed
|
||||
hidden-states.
|
||||
langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
|
||||
languages ids which can be obtained from the language names by using two conversion mappings provided in
|
||||
the configuration of the model (only provided for multilingual models). More precisely, the *language name
|
||||
to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
|
||||
*language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
|
||||
|
||||
See usage examples detailed in the [multilingual documentation](../multilingual).
|
||||
"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
@ -1040,6 +1041,14 @@ class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple, MaskedLMOutput]:
|
||||
r"""
|
||||
langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
|
||||
languages ids which can be obtained from the language names by using two conversion mappings provided in
|
||||
the configuration of the model (only provided for multilingual models). More precisely, the *language name
|
||||
to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
|
||||
*language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
|
||||
|
||||
See usage examples detailed in the [multilingual documentation](../multilingual).
|
||||
lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
|
||||
also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
|
||||
@ -1053,14 +1062,6 @@ class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
|
||||
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
|
||||
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
|
||||
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
|
||||
langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
|
||||
languages ids which can be obtained from the language names by using two conversion mappings provided in
|
||||
the configuration of the model (only provided for multilingual models). More precisely, the *language name
|
||||
to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
|
||||
*language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
|
||||
|
||||
See usage examples detailed in the [multilingual documentation](../multilingual).
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
@ -1413,12 +1414,14 @@ class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of question answering models using a `SquadHead`.
|
||||
"""
|
||||
)
|
||||
# Copied from transformer.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput with XLM->Flaubert
|
||||
class FlaubertForQuestionAnsweringOutput(ModelOutput):
|
||||
"""
|
||||
Base class for outputs of question answering models using a `SquadHead`.
|
||||
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
|
||||
Classification loss as the sum of start token, end token (and is_impossible if provided) classification
|
||||
losses.
|
||||
@ -1433,17 +1436,6 @@ class FlaubertForQuestionAnsweringOutput(ModelOutput):
|
||||
Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
|
||||
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
|
||||
Log probabilities for the `is_impossible` label of the answers.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -49,15 +49,17 @@ FlavaPossibleConfigs = Union[FlavaTextConfig, FlavaImageConfig, FlavaMultimodalC
|
||||
|
||||
|
||||
@dataclass
|
||||
class FlavaModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output from FlavaModel containing embeddings and outputs from individual encoders.
|
||||
|
||||
Note that `image_embeddings` and `text_embeddigns` returned are similar to pooled output returned from a
|
||||
transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
|
||||
`text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class FlavaModelOutput(ModelOutput):
|
||||
r"""
|
||||
image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
|
||||
The image embeddings which are basically the pooled output of [`FlavaImageModel`].
|
||||
image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
|
||||
@ -87,23 +89,26 @@ class FlavaModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class representing pretraining losses from FLAVA model
|
||||
"""
|
||||
)
|
||||
class FlavaLosses(ModelOutput):
|
||||
"""Class representing pretraining losses from FLAVA model
|
||||
|
||||
Args:
|
||||
mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.:
|
||||
r"""
|
||||
mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.):
|
||||
Masked Image Modeling loss as used in BeIT calculated only for unimodal image data.
|
||||
mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.:
|
||||
mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.):
|
||||
Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
|
||||
itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.:
|
||||
itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.):
|
||||
Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
|
||||
masked pairs in FLAVA.
|
||||
global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.:
|
||||
global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.):
|
||||
Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
|
||||
data. This is calculated on unmasked images and texts.
|
||||
mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.:
|
||||
mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.):
|
||||
Masked Multimodal Modeling loss's image component calculated on paired image-text data.
|
||||
mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.:
|
||||
mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.):
|
||||
Masked Multimodal Modeling loss's text component calculated on paired image-text data.
|
||||
"""
|
||||
|
||||
@ -124,15 +129,17 @@ class FlavaLosses(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class FlavaForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.
|
||||
|
||||
Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
|
||||
transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
|
||||
`text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class FlavaForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
|
||||
Total loss calculated for this model.
|
||||
loss_info (`FlavaLosses`):
|
||||
@ -150,7 +157,6 @@ class FlavaForPreTrainingOutput(ModelOutput):
|
||||
The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
|
||||
multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
|
||||
The output of the [`FlavaMultimodalModel`].
|
||||
|
||||
image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
|
||||
The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
|
||||
to create masked images.
|
||||
@ -164,7 +170,6 @@ class FlavaForPreTrainingOutput(ModelOutput):
|
||||
The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
|
||||
multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
|
||||
The output of the [`FlavaMultimodalModel`].
|
||||
|
||||
mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
|
||||
The logits for MIM unimodal loss. Uses `book_masked_pos` to get masked patches. The flattened output is
|
||||
returned when `bool_masked_pos` has some of the patches masked.
|
||||
@ -173,12 +178,6 @@ class FlavaForPreTrainingOutput(ModelOutput):
|
||||
the tokens masked.
|
||||
itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
|
||||
The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
|
||||
mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape`(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
|
||||
The logits for MMM image multimodal loss. Uses `book_masked_pos` to get masked patches. The flattened
|
||||
output is returned when `bool_masked_pos` has some of the patches masked.
|
||||
mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(`(total_masked_seq_length, text_vocab_size)`), *optional*, returned when `pixel_values` and `input_ids_masked` are present):
|
||||
The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
|
||||
some of the tokens masked.
|
||||
contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
|
||||
`image_projection` and `text_projection` layers respectively. This represents the image-text similarity
|
||||
@ -187,6 +186,12 @@ class FlavaForPreTrainingOutput(ModelOutput):
|
||||
The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
|
||||
`text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
|
||||
texts.
|
||||
mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape`(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
|
||||
The logits for MMM image multimodal loss. Uses `book_masked_pos` to get masked patches. The flattened
|
||||
output is returned when `bool_masked_pos` has some of the patches masked.
|
||||
mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(`(total_masked_seq_length, text_vocab_size)`), *optional*, returned when `pixel_values` and `input_ids_masked` are present):
|
||||
The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
|
||||
some of the tokens masked.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -1207,12 +1212,12 @@ class FlavaModel(FlavaPreTrainedModel):
|
||||
[What are token type IDs?](../glossary#token-type-ids)
|
||||
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
|
||||
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
|
||||
skip_multimodal_encoder (*bool*, *optional*):
|
||||
Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
|
||||
image_attention_mask (`torch.Tensor` of shape `(batch_size, image_num_patches)`, *optional*):
|
||||
Mask to avoid performing attention on padding pixel values for image inputs. Mask values selected in `[0, 1]`:
|
||||
- 1 for pixel values that are real (i.e., **not masked**),
|
||||
- 0 for pixel values that are padding (i.e., **masked**).
|
||||
skip_multimodal_encoder (*bool*, *optional*):
|
||||
Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
|
||||
|
||||
Examples:
|
||||
|
||||
@ -1681,6 +1686,8 @@ class FlavaForPreTraining(FlavaPreTrainedModel):
|
||||
to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
|
||||
[`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
|
||||
codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*):
|
||||
Pixel values for image patches that are used to compute the image codebook labels for masked image modeling.
|
||||
token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
|
||||
1]`:
|
||||
@ -1714,8 +1721,6 @@ class FlavaForPreTraining(FlavaPreTrainedModel):
|
||||
The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.
|
||||
return_loss (`bool`, *optional*, default to None):
|
||||
Whether to return calculated loss or not.
|
||||
codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*):
|
||||
Pixel values for image patches that are used to compute the image codebook labels for masked image modeling.
|
||||
|
||||
Examples:
|
||||
```python
|
||||
|
@ -409,11 +409,13 @@ class FNetPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class FNetForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`FNetForPreTraining`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class FNetForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
@ -422,10 +424,6 @@ class FNetForPreTrainingOutput(ModelOutput):
|
||||
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -37,19 +37,13 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class FocalNetEncoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
FocalNet encoder's outputs, with potential hidden states.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
|
||||
"""
|
||||
)
|
||||
class FocalNetEncoderOutput(ModelOutput):
|
||||
r"""
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
@ -64,20 +58,15 @@ class FocalNetEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class FocalNetModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
FocalNet model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class FocalNetModelOutput(ModelOutput):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
|
||||
Average pooling of the last layer hidden-state.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
@ -93,20 +82,17 @@ class FocalNetModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class FocalNetMaskedImageModelingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
FocalNet masked image model outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class FocalNetMaskedImageModelingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
|
||||
Masked image modeling (MLM) loss.
|
||||
reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Reconstructed pixel values.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
@ -122,20 +108,17 @@ class FocalNetMaskedImageModelingOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class FocalNetImageClassifierOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
FocalNet outputs for image classification.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class FocalNetImageClassifierOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, hidden_size, height, width)`.
|
||||
|
@ -804,26 +804,17 @@ class FunnelClassificationHead(nn.Module):
|
||||
|
||||
|
||||
@dataclass
|
||||
class FunnelForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`FunnelForPreTraining`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class FunnelForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss of the ELECTRA-style objective.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Prediction scores of the head (scores for each token before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -290,12 +290,12 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
|
||||
image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
|
||||
Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
|
||||
hidden size of the model.
|
||||
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Tensor of indices of the image patches in the input_ids tensor.
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
|
||||
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Tensor of indices of the image patches in the input_ids tensor.
|
||||
|
||||
Examples:
|
||||
|
||||
|
@ -48,30 +48,19 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Gemma3ModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Gemma3 outputs, with hidden states and attentions.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class Gemma3ModelOutputWithPast(BaseModelOutputWithPast):
|
||||
r"""
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
@ -81,11 +70,13 @@ class Gemma3ModelOutputWithPast(BaseModelOutputWithPast):
|
||||
|
||||
|
||||
@dataclass
|
||||
class Gemma3CausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Gemma3 causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class Gemma3CausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
|
||||
@ -96,17 +87,6 @@ class Gemma3CausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
import copy
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
@ -346,12 +345,10 @@ class Gemma3Config(PretrainedConfig):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Gemma3ModelOutputWithPast(PaligemmaModelOutputWithPast):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class Gemma3CausalLMOutputWithPast(PaligemmaCausalLMOutputWithPast):
|
||||
pass
|
||||
|
||||
|
@ -49,27 +49,16 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Git
|
||||
class GitVisionModelOutput(ModelOutput):
|
||||
"""
|
||||
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
r"""
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
image_embeds: Optional[torch.FloatTensor] = None
|
||||
|
@ -289,27 +289,16 @@ class GotOcr2VisionLayer(GradientCheckpointingLayer):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GotOcr2VisionEncoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for got_ocr2 vision model's outputs that also contains image embeddings obtained by applying the projection
|
||||
layer to the pooler_output.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class GotOcr2VisionEncoderOutput(ModelOutput):
|
||||
r"""
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
image_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -505,11 +494,13 @@ class GotOcr2MultiModalProjector(nn.Module):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GotOcr2CausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for GotOcr2 causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class GotOcr2CausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -520,17 +511,6 @@ class GotOcr2CausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
@ -545,30 +525,19 @@ class GotOcr2CausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GotOcr2ModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for GotOcr2 outputs, with hidden states and attentions.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class GotOcr2ModelOutputWithPast(BaseModelOutputWithPast):
|
||||
r"""
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
|
@ -597,11 +597,13 @@ class GPT2PreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GPT2DoubleHeadsModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of models predicting if two sentences are consecutive or not.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class GPT2DoubleHeadsModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss.
|
||||
mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
|
||||
@ -616,17 +618,6 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
GPT2Attentions weights after the attention softmax, used to compute the weighted average in the
|
||||
self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -33,11 +33,13 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class GraniteSpeechCausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for LlavaNext causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class GraniteSpeechCausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -48,17 +50,6 @@ class GraniteSpeechCausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -382,12 +373,12 @@ class GraniteSpeechForConditionalGeneration(GraniteSpeechPreTrainedModel, Genera
|
||||
The tensors corresponding to the input audios. input features can be obtained using
|
||||
[`AutoFeatureExtractor`]. See [`GraniteSpeechFeatureExtractor.__call__`] for details.
|
||||
[`GraniteSpeechProcessor`] uses [`GraniteSpeechFeatureExtractor`] for processing audio.
|
||||
input_features_mask (`torch.Tensor`, *optional*):
|
||||
Mask to be applied to audio features prior to scattering into the language embeddings.
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
||||
input_features_mask (`torch.Tensor`, *optional*):
|
||||
Mask to be applied to audio features prior to scattering into the language embeddings.
|
||||
"""
|
||||
# TODO (@alex-jw-brooks) add an example to this docstring once models are released
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
|
@ -102,28 +102,20 @@ class MultiScaleDeformableAttention(nn.Module):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GroundingDinoDecoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the GroundingDinoDecoder. This class adds two attributes to
|
||||
BaseModelOutputWithCrossAttentions, namely:
|
||||
- a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
|
||||
- a stacked tensor of intermediate reference points.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class GroundingDinoDecoderOutput(ModelOutput):
|
||||
r"""
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
||||
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
||||
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -134,13 +126,15 @@ class GroundingDinoDecoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GroundingDinoEncoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the GroundingDinoEncoder. This class extends BaseModelOutput, due to:
|
||||
- vision and text last hidden states
|
||||
- vision and text intermediate hidden states
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class GroundingDinoEncoderOutput(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the vision encoder.
|
||||
last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
@ -153,11 +147,6 @@ class GroundingDinoEncoderOutput(ModelOutput):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
|
||||
of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
|
||||
each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
||||
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
||||
weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
|
||||
multi-scale deformable attention heads.
|
||||
"""
|
||||
|
||||
last_hidden_state_vision: Optional[torch.FloatTensor] = None
|
||||
@ -168,11 +157,13 @@ class GroundingDinoEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GroundingDinoModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the Grounding DINO encoder-decoder model.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class GroundingDinoModelOutput(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||
@ -181,14 +172,6 @@ class GroundingDinoModelOutput(ModelOutput):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
||||
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
||||
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
|
||||
encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
@ -237,11 +220,13 @@ class GroundingDinoModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GroundingDinoObjectDetectionOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`GroundingDinoForObjectDetection`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class GroundingDinoObjectDetectionOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
|
||||
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
|
||||
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
|
||||
@ -261,14 +246,12 @@ class GroundingDinoObjectDetectionOutput(ModelOutput):
|
||||
`pred_boxes`) for each decoder layer.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
||||
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
||||
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
|
||||
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||
Initial reference points sent through the Transformer decoder.
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
@ -281,17 +264,6 @@ class GroundingDinoObjectDetectionOutput(ModelOutput):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
|
||||
of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
|
||||
each layer plus the initial embedding outputs.
|
||||
encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
||||
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
||||
weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
|
||||
multi-scale deformable attention heads.
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||
Initial reference points sent through the Transformer decoder.
|
||||
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
|
||||
Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
|
||||
region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
|
||||
|
@ -259,9 +259,9 @@ class GroupViTTokenAssign(nn.Module):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class GroupViTModelOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
@ -280,7 +280,6 @@ class GroupViTModelOutput(ModelOutput):
|
||||
original image size as post-processing. You should always check your logits shape and resize as needed.
|
||||
|
||||
</Tip>
|
||||
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of
|
||||
[`GroupViTTextModel`].
|
||||
|
@ -42,24 +42,13 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class HieraEncoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Hiera encoder's outputs, with potential hidden states and attentions.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Thesre are the unrolled hidden states of the model.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class HieraEncoderOutput(ModelOutput):
|
||||
r"""
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.
|
||||
@ -75,30 +64,19 @@ class HieraEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class HieraModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Hiera model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class HieraModelOutput(ModelOutput):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
|
||||
Average pooling of the last layer hidden-state.
|
||||
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
|
||||
Tensor indicating which patches are masked (0) and which are not (1).
|
||||
ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Tensor containing the original index of the (shuffled) masked patches.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.
|
||||
@ -117,11 +95,13 @@ class HieraModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class HieraForImageClassificationOutput(ImageClassifierOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Hiera image classification outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class HieraForImageClassificationOutput(ImageClassifierOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, `optional`):
|
||||
Loss value for the training task.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
|
||||
@ -153,11 +133,13 @@ class HieraForImageClassificationOutput(ImageClassifierOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class HieraForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for HieraForPreTraining's outputs, with potential hidden states and attentions.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class HieraForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`):
|
||||
Pixel reconstruction loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
|
||||
@ -166,14 +148,6 @@ class HieraForPreTrainingOutput(ModelOutput):
|
||||
Tensor indicating which patches are masked (0) and which are not (1).
|
||||
ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Tensor containing the original index of the (shuffled) masked patches.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, height, width, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
|
@ -52,11 +52,13 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class IdeficsBaseModelOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class IdeficsBaseModelOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
|
||||
@ -71,17 +73,6 @@ class IdeficsBaseModelOutputWithPast(ModelOutput):
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
|
||||
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
|
||||
input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
||||
sequence_length, hidden_size)`.
|
||||
@ -97,11 +88,13 @@ class IdeficsBaseModelOutputWithPast(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class IdeficsCausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Idefics causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class IdeficsCausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -112,17 +105,6 @@ class IdeficsCausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
||||
sequence_length, hidden_size)`.
|
||||
@ -1445,16 +1427,16 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin):
|
||||
**kwargs: Unpack[KwargsForCausalLM],
|
||||
) -> Union[tuple, IdeficsCausalLMOutputWithPast]:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
||||
image_encoder_embeddings (`torch.FloatTensor`, *optional*):
|
||||
The output of the image encoder.
|
||||
perceiver_embeddings (`torch.FloatTensor`, *optional*):
|
||||
The output of the perceiver resampler.
|
||||
image_attention_mask (`torch.LongTensor`, *optional*):
|
||||
The attention mask for the image encoder.
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
||||
|
||||
Example:
|
||||
|
||||
|
@ -39,10 +39,13 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Idefics2BaseModelOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Idefics2 model's outputs that may also contain a past key/values (to speed up sequential decoding).
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class Idefics2BaseModelOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
|
||||
@ -55,15 +58,6 @@ class Idefics2BaseModelOutputWithPast(ModelOutput):
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
|
||||
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
|
||||
input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
||||
sequence_length, hidden_size)`.
|
||||
@ -78,11 +72,14 @@ class Idefics2BaseModelOutputWithPast(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Idefics2 causal language model (or autoregressive) outputs.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Idefics2
|
||||
class Idefics2CausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
Base class for Idefics2 causal language model (or autoregressive) outputs.
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -92,15 +89,6 @@ class Idefics2CausalLMOutputWithPast(ModelOutput):
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
||||
sequence_length, hidden_size)`.
|
||||
|
@ -39,10 +39,13 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Idefics3BaseModelOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Idefics3 model's outputs that may also contain a past key/values (to speed up sequential decoding).
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class Idefics3BaseModelOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
|
||||
@ -55,15 +58,6 @@ class Idefics3BaseModelOutputWithPast(ModelOutput):
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
|
||||
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
|
||||
input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
||||
sequence_length, hidden_size)`.
|
||||
@ -78,11 +72,13 @@ class Idefics3BaseModelOutputWithPast(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class Idefics3CausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Idefics causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class Idefics3CausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -92,15 +88,6 @@ class Idefics3CausalLMOutputWithPast(ModelOutput):
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
||||
sequence_length, hidden_size)`.
|
||||
|
@ -44,12 +44,14 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class defining the outputs of [`InstructBlipForConditionalGeneration`].
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.blip_2.modeling_blip_2.Blip2ForConditionalGenerationModelOutput with Blip2->InstructBlip
|
||||
class InstructBlipForConditionalGenerationModelOutput(ModelOutput):
|
||||
"""
|
||||
Class defining the outputs of [`InstructBlipForConditionalGeneration`].
|
||||
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Language modeling loss from the language model.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
|
@ -1147,11 +1147,13 @@ class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Language modeling loss from the language model.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
|
@ -13,7 +13,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
@ -189,7 +188,6 @@ class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
|
||||
pass
|
||||
|
||||
|
@ -209,28 +209,17 @@ class InternVLVisionPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`InternVLVisionModel`].
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
|
||||
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
|
||||
will be returned.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
|
||||
@ -569,30 +558,19 @@ class InternVLMultiModalProjector(nn.Module):
|
||||
|
||||
|
||||
@dataclass
|
||||
class InternVLModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for InternVL outputs, with hidden states and attentions.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class InternVLModelOutputWithPast(BaseModelOutputWithPast):
|
||||
r"""
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
@ -805,11 +783,13 @@ class InternVLModel(InternVLPreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class InternVLCausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for InternVL causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class InternVLCausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -820,17 +800,6 @@ class InternVLCausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
|
@ -171,28 +171,17 @@ class InternVLVisionPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`InternVLVisionModel`].
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
|
||||
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
|
||||
will be returned.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
|
||||
|
@ -87,10 +87,13 @@ class JanusPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class JanusVQVAEOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Janus VQ-VAE mode model outputs.
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class JanusVQVAEOutput(ModelOutput):
|
||||
r"""
|
||||
decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
|
||||
Reconstructed pixel values after encoding and decoding the input.
|
||||
embedding_loss (`torch.FloatTensor`):
|
||||
@ -102,11 +105,13 @@ class JanusVQVAEOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class JanusBaseModelOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class JanusBaseModelOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
|
||||
@ -121,17 +126,6 @@ class JanusBaseModelOutputWithPast(ModelOutput):
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
|
||||
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
|
||||
input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
||||
sequence_length, hidden_size)`.
|
||||
@ -147,11 +141,13 @@ class JanusBaseModelOutputWithPast(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class JanusCausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Janus causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class JanusCausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -162,17 +158,6 @@ class JanusCausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
|
||||
sequence_length, hidden_size)`.
|
||||
|
@ -408,10 +408,13 @@ class JanusPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class JanusVQVAEOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Janus VQ-VAE mode model outputs.
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class JanusVQVAEOutput(ModelOutput):
|
||||
r"""
|
||||
decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
|
||||
Reconstructed pixel values after encoding and decoding the input.
|
||||
embedding_loss (`torch.FloatTensor`):
|
||||
@ -422,12 +425,10 @@ class JanusVQVAEOutput(ModelOutput):
|
||||
embedding_loss: torch.FloatTensor = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class JanusBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class JanusCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast):
|
||||
pass
|
||||
|
||||
|
@ -90,24 +90,22 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l


@dataclass
class Kosmos2ModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for text model's outputs that also contains a pooling of the last hidden states.
"""
)
class Kosmos2ModelOutput(ModelOutput):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
@ -118,15 +116,6 @@ class Kosmos2ModelOutput(ModelOutput):
the weighted average in the self-attention heads.
vision_model_output (`BaseModelOutputWithPooling`, *optional*):
The output of the [`Kosmos2VisionModel`].
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.

Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -145,26 +134,26 @@ class Kosmos2ModelOutput(ModelOutput):


@dataclass
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Model output class for `Kosmos2ForConditionalGeneration`.

Args:
"""
)
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
@ -175,15 +164,6 @@ class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
the weighted average in the self-attention heads.
vision_model_output (`BaseModelOutputWithPooling`, *optional*):
The output of the [`Kosmos2VisionModel`].
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.

Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
"""

loss: Optional[torch.FloatTensor] = None
@ -1332,6 +1312,8 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
**kwargs: Unpack[FlashAttentionKwargs],
) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
1]`:
@ -1343,8 +1325,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):

- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
"""
return self.model(
input_ids=input_ids,
@ -1423,6 +1403,8 @@ class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
1]`:
@ -1438,8 +1420,6 @@ class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
`[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@ -1794,12 +1774,12 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):

- 1 for places where to put the image features,
- 0 for places that are not for image features (i.e. for text tokens).
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
`[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

Examples:

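Note on the pattern: the hunks above and below all apply the same refactor, moving each output class's prose intro into `@auto_docstring(custom_intro=...)` and keeping only the model-specific fields in the class docstring. The following is a minimal sketch of that pattern on a made-up output class; the class name and fields are illustrative only, and it assumes `auto_docstring` and `ModelOutput` are importable from `transformers.utils`, as in recent versions of the library.

# Hypothetical example, not part of this diff.
from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `MyModelForPreTraining`.
    """
)
class MyModelForPreTrainingOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Total pre-training loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    """

    # Standard fields such as hidden_states and attentions are documented
    # automatically by the decorator and need no entry in the docstring above.
    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
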
@ -1132,19 +1132,14 @@ class LEDPreTrainedModel(PreTrainedModel):


@dataclass
@auto_docstring(
custom_intro="""
Base class for LEDEncoder's outputs, with potential hidden states, local and global attentions.
"""
)
# Copied from transformers.models.longformer.modeling_longformer.LongformerBaseModelOutput with Longformer->LEDEncoder
class LEDEncoderBaseModelOutput(ModelOutput):
"""
Base class for LEDEncoder's outputs, with potential hidden states, local and global attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
r"""
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.
@ -1176,12 +1171,14 @@ class LEDEncoderBaseModelOutput(ModelOutput):


@dataclass
class LEDSeq2SeqModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.

Args:
"""
)
class LEDSeq2SeqModelOutput(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.

@ -1193,36 +1190,6 @@ class LEDSeq2SeqModelOutput(ModelOutput):

Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.
@ -1244,11 +1211,13 @@ class LEDSeq2SeqModelOutput(ModelOutput):


@dataclass
class LEDSeq2SeqLMOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for sequence-to-sequence language models outputs.

Args:
"""
)
class LEDSeq2SeqLMOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -1259,36 +1228,6 @@ class LEDSeq2SeqLMOutput(ModelOutput):

Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.
@ -1311,11 +1250,13 @@ class LEDSeq2SeqLMOutput(ModelOutput):


@dataclass
class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of sequence-to-sequence sentence classification models.

Args:
"""
)
class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
@ -1326,36 +1267,6 @@ class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):

Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.
@ -1378,53 +1289,21 @@ class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):


@dataclass
class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of sequence-to-sequence question answering models.

Args:
"""
)
class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
num_heads, sequence_length, embed_size_per_head)`).

Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see `past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

@ -38,11 +38,13 @@ logger = logging.get_logger(__name__)


@dataclass
class LevitForImageClassificationWithTeacherOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`LevitForImageClassificationWithTeacher`].

Args:
"""
)
class LevitForImageClassificationWithTeacherOutput(ModelOutput):
r"""
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Prediction scores as the average of the `cls_logits` and `distillation_logits`.
cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
@ -51,10 +53,6 @@ class LevitForImageClassificationWithTeacherOutput(ModelOutput):
distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
distillation token).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
"""

logits: Optional[torch.FloatTensor] = None

@ -36,15 +36,17 @@ from .configuration_lightglue import LightGlueConfig


@dataclass
class LightGlueKeypointMatchingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of LightGlue keypoint matching models. Due to the nature of keypoint detection and matching,
the number of keypoints is not fixed and can vary from image to image, which makes batching non-trivial. In the
batch of images, the maximum number of matches is set as the dimension of the matches and matching scores. The mask
tensor is used to indicate which values in the keypoints, matches, matching_scores and prune tensors are keypoint
matching information.

Args:
"""
)
class LightGlueKeypointMatchingOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
Loss computed during training.
matches (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):

@ -37,9 +37,6 @@ from ..superpoint import SuperPointConfig

logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "LightGlueConfig"
_CHECKPOINT_FOR_DOC = "ETH-CVG/lightglue_superpoint"


class LightGlueConfig(PretrainedConfig):
r"""
@ -158,15 +155,17 @@ class LightGlueConfig(PretrainedConfig):


@dataclass
class LightGlueKeypointMatchingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of LightGlue keypoint matching models. Due to the nature of keypoint detection and matching,
the number of keypoints is not fixed and can vary from image to image, which makes batching non-trivial. In the
batch of images, the maximum number of matches is set as the dimension of the matches and matching scores. The mask
tensor is used to indicate which values in the keypoints, matches, matching_scores and prune tensors are keypoint
matching information.

Args:
"""
)
class LightGlueKeypointMatchingOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
Loss computed during training.
matches (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):

@ -719,11 +719,13 @@ class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):


@dataclass
class Llama4CausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Llava causal language model (or autoregressive) outputs.

Args:
"""
)
class Llama4CausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -734,17 +736,6 @@ class Llama4CausalLMOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

@ -36,30 +36,19 @@ logger = logging.get_logger(__name__)


@dataclass
class LlavaModelOutputWithPast(BaseModelOutputWithPast):
"""
@auto_docstring(
custom_intro="""
Base class for Llava outputs, with hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class LlavaModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@ -69,11 +58,13 @@ class LlavaModelOutputWithPast(BaseModelOutputWithPast):


@dataclass
class LlavaCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Llava causal language model (or autoregressive) outputs.

Args:
"""
)
class LlavaCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -84,17 +75,6 @@ class LlavaCausalLMOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

@ -145,30 +145,19 @@ def unpad_image(tensor, original_size):


@dataclass
class LlavaNextModelOutputWithPast(BaseModelOutputWithPast):
"""
@auto_docstring(
custom_intro="""
Base class for Llava outputs, with hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class LlavaNextModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@ -178,11 +167,13 @@ class LlavaNextModelOutputWithPast(BaseModelOutputWithPast):


@dataclass
class LlavaNextCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for LlavaNext causal language model (or autoregressive) outputs.

Args:
"""
)
class LlavaNextCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -193,17 +184,6 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

@ -43,34 +43,22 @@ logger = logging.get_logger(__name__)


@dataclass
class LlavaNextVideoModelOutputWithPast(BaseModelOutputWithPast):
"""
@auto_docstring(
custom_intro="""
Base class for Llava outputs, with hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class LlavaNextVideoModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

video_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@ -82,11 +70,13 @@ class LlavaNextVideoModelOutputWithPast(BaseModelOutputWithPast):


@dataclass
class LlavaNextVideoCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for LlavaNextVideo causal language model (or autoregressive) outputs.

Args:
"""
)
class LlavaNextVideoCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -97,21 +87,9 @@ class LlavaNextVideoCausalLMOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

video_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

@ -14,7 +14,6 @@
# limitations under the License.

import math
from dataclasses import dataclass
from typing import Optional, Union

import torch
@ -182,9 +181,17 @@ class LlavaNextVideoConfig(PretrainedConfig):
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


@dataclass
class LlavaNextVideoModelOutputWithPast(LlavaNextModelOutputWithPast):
"""
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
video_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@ -193,9 +200,21 @@ class LlavaNextVideoModelOutputWithPast(LlavaNextModelOutputWithPast):
video_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class LlavaNextVideoCausalLMOutputWithPast(LlavaNextCausalLMOutputWithPast):
"""
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
video_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

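Note on the subclass case used in the modular files above: when an output class derives from a parent that already carries `@auto_docstring`, the hunks do not add the decorator again; the child keeps a plain `r"""` block that re-documents only the fields it adds or overrides. A minimal sketch follows; the class and field names are illustrative, not from the diff, and it assumes `auto_docstring` and `ModelOutput` are importable from `transformers.utils`.

# Hypothetical example, not part of this diff.
from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(custom_intro="Base outputs of a hypothetical image model, with hidden states and attentions.")
class MyImageModelOutput(ModelOutput):
    r"""
    image_hidden_states (`torch.FloatTensor`, *optional*):
        Hidden states produced by the vision encoder, after projecting the last hidden state.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class MyVideoModelOutput(MyImageModelOutput):
    # No decorator here: the parent's documentation is inherited, so only the
    # extra field is described in the plain raw-string docstring.
    r"""
    video_hidden_states (`torch.FloatTensor`, *optional*):
        Hidden states produced by the vision encoder for the video frames, after projection.
    """

    video_hidden_states: Optional[torch.FloatTensor] = None
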
@ -49,34 +49,22 @@ logger = logging.get_logger(__name__)


@dataclass
class LlavaOnevisionModelOutputWithPast(BaseModelOutputWithPast):
"""
@auto_docstring(
custom_intro="""
Base class for Llava outputs, with hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class LlavaOnevisionModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

video_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@ -88,11 +76,13 @@ class LlavaOnevisionModelOutputWithPast(BaseModelOutputWithPast):


@dataclass
class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for LlavaOnevision causal language model (or autoregressive) outputs.

Args:
"""
)
class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -103,21 +93,9 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

video_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

@ -35,18 +35,13 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LongformerBaseModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Longformer's outputs, with potential hidden states, local and global attentions.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
"""
|
||||
)
|
||||
class LongformerBaseModelOutput(ModelOutput):
|
||||
r"""
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
|
||||
attention_window + 1)`, where `x` is the number of tokens with global attention mask.
|
||||
@ -78,22 +73,17 @@ class LongformerBaseModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class LongformerBaseModelOutputWithPooling(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Longformer's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class LongformerBaseModelOutputWithPooling(ModelOutput):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
|
||||
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
|
||||
prediction (classification) objective during pretraining.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
|
||||
attention_window + 1)`, where `x` is the number of tokens with global attention mask.
|
||||
@ -126,20 +116,17 @@ class LongformerBaseModelOutputWithPooling(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class LongformerMaskedLMOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for masked language models outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LongformerMaskedLMOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Masked language modeling (MLM) loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
|
||||
attention_window + 1)`, where `x` is the number of tokens with global attention mask.
|
||||
@ -172,22 +159,15 @@ class LongformerMaskedLMOutput(ModelOutput):
@dataclass
|
||||
class LongformerQuestionAnsweringModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of question answering Longformer models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LongformerQuestionAnsweringModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
||||
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Span-start scores (before SoftMax).
|
||||
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Span-end scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
|
||||
attention_window + 1)`, where `x` is the number of tokens with global attention mask.
|
||||
@ -221,20 +201,17 @@ class LongformerQuestionAnsweringModelOutput(ModelOutput):
@dataclass
|
||||
class LongformerSequenceClassifierOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of sentence classification models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LongformerSequenceClassifierOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
|
||||
attention_window + 1)`, where `x` is the number of tokens with global attention mask.
|
||||
@ -267,22 +244,19 @@ class LongformerSequenceClassifierOutput(ModelOutput):
@dataclass
|
||||
class LongformerMultipleChoiceModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of multiple choice Longformer models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LongformerMultipleChoiceModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
|
||||
Classification loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
|
||||
*num_choices* is the second dimension of the input tensors. (see *input_ids* above).
|
||||
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
|
||||
attention_window + 1)`, where `x` is the number of tokens with global attention mask.
|
||||
@ -315,20 +289,17 @@ class LongformerMultipleChoiceModelOutput(ModelOutput):
@dataclass
|
||||
class LongformerTokenClassifierOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of token classification models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LongformerTokenClassifierOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
|
||||
attention_window + 1)`, where `x` is the number of tokens with global attention mask.
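A minimal sketch of the shape these hunks converge on, using a hypothetical `ExampleModelOutput` (not a class in this patch) and assuming the usual relative imports of a modeling file: the shared intro moves into `@auto_docstring(custom_intro=...)`, and only model-specific entries stay in the class docstring.

from dataclasses import dataclass
from typing import Optional

import torch

from ...utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for this model's outputs.
    """
)
class ExampleModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Task-specific loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification scores (before SoftMax).
    """

    # Standard fields such as `hidden_states` and `attentions` no longer need hand-written
    # entries here; `auto_docstring` fills them in from the shared definitions.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None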
@ -36,61 +36,22 @@ logger = logging.get_logger(__name__)
@dataclass
|
||||
class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the LUKE model.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
|
||||
Sequence of entity hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
|
||||
Linear layer and a Tanh activation function.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length +
|
||||
entity_length, sequence_length + entity_length)`. Attentions weights after the attention softmax, used to
|
||||
compute the weighted average in the self-attention heads.
|
||||
"""
|
||||
|
||||
entity_last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
entity_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
@dataclass
|
||||
class BaseLukeModelOutput(BaseModelOutput):
|
||||
"""
|
||||
Base class for model's outputs, with potential hidden states and attentions.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
|
||||
Sequence of entity hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
entity_last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -98,11 +59,33 @@ class BaseLukeModelOutput(BaseModelOutput):
@dataclass
|
||||
class LukeMaskedLMOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for model's outputs, with potential hidden states and attentions.
|
||||
"""
|
||||
)
|
||||
class BaseLukeModelOutput(BaseModelOutput):
|
||||
r"""
|
||||
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
|
||||
Sequence of entity hidden-states at the output of the last layer of the model.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
"""
|
||||
|
||||
Args:
|
||||
entity_last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
entity_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
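The same idea carries over when an output subclasses an existing base, as `BaseLukeModelOutput` does above; a sketch on a hypothetical `ExampleEntityOutput` (only the extra entity fields are declared and documented, everything inherited comes from the base class and the decorator):

from dataclasses import dataclass
from typing import Optional

import torch

from ...modeling_outputs import BaseModelOutput
from ...utils import auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for model's outputs, with potential hidden states and attentions.
    """
)
class ExampleEntityOutput(BaseModelOutput):
    r"""
    entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
        Sequence of entity hidden-states at the output of the last layer of the model.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`.
    """

    # `last_hidden_state`, `hidden_states` and `attentions` are inherited from `BaseModelOutput`.
    entity_last_hidden_state: Optional[torch.FloatTensor] = None
    entity_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None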
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for model's outputs, with potential hidden states and attentions.
|
||||
"""
|
||||
)
|
||||
class LukeMaskedLMOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
The sum of masked language modeling (MLM) loss and entity prediction loss.
|
||||
mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
@ -113,21 +96,10 @@ class LukeMaskedLMOutput(ModelOutput):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -141,27 +113,21 @@ class LukeMaskedLMOutput(ModelOutput):
@dataclass
|
||||
class EntityClassificationOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Outputs of entity classification models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class EntityClassificationOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -172,27 +138,21 @@ class EntityClassificationOutput(ModelOutput):
@dataclass
|
||||
class EntityPairClassificationOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Outputs of entity pair classification models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class EntityPairClassificationOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -203,27 +163,21 @@ class EntityPairClassificationOutput(ModelOutput):
@dataclass
|
||||
class EntitySpanClassificationOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Outputs of entity span classification models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class EntitySpanClassificationOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -234,30 +188,21 @@ class EntitySpanClassificationOutput(ModelOutput):
@dataclass
|
||||
class LukeSequenceClassifierOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Outputs of sentence classification models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LukeSequenceClassifierOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -268,30 +213,21 @@ class LukeSequenceClassifierOutput(ModelOutput):
@dataclass
|
||||
class LukeTokenClassifierOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of token classification models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LukeTokenClassifierOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -302,32 +238,19 @@ class LukeTokenClassifierOutput(ModelOutput):
@dataclass
|
||||
class LukeQuestionAnsweringModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Outputs of question answering models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LukeQuestionAnsweringModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
||||
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Span-start scores (before SoftMax).
|
||||
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Span-end scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -339,32 +262,23 @@ class LukeQuestionAnsweringModelOutput(ModelOutput):
@dataclass
|
||||
class LukeMultipleChoiceModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Outputs of multiple choice models.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LukeMultipleChoiceModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
|
||||
Classification loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
|
||||
*num_choices* is the second dimension of the input tensors. (see *input_ids* above).
|
||||
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
|
||||
layer plus the initial entity embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
@ -42,14 +42,15 @@ class GeLU(nn.Module):
@dataclass
|
||||
class LxmertModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
visual, and cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
encoder)
|
||||
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LxmertModelOutput(ModelOutput):
|
||||
r"""
|
||||
language_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the language encoder.
|
||||
vision_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
@ -88,11 +89,13 @@ class LxmertModelOutput(ModelOutput):
@dataclass
|
||||
class LxmertForQuestionAnsweringOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`LxmertForQuestionAnswering`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LxmertForQuestionAnsweringOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
|
||||
@ -128,11 +131,13 @@ class LxmertForQuestionAnsweringOutput(ModelOutput):
@dataclass
|
||||
class LxmertForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`LxmertForPreTraining`].
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class LxmertForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
@ -161,7 +166,6 @@ class LxmertForPreTrainingOutput(ModelOutput):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
@ -429,23 +429,18 @@ class MambaPreTrainedModel(PreTrainedModel):
@dataclass
|
||||
class MambaOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for the MAMBA model outputs.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class MambaOutput(ModelOutput):
|
||||
r"""
|
||||
cache_params (`MambaCache`):
|
||||
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
|
||||
avoid providing the old `input_ids`.
|
||||
|
||||
Includes both the State space model state matrices after the selective scan, and the Convolutional states
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -454,11 +449,13 @@ class MambaOutput(ModelOutput):
@dataclass
|
||||
class MambaCausalLMOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class MambaCausalLMOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -468,11 +465,6 @@ class MambaCausalLMOutput(ModelOutput):
|
||||
avoid providing the old `input_ids`.
|
||||
|
||||
Includes both the State space model state matrices after the selective scan, and the Convolutional states
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
@ -763,24 +763,19 @@ class Mamba2PreTrainedModel(PreTrainedModel):
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for the MAMBA2 model outputs.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.mamba.modeling_mamba.MambaOutput with MAMBA->MAMBA2,Mamba->Mamba2
|
||||
class Mamba2Output(ModelOutput):
|
||||
"""
|
||||
Class for the MAMBA2 model outputs.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
r"""
|
||||
cache_params (`Mamba2Cache`):
|
||||
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
|
||||
avoid providing the old `input_ids`.
|
||||
|
||||
Includes both the State space model state matrices after the selective scan, and the Convolutional states
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -789,12 +784,14 @@ class Mamba2Output(ModelOutput):
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for causal language model (or autoregressive) outputs.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.mamba.modeling_mamba.MambaCausalLMOutput with Mamba->Mamba2
|
||||
class Mamba2CausalLMOutput(ModelOutput):
|
||||
"""
|
||||
Base class for causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
@ -804,11 +801,6 @@ class Mamba2CausalLMOutput(ModelOutput):
|
||||
avoid providing the old `input_ids`.
|
||||
|
||||
Includes both the State space model state matrices after the selective scan, and the Convolutional states
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
@ -44,12 +44,14 @@ logger = logging.get_logger(__name__)
@dataclass
|
||||
class Mask2FormerPixelDecoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Mask2Former's pixel decoder module output, practically a Multi-Scale Deformable Attention based decoder. It returns
|
||||
the mask features and the multiscale features.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class Mask2FormerPixelDecoderOutput(ModelOutput):
|
||||
r"""
|
||||
multi_scale_features (`tuple(torch.FloatTensor)`):
|
||||
Tuple of multi-scale features of scales [1/8, 1/16, 1/32] and shape `(batch_size, num_channels, height,
width)` from the Multi-Scale Deformable Attention based Pixel Decoder.
|
||||
@ -68,15 +70,15 @@ class Mask2FormerPixelDecoderOutput(ModelOutput):
@dataclass
|
||||
class Mask2FormerMaskedAttentionDecoderOutput(BaseModelOutputWithCrossAttentions):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the Transformer decoder. This class adds two attributes to
|
||||
BaseModelOutputWithCrossAttentions for mask predictions logits and a tuple of intermediate decoder activations,
|
||||
i.e. the output of each decoder layer, each of them gone through a layernorm.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
"""
|
||||
)
|
||||
class Mask2FormerMaskedAttentionDecoderOutput(BaseModelOutputWithCrossAttentions):
|
||||
r"""
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
@ -100,16 +102,18 @@ class Mask2FormerMaskedAttentionDecoderOutput(BaseModelOutputWithCrossAttentions
@dataclass
|
||||
class Mask2FormerPixelLevelModuleOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Mask2Former's pixel level module output. It returns the output of the encoder (optional) and all hidden states
|
||||
(multi-scale features) from the `decoder`. By default, the `encoder` is a Swin Backbone and the `decoder` is a
|
||||
Multi-Scale Deformable Attention based decoder.
|
||||
|
||||
The `decoder_last_hidden_state` are the **per-pixel embeddings** while `decoder_hidden_states` refer to multi-scale
|
||||
feature maps produced using **multi-scaling strategy** defined in the paper.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class Mask2FormerPixelLevelModuleOutput(ModelOutput):
|
||||
r"""
|
||||
encoder_last_hidden_state (`torch.FloatTensor`):
|
||||
Last hidden states (final feature map of shape `(batch_size, num_channels, height, width)`) of the last
|
||||
stage of the encoder.
|
||||
@ -131,26 +135,28 @@ class Mask2FormerPixelLevelModuleOutput(ModelOutput):
@dataclass
|
||||
class Mask2FormerModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`Mask2FormerModel`]. This class returns all the needed hidden states to compute the logits.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class Mask2FormerModelOutput(ModelOutput):
|
||||
r"""
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
|
||||
Last hidden states (final feature map) of the last stage of the encoder model (backbone). Returned when
|
||||
`output_hidden_states=True` is passed.
|
||||
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
|
||||
Last hidden states (final feature map) of the last stage of the pixel decoder model.
|
||||
transformer_decoder_last_hidden_state (`tuple(torch.FloatTensor)`):
|
||||
Final output of the transformer decoder `(batch_size, sequence_length, hidden_size)`.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
|
||||
model at the output of each stage. Returned when `output_hidden_states=True` is passed.
|
||||
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
|
||||
Last hidden states (final feature map) of the last stage of the pixel decoder model.
|
||||
pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
|
||||
decoder model at the output of each stage. Returned when `output_hidden_states=True` is passed.
|
||||
transformer_decoder_last_hidden_state (`tuple(torch.FloatTensor)`):
|
||||
Final output of the transformer decoder `(batch_size, sequence_length, hidden_size)`.
|
||||
transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called feature maps) of the
|
||||
@ -177,16 +183,18 @@ class Mask2FormerModelOutput(ModelOutput):
@dataclass
|
||||
class Mask2FormerForUniversalSegmentationOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`Mask2FormerForUniversalSegmentationOutput`].
|
||||
|
||||
This output can be directly passed to [`~Mask2FormerImageProcessor.post_process_semantic_segmentation`] or
|
||||
[`~Mask2FormerImageProcessor.post_process_instance_segmentation`] or
|
||||
[`~Mask2FormerImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please see
[`~Mask2FormerImageProcessor`] for details regarding usage.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class Mask2FormerForUniversalSegmentationOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.Tensor`, *optional*):
|
||||
The computed loss, returned when labels are present.
|
||||
class_queries_logits (`torch.FloatTensor`):
|
||||
@ -199,18 +207,18 @@ class Mask2FormerForUniversalSegmentationOutput(ModelOutput):
|
||||
List of class and mask predictions from each layer of the transformer decoder.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Last hidden states (final feature map) of the last stage of the encoder model (backbone).
|
||||
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Last hidden states (final feature map) of the last stage of the pixel decoder model.
|
||||
transformer_decoder_last_hidden_state (`tuple(torch.FloatTensor)`):
|
||||
Final output of the transformer decoder `(batch_size, sequence_length, hidden_size)`.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
|
||||
model at the output of each stage.
|
||||
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Last hidden states (final feature map) of the last stage of the pixel decoder model.
|
||||
pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
|
||||
decoder model at the output of each stage.
|
||||
transformer_decoder_last_hidden_state (`tuple(torch.FloatTensor)`):
|
||||
Final output of the transformer decoder `(batch_size, sequence_length, hidden_size)`.
|
||||
transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called feature maps) of the
@ -53,24 +53,16 @@ logger = logging.get_logger(__name__)
@dataclass
|
||||
# Copied from transformers.models.detr.modeling_detr.DetrDecoderOutput
|
||||
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
|
||||
namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
|
||||
gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.detr.modeling_detr.DetrDecoderOutput
|
||||
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
|
||||
r"""
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
|
||||
@ -84,24 +76,26 @@ class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
@dataclass
|
||||
class MaskFormerPixelLevelModuleOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
MaskFormer's pixel level module output. It returns both the last and (optionally) the hidden states from the
|
||||
`encoder` and `decoder`. By default, the `encoder` is a MaskFormerSwin Transformer and the `decoder` is a Feature
|
||||
Pyramid Network (FPN).
|
||||
|
||||
The `encoder_last_hidden_state` is referred to in the paper as the **image features**, while `decoder_last_hidden_state`
is referred to as the **pixel embeddings**
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class MaskFormerPixelLevelModuleOutput(ModelOutput):
|
||||
r"""
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Last hidden states (final feature map) of the last stage of the encoder.
decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Last hidden states (final feature map) of the last stage of the decoder.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the model at
|
||||
the output of each stage.
|
||||
decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Last hidden states (final feature map) of the last stage of the decoder.
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
||||
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the model at
|
||||
@ -115,22 +109,16 @@ class MaskFormerPixelLevelModuleOutput(ModelOutput):
@dataclass
|
||||
class MaskFormerPixelDecoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
MaskFormer's pixel decoder module output, practically a Feature Pyramid Network. It returns the last hidden state
|
||||
and (optionally) the hidden states.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class MaskFormerPixelDecoderOutput(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Last hidden states (final feature map) of the last stage of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, num_channels, height, width)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights from Detr's decoder after the attention softmax, used to compute the
|
||||
weighted average in the self-attention heads.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -139,11 +127,13 @@ class MaskFormerPixelDecoderOutput(ModelOutput):
@dataclass
|
||||
class MaskFormerModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`MaskFormerModel`]. This class returns all the needed hidden states to compute the logits.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class MaskFormerModelOutput(ModelOutput):
|
||||
r"""
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Last hidden states (final feature map) of the last stage of the encoder model (backbone).
|
||||
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
@ -165,10 +155,6 @@ class MaskFormerModelOutput(ModelOutput):
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` containing `encoder_hidden_states`, `pixel_decoder_hidden_states` and
|
||||
`decoder_hidden_states`
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights from Detr's decoder after the attention softmax, used to compute the
|
||||
weighted average in the self-attention heads.
|
||||
"""
|
||||
|
||||
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -182,16 +168,18 @@ class MaskFormerModelOutput(ModelOutput):
@dataclass
|
||||
class MaskFormerForInstanceSegmentationOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`MaskFormerForInstanceSegmentation`].
|
||||
|
||||
This output can be directly passed to [`~MaskFormerImageProcessor.post_process_semantic_segmentation`] or
[`~MaskFormerImageProcessor.post_process_instance_segmentation`] or
[`~MaskFormerImageProcessor.post_process_panoptic_segmentation`] depending on the task. Please see
[`~MaskFormerImageProcessor`] for details regarding usage.
|
||||
|
||||
Args:
|
||||
"""
|
||||
)
|
||||
class MaskFormerForInstanceSegmentationOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.Tensor`, *optional*):
|
||||
The computed loss, returned when labels are present.
|
||||
class_queries_logits (`torch.FloatTensor`):
|
||||
@ -200,6 +188,8 @@ class MaskFormerForInstanceSegmentationOutput(ModelOutput):
|
||||
masks_queries_logits (`torch.FloatTensor`):
|
||||
A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
|
||||
query.
|
||||
auxiliary_logits (`Dict[str, torch.FloatTensor]`, *optional*, returned when `output_auxiliary_logits=True`):
|
||||
Dictionary containing auxiliary predictions for each decoder layer when auxiliary losses are enabled.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Last hidden states (final feature map) of the last stage of the encoder model (backbone).
|
||||
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
@ -221,10 +211,6 @@ class MaskFormerForInstanceSegmentationOutput(ModelOutput):
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` containing `encoder_hidden_states`, `pixel_decoder_hidden_states` and
|
||||
`decoder_hidden_states`.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights from Detr's decoder after the attention softmax, used to compute the
|
||||
weighted average in the self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
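The intro above points users at the image-processor post-processing helpers. As a minimal usage sketch (not part of the diff; the checkpoint name and image path are only placeholders), a `MaskFormerForInstanceSegmentationOutput` flows into `post_process_semantic_segmentation` roughly like this:

import torch
from PIL import Image
from transformers import MaskFormerForInstanceSegmentation, MaskFormerImageProcessor

processor = MaskFormerImageProcessor.from_pretrained("facebook/maskformer-swin-base-ade")
model = MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-swin-base-ade")

image = Image.open("scene.jpg")  # placeholder path, any RGB image works
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)  # MaskFormerForInstanceSegmentationOutput

# class_queries_logits: (batch, num_queries, num_labels + 1)
# masks_queries_logits: (batch, num_queries, height, width)
semantic_map = processor.post_process_semantic_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]
print(semantic_map.shape)  # per-pixel label map at the original image size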
@ -30,36 +30,25 @@ from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import torch_int
from ...utils import auto_docstring, torch_int
from ...utils.backbone_utils import BackboneMixin
from .configuration_maskformer_swin import MaskFormerSwinConfig


@dataclass
class MaskFormerSwinModelOutputWithPooling(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Class for MaskFormerSwinModel's outputs that also contains the spatial dimensions of the hidden states.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class MaskFormerSwinModelOutputWithPooling(ModelOutput):
r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state after a mean pooling operation.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*):
A tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to
`batch, channels, height, width`. Due to padding, their spatial size cannot be inferred before the
`forward` method.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -70,28 +59,17 @@ class MaskFormerSwinModelOutputWithPooling(ModelOutput):


@dataclass
class MaskFormerSwinBaseModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Class for SwinEncoder's outputs.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
"""
)
class MaskFormerSwinBaseModelOutput(ModelOutput):
r"""
hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*):
A tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to
`batch, channels, height, width`. Due to padding, their spatial size cannot be inferred before the `forward`
method.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -759,12 +737,8 @@ class MaskFormerSwinEncoder(nn.Module):
)


@auto_docstring
class MaskFormerSwinPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""

config_class = MaskFormerSwinConfig
base_model_prefix = "model"
main_input_name = "pixel_values"
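Because `hidden_states_spatial_dimensions` carries the per-stage `(height, width)` pairs described above, restoring the channel-first layout is a single reshape. A small illustration with made-up shapes (not taken from the diff):

import torch

batch_size, hidden_size = 2, 96
height, width = 56, 56  # one entry of hidden_states_spatial_dimensions
flat = torch.randn(batch_size, height * width, hidden_size)  # (batch, sequence_length, hidden_size)

spatial = flat.transpose(1, 2).reshape(batch_size, hidden_size, height, width)
print(spatial.shape)  # torch.Size([2, 96, 56, 56])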
@ -700,12 +700,14 @@ class MegatronBertPreTrainedModel(PreTrainedModel):


@dataclass
@auto_docstring(
custom_intro="""
Output type of [`MegatronBertForPreTraining`].
"""
)
# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->MegatronBert
class MegatronBertForPreTrainingOutput(ModelOutput):
"""
Output type of [`MegatronBertForPreTraining`].

Args:
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
@ -714,17 +716,6 @@ class MegatronBertForPreTrainingOutput(ModelOutput):
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
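The hunks above all follow the same recipe: the class-level intro moves into `@auto_docstring(custom_intro=...)`, the boilerplate `hidden_states`/`attentions` entries are dropped so the decorator can fill them in, and only the model-specific fields keep hand-written entries in the `r"""` docstring. A hedged sketch of the resulting pattern for a hypothetical output class (class and field names are illustrative, not from this commit):

from dataclasses import dataclass
from typing import Optional

import torch
from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`MyModelForPreTraining`].
    """
)
class MyModelForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    """

    # standard fields such as hidden_states/attentions are documented by the decorator
    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None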
@ -69,11 +69,13 @@ class MgpstrDropPath(nn.Module):


@dataclass
class MgpstrModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

Args:
"""
)
class MgpstrModelOutput(ModelOutput):
r"""
logits (`tuple(torch.FloatTensor)` of shape `(batch_size, config.num_character_labels)`):
Tuple of `torch.FloatTensor` (one for the output of character of shape `(batch_size,
config.max_token_length, config.num_character_labels)`, + one for the output of bpe of shape `(batch_size,
@ -81,17 +83,6 @@ class MgpstrModelOutput(ModelOutput):
config.max_token_length, config.num_wordpiece_labels)`).

Classification scores (before SoftMax) of character, bpe and wordpiece.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, config.max_token_length,
sequence_length, sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
a3_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_a3_attentions=True` is passed or when `config.output_a3_attentions=True`):
Tuple of `torch.FloatTensor` (one for the attention of character, + one for the attention of bpe, + one
for the attention of wordpiece) of shape `(batch_size, config.max_token_length, sequence_length)`.
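Since `MgpstrModelOutput.logits` is the (character, bpe, wordpiece) triple described above, the processor's `batch_decode` turns it into text. A hedged usage sketch (checkpoint and image path are illustrative, not from the diff):

from PIL import Image
from transformers import MgpstrForSceneTextRecognition, MgpstrProcessor

processor = MgpstrProcessor.from_pretrained("alibaba-damo/mgp-str-base")
model = MgpstrForSceneTextRecognition.from_pretrained("alibaba-damo/mgp-str-base")

image = Image.open("word_crop.png").convert("RGB")  # placeholder path to a cropped text image
inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)  # MgpstrModelOutput

decoded = processor.batch_decode(outputs.logits)
print(decoded["generated_text"])  # best string per image, fused from the three recognition heads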
@ -47,12 +47,12 @@ logger = logging.get_logger(__name__)


@dataclass
@auto_docstring
class MimiOutput(ModelOutput):
"""
Args:
r"""
audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
Discrete code embeddings computed using `model.encode`.
audio_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*)
audio_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Decoded audio values, obtained using the decoder part of Mimi.
encoder_past_key_values (`Cache`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
@ -79,9 +79,9 @@ class MimiOutput(ModelOutput):


@dataclass
@auto_docstring
class MimiEncoderOutput(ModelOutput):
"""
Args:
r"""
audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
Discrete code embeddings computed using `model.encode`.
encoder_past_key_values (`Cache`, *optional*):
@ -99,9 +99,9 @@ class MimiEncoderOutput(ModelOutput):


@dataclass
@auto_docstring
class MimiDecoderOutput(ModelOutput):
"""
Args:
r"""
audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
Decoded audio values, obtained using the decoder part of Mimi.
decoder_past_key_values (`Cache`, *optional*):
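The three Mimi outputs above correspond to the model's encode/decode halves. A hedged sketch of the round trip (the checkpoint name and the silent one-second input are placeholders):

import torch
from transformers import AutoFeatureExtractor, MimiModel

feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")
model = MimiModel.from_pretrained("kyutai/mimi")

audio = torch.zeros(24000)  # one second of silence at Mimi's 24 kHz sampling rate, as a stand-in
inputs = feature_extractor(raw_audio=audio.numpy(), sampling_rate=24000, return_tensors="pt")

encoder_out = model.encode(inputs["input_values"])  # MimiEncoderOutput
codes = encoder_out.audio_codes                     # (batch, num_quantizers, codes_length)
decoder_out = model.decode(codes)                   # MimiDecoderOutput
waveform = decoder_out.audio_values                 # decoded audio values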
@ -1209,17 +1209,6 @@ class MiniMaxForQuestionAnswering(MiniMaxPreTrainedModel):
output_hidden_states: Optional[bool] = None,
**kwargs,
) -> QuestionAnsweringModelOutput:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""

outputs: BaseModelOutputWithPast = self.model(
input_ids,
attention_mask=attention_mask,
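The `start_positions`/`end_positions` entries removed here (the same block recurs for Mistral and Mixtral below) describe the clamping behaviour of the question-answering head. A generic illustration of how those labels are consumed, with made-up shapes and values:

import torch
import torch.nn.functional as F

batch_size, seq_len = 2, 16
start_logits = torch.randn(batch_size, seq_len)
end_logits = torch.randn(batch_size, seq_len)
start_positions = torch.tensor([3, 40])  # 40 lies outside the sequence
end_positions = torch.tensor([5, 41])

# positions outside the sequence are clamped to seq_len and then ignored by the loss
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)

start_loss = F.cross_entropy(start_logits, start_positions, ignore_index=ignored_index)
end_loss = F.cross_entropy(end_logits, end_positions, ignore_index=ignored_index)
total_loss = (start_loss + end_loss) / 2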
@ -760,17 +760,6 @@ class MistralForQuestionAnswering(MistralPreTrainedModel):
output_hidden_states: Optional[bool] = None,
**kwargs,
) -> QuestionAnsweringModelOutput:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""

outputs: BaseModelOutputWithPast = self.model(
input_ids,
attention_mask=attention_mask,
@ -29,8 +29,6 @@ from .configuration_mistral import MistralConfig

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1"


class MistralMLP(LlamaMLP):
def __init__(self, config):
@ -247,17 +245,6 @@ class MistralForQuestionAnswering(LlamaForQuestionAnswering):
output_hidden_states: Optional[bool] = None,
**kwargs,
) -> QuestionAnsweringModelOutput:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""

outputs: BaseModelOutputWithPast = self.model(
input_ids,
attention_mask=attention_mask,
@ -123,11 +123,13 @@ class Mistral3MultiModalProjector(nn.Module):


@dataclass
class Mistral3CausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Mistral3 causal language model (or autoregressive) outputs.

Args:
"""
)
class Mistral3CausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -138,17 +140,6 @@ class Mistral3CausalLMOutputWithPast(ModelOutput):

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@ -163,30 +154,19 @@ class Mistral3CausalLMOutputWithPast(ModelOutput):


@dataclass
class Mistral3ModelOutputWithPast(BaseModelOutputWithPast):
"""
@auto_docstring(
custom_intro="""
Base class for Mistral3 outputs, with hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
"""
)
class Mistral3ModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
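All of these output classes share the `ModelOutput` behaviour: fields can be read by attribute or by key, and `to_tuple()` drops entries that are `None`. A small generic illustration (not specific to Mistral3):

import torch
from transformers.modeling_outputs import BaseModelOutputWithPast

out = BaseModelOutputWithPast(last_hidden_state=torch.zeros(1, 4, 8))
print(out.last_hidden_state.shape)     # attribute access
print(out["last_hidden_state"].shape)  # key access
print(len(out.to_tuple()))             # 1 -- unset fields (past_key_values, ...) are skipped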
@ -992,17 +992,6 @@ class MixtralForQuestionAnswering(MixtralPreTrainedModel):
output_hidden_states: Optional[bool] = None,
**kwargs,
) -> QuestionAnsweringModelOutput:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""

outputs: BaseModelOutputWithPast = self.model(
input_ids,
attention_mask=attention_mask,
@ -1531,10 +1531,6 @@ class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin):
For each text token (in seq_length):
- 1 indicates the token **should attend** to the corresponding image tile
- 0 indicates the token **should not attend** to the corresponding image tile
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
full_text_row_masked_out_mask (`tuple[torch.Tensor, torch.Tensor]`, *optional*):
A tuple containing two tensors that mask out rows in the cross-attention mechanism:
- The first tensor has shape `(batch_size, 1, seq_length, 1)` and contains values of 0 or 1.
@ -1544,6 +1540,10 @@ class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin):
the forward pass of cross-attention layers.
This mask is derived from the cross_attention_mask and is used to handle cases where a text token
should not attend to any image token.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

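The relationship this hunk describes, where `full_text_row_masked_out_mask` zeroes out text rows whose `cross_attention_mask` selects no image tile at all, can be seen in a toy, simplified form (real Mllama masks carry extra image and tile dimensions; this is only an illustration, not the docstring's own example):

import torch

# (batch, seq_len, num_tiles) -- simplified; 1 = token may attend to that tile
cross_attention_mask = torch.tensor([[[1, 1],
                                       [0, 0],   # this token attends to no tile
                                       [1, 0],
                                       [0, 1]]], dtype=torch.float)

row_has_tile = (cross_attention_mask.sum(dim=-1) > 0).float()   # (batch, seq_len)
full_text_row_masked_out_mask = row_has_tile[:, None, :, None]  # (batch, 1, seq_len, 1)
print(full_text_row_masked_out_mask.squeeze())  # tensor([1., 0., 1., 1.])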
@ -678,11 +678,13 @@ class MobileBertPreTrainedModel(PreTrainedModel):


@dataclass
class MobileBertForPreTrainingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`MobileBertForPreTraining`].

Args:
"""
)
class MobileBertForPreTrainingOutput(ModelOutput):
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
@ -691,17 +693,6 @@ class MobileBertForPreTrainingOutput(ModelOutput):
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@ -50,11 +50,13 @@ logger = logging.get_logger(__name__)


@dataclass
class MoshiConditionalGenerationGenerateOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Outputs of [`MoshiForConditionalGeneration.generate`].

Args:
"""
)
class MoshiConditionalGenerationGenerateOutput(ModelOutput):
r"""
audio_sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, 1, sequence_length)`, *optional*):
The generated audio waveforms.
sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
@ -81,7 +83,7 @@ class MoshiConditionalGenerationGenerateOutput(ModelOutput):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
`torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True`):
Returns the model cache, used to speed up decoding. Different models have a different cache format, check
Contains the model cache, used to speed up decoding. Different models have a different cache format, check
the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
audio_codes (`torch.LongTensor` of shape `(batch_size*num_return_sequences, num_codebooks, sequence_length)`, *optional*):
The generated audio codes. Returned if `return_audio_codes=True`. Intermediate audio "tokens" which transform to `audio_sequences` once passed through the audio decoder.
@ -100,34 +102,23 @@ class MoshiConditionalGenerationGenerateOutput(ModelOutput):


@dataclass
class MoshiCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
`MoshiForCausalLM` outputs.

Args:
"""
)
class MoshiCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""

loss: Optional[torch.FloatTensor] = None
@ -139,34 +130,23 @@ class MoshiCausalLMOutputWithPast(ModelOutput):


@dataclass
class MoshiConditionalGenerationOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
`MoshiForConditionalGeneration` outputs.

Args:
"""
)
class MoshiConditionalGenerationOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `text_labels` is provided):
Text language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the text language modeling head (scores for each vocabulary token before SoftMax).
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
depth_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `audio_labels` is provided):
Audio language modeling loss (for next-token prediction).
audio_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
@ -194,9 +174,9 @@ class MoshiConditionalGenerationOutputWithPast(ModelOutput):


@dataclass
@auto_docstring
class MoshiUnconditionalInput(ModelOutput):
"""
Args:
r"""
input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
The sequence used as a text prompt for the generation.
user_audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, sequence_length)`, *optional*):
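`MoshiUnconditionalInput` above is the helper container of prompt-free inputs that the model can build for you. A hedged sketch (checkpoint name illustrative, API as described in the Moshi documentation) of how it feeds `generate`, which in turn returns the `MoshiConditionalGenerationGenerateOutput` documented earlier:

from transformers import MoshiForConditionalGeneration

model = MoshiForConditionalGeneration.from_pretrained("kyutai/moshiko-pytorch-bf16")
unconditional_inputs = model.get_unconditional_inputs(num_samples=1)  # MoshiUnconditionalInput

out = model.generate(**unconditional_inputs, max_new_tokens=64)
print(out.sequences.shape)        # generated text tokens
print(out.audio_sequences.shape)  # generated waveform(s)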