Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-01 20:00:09 +06:00
Add support for auto_docstring with model outputs (#38242)
* experiment auto_docstring model outputs
* Fix PatchTSMixer
* Add check model output docstring to check_auto_docstring and fix all model outputs docstring
* add reordering of docstring in check_docstrings
* add check for redundant docstring in check_docstrings, remove redundant docstrings
* refactor check_auto_docstring
* make style
* fix copies
* remove commented code
* change List -> list, Tuple -> tuple in docstrings
* fix modular
* make style
* Fix modular vipllava

Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co>
This commit is contained in: parent 0c98f24889, commit b6b4d43d6d
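For orientation, the hunks below all apply one pattern: the boilerplate of each model output class docstring moves into the `@auto_docstring` decorator, and only the model-specific entries stay in a raw docstring. A minimal sketch of that pattern follows (the class and fields are hypothetical; `auto_docstring` and `ModelOutput` are the real helpers importable from `transformers.utils`, but the exact set of auto-filled descriptions depends on the library version):

from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `MyModelForPreTraining`.
    """
)
class MyModelForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total pre-training loss.
    """

    # Only the model-specific entry (`loss`) is documented by hand above.
    # Standard entries such as `hidden_states` and `attentions` are filled in
    # by `auto_docstring` from its shared argument registry, which is what lets
    # the per-model `Args:` boilerplate in the hunks below be deleted.
    loss: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None

This is also why the diffs can delete the `hidden_states`/`attentions` paragraphs wholesale: those descriptions now come from one shared source instead of being repeated in every modeling file.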
@@ -570,30 +570,21 @@ class AlbertPreTrainedModel(PreTrainedModel):
 @dataclass
-class AlbertForPreTrainingOutput(ModelOutput):
-    """
+@auto_docstring(
+    custom_intro="""
     Output type of [`AlbertForPreTraining`].
-
-    Args:
-        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
-            Total loss as the sum of the masked language modeling loss and the next sequence prediction
-            (classification) loss.
-        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
-            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
-            before SoftMax).
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
     """
+)
+class AlbertForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+        Total loss as the sum of the masked language modeling loss and the next sequence prediction
+        (classification) loss.
+    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
+        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
+        before SoftMax).
+    """
 
     loss: Optional[torch.FloatTensor] = None
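Assuming this Albert hunk is applied as shown, the deleted `Args:` text is not lost at runtime: the decorator composes the class `__doc__` from the `custom_intro`, the hand-written entries, and the shared registry. A quick, version-dependent way to check:

from transformers.models.albert.modeling_albert import AlbertForPreTrainingOutput

# The intro plus `loss`, `prediction_logits`, `sop_logits` (hand-written) and
# `hidden_states`, `attentions` (auto-filled) should all appear in the
# composed docstring.
print(AlbertForPreTrainingOutput.__doc__)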
@@ -40,20 +40,15 @@ logger = logging.get_logger(__name__)
 
 
 @dataclass
-class AlignVisionModelOutput(ModelOutput):
-    """
+@auto_docstring(
+    custom_intro="""
     Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
-
-    Args:
-        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
-            The image embeddings obtained by applying the projection layer to the pooler_output.
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
     """
+)
+class AlignVisionModelOutput(ModelOutput):
+    r"""
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The image embeddings obtained by applying the projection layer to the pooler_output.
+    """
 
     image_embeds: Optional[torch.FloatTensor] = None
@@ -62,26 +57,15 @@ class AlignVisionModelOutput(ModelOutput):
 
 
 @dataclass
-class AlignTextModelOutput(ModelOutput):
-    """
+@auto_docstring(
+    custom_intro="""
     Base class for text model's outputs that also contains a pooling of the last hidden states.
-
-    Args:
-        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
-            The text embeddings obtained by applying the projection layer to the pooler_output.
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
     """
+)
+class AlignTextModelOutput(ModelOutput):
+    r"""
+    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The text embeddings obtained by applying the projection layer to the pooler_output.
+    """
 
     text_embeds: Optional[torch.FloatTensor] = None
@@ -91,25 +75,25 @@ class AlignTextModelOutput(ModelOutput):
 
 
 @dataclass
+@auto_docstring
 class AlignOutput(ModelOutput):
-    """
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
-            Contrastive loss for image-text similarity.
-        logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
-            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
-            similarity scores.
-        logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
-            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
-            similarity scores.
-        text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
-            The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
-        image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
-            The output of [`AlignVisionModel`].
-        text_model_output(`BaseModelOutputWithPoolingAndCrossAttentions`):
-            The output of the [`AlignTextModel`].
-        vision_model_output(`BaseModelOutputWithPoolingAndNoAttention`):
-            The output of the [`AlignVisionModel`].
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+        Contrastive loss for image-text similarity.
+    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
+        similarity scores.
+    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
+        similarity scores.
+    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
+        The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
+        The output of [`AlignVisionModel`].
+    text_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`):
+        The output of the [`AlignTextModel`].
+    vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
+        The output of the [`AlignVisionModel`].
     """
 
     loss: Optional[torch.FloatTensor] = None
@@ -53,26 +53,26 @@ def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
 
 
 @dataclass
+@auto_docstring
 # Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->AltCLIP
 class AltCLIPOutput(ModelOutput):
-    """
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
-            Contrastive loss for image-text similarity.
-        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
-            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
-            similarity scores.
-        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
-            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
-            similarity scores.
-        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
-            The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
-        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
-            The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
-        text_model_output (`BaseModelOutputWithPooling`):
-            The output of the [`AltCLIPTextModel`].
-        vision_model_output (`BaseModelOutputWithPooling`):
-            The output of the [`AltCLIPVisionModel`].
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+        Contrastive loss for image-text similarity.
+    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
+        similarity scores.
+    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
+        similarity scores.
+    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
+        The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
+        The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
+    text_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`AltCLIPTextModel`].
+    vision_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`AltCLIPVisionModel`].
     """
 
     loss: Optional[torch.FloatTensor] = None
@@ -963,35 +963,26 @@ class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin):
 
 
 @dataclass
-class AriaCausalLMOutputWithPast(ModelOutput):
-    """
+@auto_docstring(
+    custom_intro="""
     Base class for Aria causal language model (or autoregressive) outputs.
-
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Language modeling loss (for next-token prediction).
-        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        image_hidden_states (`torch.FloatTensor`, *optional*):
-            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
-            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
     """
+)
+class AriaCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
 
     loss: Optional[torch.FloatTensor] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class AriaModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Aria outputs, with hidden states and attentions.
|
||||
"""
|
||||
)
|
||||
class AriaModelOutputWithPast(BaseModelOutputWithPast):
|
||||
r"""
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||
|
@@ -46,44 +46,35 @@ logger = logging.get_logger(__name__)
 
 
 @dataclass
-class AutoFormerDecoderOutput(ModelOutput):
-    """
+@auto_docstring(
+    custom_intro="""
     Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
-
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-
-            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
-            hidden_size)` is output.
-        trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Trend tensor for each time series.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
-            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
-            encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
-            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
-            input) to speed up sequential decoding.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
     """
+)
+class AutoFormerDecoderOutput(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+
+        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+        hidden_size)` is output.
+    trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Trend tensor for each time series.
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+        encoder_sequence_length, embed_size_per_head)`.
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+        input) to speed up sequential decoding.
+    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`.
+
+        Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+        weighted average in the cross-attention heads.
+    """
 
     last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class AutoformerModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Autoformer model output that contains the additional trend output.
|
||||
"""
|
||||
)
|
||||
class AutoformerModelOutput(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
|
||||
hidden_size)` is output.
|
||||
trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Trend tensor for each time series.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
||||
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
||||
|
||||
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
|
||||
hidden_size)` is output.
|
||||
trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Trend tensor for each time series.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
||||
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
|
||||
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
|
||||
self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
|
||||
weighted average in the cross-attention heads.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
|
||||
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
||||
self-attention heads.
|
||||
loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
|
||||
Shift values of each time series' context window which is used to give the model inputs of the same
|
||||
magnitude and then used to shift back to the original magnitude.
|
||||
scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
|
||||
Scaling values of each time series' context window which is used to give the model inputs of the same
|
||||
magnitude and then used to rescale back to the original magnitude.
|
||||
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
|
||||
Static features of each time series' in a batch which are copied to the covariates at inference time.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
||||
loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
|
||||
Shift values of each time series' context window which is used to give the model inputs of the same
|
||||
magnitude and then used to shift back to the original magnitude.
|
||||
scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
|
||||
Scaling values of each time series' context window which is used to give the model inputs of the same
|
||||
magnitude and then used to rescale back to the original magnitude.
|
||||
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
|
||||
Static features of each time series' in a batch which are copied to the covariates at inference time.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -1795,6 +1758,14 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
|
||||
Transformer requires to provide additional features.
|
||||
|
||||
The Autoformer only learns additional embeddings for `static_categorical_features`.
|
||||
future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
|
||||
Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
|
||||
in `[0, 1]`:
|
||||
|
||||
- 1 for values that are **observed**,
|
||||
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
|
||||
|
||||
This mask is used to filter out missing values for the final loss calculation.
|
||||
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
|
||||
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
|
||||
|
||||
@ -1804,14 +1775,6 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
|
||||
Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
|
||||
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
|
||||
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
|
||||
future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
|
||||
Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
|
||||
in `[0, 1]`:
|
||||
|
||||
- 1 for values that are **observed**,
|
||||
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
|
||||
|
||||
This mask is used to filter out missing values for the final loss calculation.
|
||||
|
||||
Examples:
|
||||
|
||||
|
@ -117,35 +117,26 @@ class AyaVisionPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AyaVisionCausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for AyaVision causal language model (or autoregressive) outputs.
|
||||
"""
|
||||
)
|
||||
class AyaVisionCausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -157,33 +148,22 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for AyaVision outputs, with hidden states and attentions.
|
||||
"""
|
||||
)
|
||||
class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
|
||||
r"""
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||
|
@ -44,39 +44,19 @@ from .configuration_beit import BeitConfig
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
# General docstring
|
||||
|
||||
# Base docstring
|
||||
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
|
||||
|
||||
# Image classification docstring
|
||||
_IMAGE_CLASS_CHECKPOINT = "microsoft/beit-base-patch16-224"
|
||||
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
|
||||
|
||||
|
||||
@dataclass
|
||||
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`BeitModel`].
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
|
||||
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
|
||||
will be returned.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
|
||||
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
|
||||
will be returned.
|
||||
"""
|
||||
|
||||
|
||||
|
@ -805,30 +805,21 @@ class BertPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BertForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`BertForPreTraining`].
|
||||
|
||||
Args:
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BertForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -1744,30 +1744,21 @@ class BigBirdPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BigBirdForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`BigBirdForPreTraining`].
|
||||
|
||||
Args:
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BigBirdForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -1778,30 +1769,17 @@ class BigBirdForPreTrainingOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BigBirdForQuestionAnsweringModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of question answering models.
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
||||
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Span-start scores (before SoftMax).
|
||||
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Span-end scores (before SoftMax).
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`):
|
||||
pooler output from BigBigModel
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BigBirdForQuestionAnsweringModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`):
|
||||
pooler output from BigBigModel
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -49,31 +49,31 @@ def blip_loss(similarity: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
|
||||
@dataclass
|
||||
class BlipForConditionalGenerationModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
|
||||
last hidden states. This class also adds the loss term from the text decoder.
|
||||
"""
|
||||
)
|
||||
class BlipForConditionalGenerationModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Language modeling loss from the text decoder.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
|
||||
Prediction scores of the language modeling head of the text decoder model.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
|
||||
The image embeddings obtained after applying the Vision Transformer model to the input image.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Language modeling loss from the text decoder.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
|
||||
Prediction scores of the language modeling head of the text decoder model.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
|
||||
The image embeddings obtained after applying the Vision Transformer model to the input image.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[tuple[torch.FloatTensor]] = None
|
||||
@ -94,29 +94,18 @@ class BlipForConditionalGenerationModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BlipTextVisionModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
|
||||
last hidden states. This class also adds the loss term from the text decoder.
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss from the text decoder.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BlipTextVisionModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss from the text decoder.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -127,36 +116,25 @@ class BlipTextVisionModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BlipImageTextMatchingModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
|
||||
last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
|
||||
scores.
|
||||
|
||||
Args:
|
||||
itm_score (`torch.FloatTensor`):
|
||||
The image-text similarity scores.
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss from the text decoder.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
|
||||
Last layer hidden-state of the vision of the vision-only branch of the model.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
question_embeds (`torch.FloatTensor`):
|
||||
The question embeddings obtained by the text projection layer.
|
||||
"""
|
||||
)
|
||||
class BlipImageTextMatchingModelOutput(ModelOutput):
|
||||
r"""
|
||||
itm_score (`torch.FloatTensor`):
|
||||
The image-text similarity scores.
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss from the text decoder.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
|
||||
Last layer hidden-state of the vision of the vision-only branch of the model.
|
||||
question_embeds (`torch.FloatTensor`):
|
||||
The question embeddings obtained by the text projection layer.
|
||||
"""
|
||||
|
||||
itm_score: Optional[torch.FloatTensor] = None
|
||||
@ -170,25 +148,25 @@ class BlipImageTextMatchingModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class BlipOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
|
||||
image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
|
||||
text_model_output(`BaseModelOutputWithPooling`):
|
||||
The output of the [`BlipTextModel`].
|
||||
vision_model_output(`BaseModelOutputWithPooling`):
|
||||
The output of the [`BlipVisionModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`BlipTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`BlipVisionModel`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
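
The pattern above repeats for every model file in this diff: the hand-written class docstring is split into a `custom_intro` passed to `@auto_docstring` plus an `r"""` block that documents only the model-specific fields. A minimal sketch of the resulting pattern, assuming a hypothetical model (only `auto_docstring` and `ModelOutput` below are real `transformers.utils` exports; the class and its fields are invented for illustration):

from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `MyModelForPreTraining`.
    """
)
class MyModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Pretraining loss.
    """

    # Only `loss` is documented by hand; `hidden_states` and `attentions` use
    # the standard descriptions that `auto_docstring` supplies centrally.
    loss: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
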
@ -45,21 +45,23 @@ logger = logging.get_logger(__name__)


@dataclass
class Blip2ForConditionalGenerationModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Class defining the outputs of [`Blip2ForConditionalGeneration`].

    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Language modeling loss from the language model.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head of the language model.
        vision_outputs (`BaseModelOutputWithPooling`):
            Outputs of the vision encoder.
        qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
            Outputs of the Q-Former (Querying Transformer).
        language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
            Outputs of the language model.
    """
)
class Blip2ForConditionalGenerationModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss from the language model.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head of the language model.
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
        Outputs of the language model.
    """

    loss: Optional[tuple[torch.FloatTensor]] = None
@ -78,25 +80,25 @@ class Blip2ForConditionalGenerationModelOutput(ModelOutput):


@dataclass
@auto_docstring
class Blip2ImageTextMatchingModelOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output.
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`Blip2QFormerModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`Blip2VisionModel`].
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output.
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Blip2QFormerModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Blip2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
@ -115,27 +117,16 @@ class Blip2ImageTextMatchingModelOutput(ModelOutput):


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Blip2
class Blip2TextModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
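
Note the ordering convention in the Blip2TextModelOutput block above, reused in the Clap and CLIPSeg sections below: the `# Copied from` marker sits between the closing parenthesis of `@auto_docstring(...)` and the `class` line, so it stays attached to the definition the copy checker matches on. Schematically, with hypothetical names (imports as in the sketch further up):

@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->MyModel
class MyModelTextOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
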
@ -145,27 +136,16 @@ class Blip2TextModelOutput(ModelOutput):


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Blip2
class Blip2VisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: Optional[torch.FloatTensor] = None
@ -45,28 +45,20 @@ _TOKENIZER_FOR_DOC = "RobertaTokenizer"


@dataclass
class BridgeTowerModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`BridgeTowerModel`].

    Args:
        text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
            Sequence of hidden-states at the text output of the last layer of the model.
        image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
            Sequence of hidden-states at the image output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
            Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
            token), respectively, after further processing through layers used for auxiliary pretraining tasks.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class BridgeTowerModelOutput(ModelOutput):
    r"""
    text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
        Sequence of hidden-states at the text output of the last layer of the model.
    image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
        Sequence of hidden-states at the image output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
        Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
        token), respectively, after further processing through layers used for auxiliary pretraining tasks.
    """

    text_features: Optional[torch.FloatTensor] = None
@ -77,28 +69,26 @@ class BridgeTowerModelOutput(ModelOutput):


@dataclass
class BridgeTowerContrastiveOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`BridgeTowerForContrastiveLearning`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`:
            Image-text contrastive loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
    """
)
class BridgeTowerContrastiveOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Image-text contrastive loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    text_embeds (`torch.FloatTensor`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    image_embeds (`torch.FloatTensor`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    cross_embeds (`torch.FloatTensor`, *optional*, returned when model is initialized with `with_projection=True`):
        The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    """

    loss: Optional[torch.FloatTensor] = None
@ -40,28 +40,19 @@ logger = logging.get_logger(__name__)


@dataclass
class BrosSpadeOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of token classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) :
            Classification loss.
        initial_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores for entity initial tokens (before SoftMax).
        subsequent_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, sequence_length+1)`):
            Classification scores for entity sequence tokens (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class BrosSpadeOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    initial_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
        Classification scores for entity initial tokens (before SoftMax).
    subsequent_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, sequence_length+1)`):
        Classification scores for entity sequence tokens (before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
@ -49,32 +49,34 @@ _PRIMES = [31, 43, 59, 61, 73, 97, 103, 113, 137, 149, 157, 173, 181, 193, 211,


@dataclass
class CanineModelOutputWithPooling(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
            shallow Transformer encoder).
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
            Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
            weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
            encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
            config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
            initial input to each Transformer encoder. The hidden states of the shallow encoders have length
            `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
            `config.downsampling_rate`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
            num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
            config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
            attention softmax, used to compute the weighted average in the self-attention heads.
    """
)
class CanineModelOutputWithPooling(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
        shallow Transformer encoder).
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
        Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
        weights are trained from the next sentence prediction (classification) objective during pretraining.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
        encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
        config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
        initial input to each Transformer encoder. The hidden states of the shallow encoders have length
        `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
        `config.downsampling_rate`.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
        num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
        config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
        attention softmax, used to compute the weighted average in the self-attention heads.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
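
Canine is the counter-example to the trimming seen elsewhere in this diff: its `hidden_states` and `attentions` cover both the shallow and the deep encoders, so their model-specific descriptions are kept in the `r"""` block instead of being dropped. The apparent rule, inferred from this diff rather than from any documented contract, is that an explicit entry overrides the standard text `auto_docstring` would otherwise supply. A schematic sketch with hypothetical names (imports as in the first sketch above):

@dataclass
@auto_docstring(
    custom_intro="""
    Output of a hypothetical model whose deep encoder runs on a downsampled sequence.
    """
)
class MyDownsampledModelOutput(ModelOutput):
    r"""
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Hidden states of both the full-resolution and the downsampled stages; the
        standard description would be misleading here, so it is spelled out explicitly.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
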
@ -52,27 +52,27 @@ def chinese_clip_loss(similarity: torch.Tensor) -> torch.Tensor:


@dataclass
@auto_docstring
class ChineseCLIPOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`ChineseCLIPTextModel`].
        image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`ChineseCLIPVisionModel`].
        text_model_output(`BaseModelOutputWithPoolingAndCrossAttentions`):
            The output of the [`ChineseCLIPTextModel`].
        vision_model_output(`BaseModelOutputWithPoolingAndCrossAttentions`):
            The output of the [`ChineseCLIPVisionModel`].
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of
        [`ChineseCLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`ChineseCLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`):
        The output of the [`ChineseCLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`):
        The output of the [`ChineseCLIPVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
@ -122,27 +122,16 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Clap
class ClapTextModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
@ -152,26 +141,15 @@ class ClapTextModelOutput(ModelOutput):


@dataclass
class ClapAudioModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    ClapAudio model output to mimic the output of the original implementation.

    Args:
        audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            The Audio embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """
)
class ClapAudioModelOutput(ModelOutput):
    r"""
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        The Audio embeddings obtained by applying the projection layer to the pooler_output.
    """

    audio_embeds: Optional[torch.FloatTensor] = None
@ -181,26 +159,26 @@ class ClapAudioModelOutput(ModelOutput):


@dataclass
@auto_docstring
# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Clap, vision->audio, Vision->Audio, image->audio
class ClapOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for audio-text similarity.
        logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
            The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
            The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
        audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`ClapTextModel`].
        audio_model_output (`BaseModelOutputWithPooling`):
            The output of the [`ClapAudioModel`].
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for audio-text similarity.
    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapTextModel`].
    audio_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapAudioModel`].
    """

    loss: Optional[torch.FloatTensor] = None
@ -1931,11 +1909,11 @@ class ClapModel(ClapPreTrainedModel):
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:
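
The ClapModel hunk above only moves the `return_loss` entry below `is_longer`; the wording is unchanged. This is the "reordering of docstring in check_docstrings" mentioned in the commit message: documented arguments are expected to follow the order of the forward signature. A toy sketch of that invariant (this is not the repo's actual `utils/check_docstrings.py`, just an illustration):

import inspect
import re


def docstring_args_follow_signature(func) -> bool:
    """Return True if the args documented in ``func.__doc__`` appear in signature order."""
    sig_order = [p for p in inspect.signature(func).parameters if p != "self"]
    doc_args = re.findall(r"^(\w+) \(", inspect.getdoc(func) or "", flags=re.MULTILINE)
    positions = [sig_order.index(a) for a in doc_args if a in sig_order]
    return positions == sorted(positions)


def forward(self, input_features=None, is_longer=None, return_loss=None):
    r"""
    input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Input audio features.
    is_longer (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
        Whether the audio clip is longer than `max_length`.
    return_loss (`bool`, *optional*):
        Whether or not to return the contrastive loss.
    """


assert docstring_args_follow_signature(forward)

Swapping the `is_longer` and `return_loss` entries back to the old order would make the assertion fail.
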
@ -57,26 +57,15 @@ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:


@dataclass
class CLIPVisionModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class CLIPVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: Optional[torch.FloatTensor] = None
@ -86,26 +75,15 @@ class CLIPVisionModelOutput(ModelOutput):


@dataclass
class CLIPTextModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class CLIPTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
@ -115,25 +93,25 @@ class CLIPTextModelOutput(ModelOutput):


@dataclass
@auto_docstring
class CLIPOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPVisionModel`].
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
@ -49,26 +49,26 @@ def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:


@dataclass
@auto_docstring
# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLIPSeg
class CLIPSegOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
@ -87,18 +87,11 @@ class CLIPSegOutput(ModelOutput):


@dataclass
@auto_docstring
class CLIPSegDecoderOutput(ModelOutput):
    """
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Classification scores for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    """

    logits: Optional[torch.FloatTensor] = None
@ -107,14 +100,21 @@ class CLIPSegDecoderOutput(ModelOutput):


@dataclass
@auto_docstring
class CLIPSegImageSegmentationOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        ...
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    """

    loss: Optional[torch.FloatTensor] = None
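
With the literal `...` placeholder of the old docstring replaced by real entries, every field of `CLIPSegImageSegmentationOutput` is now documented. A hedged usage sketch of where those fields surface (the checkpoint is the public CLIPSeg one; the blank test image only keeps the snippet self-contained):

import torch
from PIL import Image

from transformers import AutoProcessor, CLIPSegForImageSegmentation

processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

image = Image.new("RGB", (352, 352))
inputs = processor(text=["a cat"], images=[image], return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Per the `logits` entry documented above: per-pixel classification scores,
# which a caller would typically squash into a segmentation mask.
mask = outputs.logits.sigmoid()
print(mask.shape)
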
@ -1260,15 +1260,15 @@ class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple, CLIPSegOutput]:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
|
||||
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
||||
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
conditional_pixel_values (`torch.FloatTensor`, *optional*):
|
||||
The pixel values of the conditional images.
|
||||
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
|
||||
The conditional embeddings for the query images. If provided, the model will use this instead of computing
|
||||
the embeddings from the conditional_pixel_values.
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
|
||||
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
||||
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
|
||||
Examples:
|
||||
|
||||
|
@ -144,26 +144,20 @@ def _pad_extra_bos_eos_tokens(
|
||||
|
||||
|
||||
@dataclass
|
||||
class ClvpEncoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for CLVP encoder's outputs that contains a pooling of the last hidden states as well as a projection
|
||||
output (a linear layer on top of the pooled output).
|
||||
|
||||
Args:
|
||||
embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
The hidden state of the last layer of the model.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Pooled output of the `last_hidden_state`.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
|
||||
the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
"""
|
||||
)
|
||||
class ClvpEncoderOutput(ModelOutput):
|
||||
r"""
|
||||
embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
The hidden state of the last layer of the model.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Pooled output of the `last_hidden_state`.
|
||||
"""
|
||||
|
||||
embeds: Optional[torch.FloatTensor] = None
|
||||
@ -174,35 +168,35 @@ class ClvpEncoderOutput(ModelOutput):


@dataclass
@auto_docstring
class ClvpOutput(ModelOutput):
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for speech-text similarity.
speech_ids (`torch.LongTensor`, *optional*):
speech_ids (or speech candidates) generated by the `ClvpForCausalLM` model.
logits_per_speech (`torch.FloatTensor` of shape `(speech_batch_size, text_batch_size)`):
The scaled dot product scores between `speech_embeds` and `text_embeds`. This represents the speech-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, speech_batch_size)`):
The scaled dot product scores between `text_embeds` and `speech_embeds`. This represents the text-speech
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of the text encoder
model.
speech_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The speech embeddings obtained by applying the projection layer to the pooled output of the speech encoder
model.
text_model_output (`BaseModelOutputWithPooling`):
The pooled output of the `last_hidden_state` of the text encoder model.
speech_model_output (`BaseModelOutputWithPooling`):
The pooled output of the `last_hidden_state` of the speech encoder model.
decoder_hidden_states (`torch.FloatTensor`, *optional*):
The hidden states of the decoder model.
text_encoder_hidden_states (`torch.FloatTensor`, *optional*):
The hidden states of the text encoder model.
speech_encoder_hidden_states (`torch.FloatTensor`, *optional*):
The hidden states of the speech encoder model.
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for speech-text similarity.
speech_ids (`torch.LongTensor`, *optional*):
speech_ids (or speech candidates) generated by the `ClvpForCausalLM` model.
logits_per_speech (`torch.FloatTensor` of shape `(speech_batch_size, text_batch_size)`):
The scaled dot product scores between `speech_embeds` and `text_embeds`. This represents the speech-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, speech_batch_size)`):
The scaled dot product scores between `text_embeds` and `speech_embeds`. This represents the text-speech
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of the text encoder
model.
speech_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The speech embeddings obtained by applying the projection layer to the pooled output of the speech encoder
model.
text_model_output (`BaseModelOutputWithPooling`):
The pooled output of the `last_hidden_state` of the text encoder model.
speech_model_output (`BaseModelOutputWithPooling`):
The pooled output of the `last_hidden_state` of the speech encoder model.
decoder_hidden_states (`torch.FloatTensor`, *optional*):
The hidden states of the decoder model.
text_encoder_hidden_states (`torch.FloatTensor`, *optional*):
The hidden states of the text encoder model.
speech_encoder_hidden_states (`torch.FloatTensor`, *optional*):
The hidden states of the speech encoder model.
"""

loss: Optional[torch.FloatTensor] = None
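A quick orientation for the fields above: the two logit matrices are transposes of one another, and retrieval probabilities fall out of a softmax. A minimal sketch with stand-in tensors (the shapes mirror the docstring; nothing here comes from the diff itself):

import torch

# Stand-in similarity logits: 2 speech clips scored against 3 candidate texts.
logits_per_speech = torch.randn(2, 3)      # (speech_batch_size, text_batch_size)
logits_per_text = logits_per_speech.T      # (text_batch_size, speech_batch_size)

probs = logits_per_speech.softmax(dim=-1)  # per clip, a distribution over texts
best_text = probs.argmax(dim=-1)           # best-matching text index per clip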
@ -28,6 +28,7 @@ from ...utils import ModelOutput, auto_docstring, can_return_tuple
from .configuration_colpali import ColPaliConfig


@auto_docstring
class ColPaliPreTrainedModel(PreTrainedModel):
config_class = ColPaliConfig
base_model_prefix = "model"
@ -51,35 +52,26 @@ class ColPaliPreTrainedModel(PreTrainedModel):


@dataclass
class ColPaliForRetrievalOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for ColPali embeddings output.
"""
)
class ColPaliForRetrievalOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The embeddings of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The embeddings of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
"""

loss: Optional[torch.FloatTensor] = None
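The ColPali hunk above shows the full pattern this PR applies everywhere: the intro moves into custom_intro, the Args: section shrinks to an r""" block documenting only the non-standard fields, and @auto_docstring fills in the boilerplate entries (hidden_states, attentions, ...). A hedged sketch of a new output class written against this convention; the class and field names are invented, and the top-level import path is an assumption (inside the library, the relative `from ...utils import ...` form seen in the hunk above is used):

from dataclasses import dataclass
from typing import Optional

import torch
from transformers.utils import ModelOutput, auto_docstring  # assumed public re-export


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for a hypothetical embeddings output.
    """
)
class MyRetrievalOutput(ModelOutput):
    r"""
    embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The embeddings of the model.
    """

    embeddings: Optional[torch.FloatTensor] = None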
@ -36,6 +36,7 @@ if is_torch_available():
import torch


@auto_docstring
class ColQwen2PreTrainedModel(PreTrainedModel):
config_class = ColQwen2Config
base_model_prefix = "model"
@ -62,32 +63,23 @@ class ColQwen2PreTrainedModel(PreTrainedModel):


@dataclass
class ColQwen2ForRetrievalOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for ColQwen2 embeddings output.
"""
)
class ColQwen2ForRetrievalOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The embeddings of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The embeddings of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
"""

loss: Optional[torch.FloatTensor] = None
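The embeddings field in these retrieval outputs holds one vector per token, and ColPali-style models compare a query to a document by late interaction (MaxSim). A hedged pure-torch sketch of that scoring with stand-in tensors; treat it as illustration, not the library's scorer:

import torch

queries = torch.randn(2, 16, 128)  # (num_queries, query_tokens, dim)
docs = torch.randn(4, 900, 128)    # (num_docs, doc_tokens, dim)

sim = torch.einsum("qtd,psd->qpts", queries, docs)  # token-to-token similarities
scores = sim.max(dim=-1).values.sum(dim=-1)         # max over doc tokens, sum over query tokens
# scores[q, p] is the late-interaction score of query q against document p.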
@ -231,32 +231,23 @@ class ColQwen2PreTrainedModel(ColPaliPreTrainedModel):


@dataclass
class ColQwen2ForRetrievalOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for ColQwen2 embeddings output.
"""
)
class ColQwen2ForRetrievalOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The embeddings of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The embeddings of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
"""

loss: Optional[torch.FloatTensor] = None
@ -39,33 +39,25 @@ logger = logging.get_logger(__name__)


@dataclass
class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the Conditional DETR decoder. This class adds one attribute to
BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output
of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary
decoding losses.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
Reference points (reference points of each layer of the decoder).
"""
)
class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
r"""
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
Reference points (reference points of each layer of the decoder).
"""

intermediate_hidden_states: Optional[torch.FloatTensor] = None
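The attribute this class adds is easiest to read off a stand-in tensor; note that the layer axis comes first here, unlike the batch-first stacks in the D-FINE outputs further down (sizes are invented):

import torch

intermediate = torch.randn(6, 2, 300, 256)  # (decoder_layers, batch_size, num_queries, hidden_size)
last_layer = intermediate[-1]               # the final decoder layer's activations
aux_inputs = intermediate[:-1]              # earlier layers feed the auxiliary decoding losses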
@ -73,43 +65,23 @@ class ConditionalDetrDecoderOutput(BaseModelOutputWithCrossAttentions):


@dataclass
class ConditionalDetrModelOutput(Seq2SeqModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to
Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder
layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding
losses.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the decoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the encoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
Reference points (reference points of each layer of the decoder).
"""
)
class ConditionalDetrModelOutput(Seq2SeqModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
Reference points (reference points of each layer of the decoder).
"""

intermediate_hidden_states: Optional[torch.FloatTensor] = None
@ -117,53 +89,33 @@ class ConditionalDetrModelOutput(Seq2SeqModelOutput):


@dataclass
@auto_docstring(
custom_intro="""
Output type of [`ConditionalDetrForObjectDetection`].
"""
)
# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->ConditionalDetr
class ConditionalDetrObjectDetectionOutput(ModelOutput):
"""
Output type of [`ConditionalDetrForObjectDetection`].

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve the
unnormalized bounding boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the decoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the encoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve the
unnormalized bounding boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
"""

loss: Optional[torch.FloatTensor] = None
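Because pred_boxes are normalized (center_x, center_y, width, height) tuples, recovering pixel-space corners takes a few lines. This mirrors what post_process_object_detection does internally, as a hedged sketch rather than the processor's actual code:

import torch

pred_boxes = torch.tensor([[[0.5, 0.5, 0.2, 0.4]]])  # (batch, num_queries, 4), normalized
img_h, img_w = 480, 640

cx, cy, w, h = pred_boxes.unbind(-1)
corners = torch.stack(
    [(cx - 0.5 * w) * img_w, (cy - 0.5 * h) * img_h,
     (cx + 0.5 * w) * img_w, (cy + 0.5 * h) * img_h],
    dim=-1,
)  # absolute (x0, y0, x1, y1) pixel boxes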
@ -181,59 +133,39 @@ class ConditionalDetrObjectDetectionOutput(ModelOutput):


@dataclass
@auto_docstring(
custom_intro="""
Output type of [`ConditionalDetrForSegmentation`].
"""
)
# Copied from transformers.models.detr.modeling_detr.DetrSegmentationOutput with Detr->ConditionalDetr
class ConditionalDetrSegmentationOutput(ModelOutput):
"""
Output type of [`ConditionalDetrForSegmentation`].

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve the
unnormalized bounding boxes.
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
Segmentation masks logits for all queries. See also
[`~ConditionalDetrImageProcessor.post_process_semantic_segmentation`] or
[`~ConditionalDetrImageProcessor.post_process_instance_segmentation`] or
[`~ConditionalDetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
segmentation masks respectively.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
layer plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the decoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights of the encoder, after the attention softmax, used to compute the
weighted average in the self-attention heads.
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve the
unnormalized bounding boxes.
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
Segmentation masks logits for all queries. See also
[`~ConditionalDetrImageProcessor.post_process_semantic_segmentation`] or
[`~ConditionalDetrImageProcessor.post_process_instance_segmentation`] or
[`~ConditionalDetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
segmentation masks respectively.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
"""

loss: Optional[torch.FloatTensor] = None
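pred_masks are raw logits at a quarter of the input resolution, so per-query soft masks are a sigmoid away. A stand-in sketch (real post-processing also upsamples and matches queries to classes):

import torch

pred_masks = torch.randn(1, 100, 120, 160)  # (batch, num_queries, height/4, width/4) logits
mask_probs = pred_masks.sigmoid()           # per-query soft masks in [0, 1]
binary_masks = mask_probs > 0.5             # crude hard threshold, for visualization only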
@ -1022,7 +954,6 @@ class MLP(nn.Module):


@auto_docstring

# Copied from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->ConditionalDetr
class ConditionalDetrPreTrainedModel(PreTrainedModel):
config_class = ConditionalDetrConfig
@ -46,49 +46,40 @@ logger = logging.get_logger(__name__)


@dataclass
class CsmOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for the model autoregressive outputs.
"""
)
class CsmOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the backbone model.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the backbone model.
"""

loss: Optional[torch.FloatTensor] = None
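The past_key_values layout documented twice above (once for the backbone, once for the depth decoder) is the legacy tuple-of-tuples cache. A stand-in makes the indexing concrete (sizes invented):

import torch

n_layers, batch, heads, seq, head_dim = 2, 1, 4, 10, 64
past_key_values = tuple(
    (torch.zeros(batch, heads, seq, head_dim),   # key states for one layer
     torch.zeros(batch, heads, seq, head_dim))   # value states for one layer
    for _ in range(n_layers)
)
key_l0, value_l0 = past_key_values[0]  # per-layer (key, value) pair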
@ -46,49 +46,40 @@ logger = logging.get_logger(__name__)


@dataclass
class CsmOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for the model autoregressive outputs.
"""
)
class CsmOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the backbone model.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction) of the backbone model.
"""

loss: Optional[torch.FloatTensor] = None
@ -33,19 +33,15 @@ logger = logging.get_logger(__name__)


@dataclass
class BaseModelOutputWithCLSToken(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for model's outputs, with potential hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
Classification token at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
"""
)
class BaseModelOutputWithCLSToken(ModelOutput):
r"""
cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
Classification token at the output of the last layer of the model.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
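CvT's last stage emits a token sequence plus a separate CLS token, which is why this output carries cls_token_value alongside last_hidden_state; a classification head would typically consume the squeezed CLS vector. Stand-in sizes:

import torch

last_hidden_state = torch.randn(2, 49, 384)  # (batch_size, sequence_length, hidden_size)
cls_token_value = torch.randn(2, 1, 384)     # (batch_size, 1, hidden_size)

pooled = cls_token_value.squeeze(1)          # (batch_size, hidden_size), ready for a linear head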
@ -433,57 +433,41 @@ class DFineDecoderLayer(nn.Module):


@dataclass
class DFineModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the RT-DETR encoder-decoder model.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
Stacked intermediate logits (logits of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
num_queries)`. Attention weights of the decoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points sent through the Transformer decoder.
enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Predicted bounding box scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
picked as region proposals in the encoder stage. Output of bounding box binary classification (i.e.
foreground and background).
enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`):
Logits of predicted bounding box coordinates in the encoder stage.
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
Predicted bounding box scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
foreground and background).
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
Logits of predicted bounding box coordinates in the first stage.
denoising_meta_values (`dict`):
Extra dictionary for the denoising related values.
"""
)
class DFineModelOutput(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
Stacked intermediate logits (logits of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
initial_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points used for the first decoder layer.
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points sent through the Transformer decoder.
enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Predicted bounding box scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
picked as region proposals in the encoder stage. Output of bounding box binary classification (i.e.
foreground and background).
enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`):
Logits of predicted bounding box coordinates in the encoder stage.
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
Predicted bounding box scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
foreground and background).
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
Logits of predicted bounding box coordinates in the first stage.
denoising_meta_values (`dict`):
Extra dictionary for the denoising related values.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
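enc_topk_logits and enc_topk_bboxes come from the two-stage scheme the docstring describes: score every encoder location and keep the best config.two_stage_num_proposals as decoder queries. A hedged stand-in of that selection, not the model's exact code:

import torch

enc_outputs_class = torch.randn(1, 8400, 80)   # (batch, sequence_length, num_labels)
scores = enc_outputs_class.max(dim=-1).values  # best class score per location
topk_idx = scores.topk(300, dim=1).indices     # the 300 top-scoring proposals
enc_topk_logits = enc_outputs_class.gather(
    1, topk_idx.unsqueeze(-1).expand(-1, -1, 80)
)                                              # (batch, 300, num_labels)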
@ -507,76 +491,56 @@ class DFineModelOutput(ModelOutput):


@dataclass
class DFineObjectDetectionOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`DFineForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~DFineImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized (absolute) bounding boxes.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
            Stacked intermediate hidden states (output of each layer of the decoder).
        intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, config.num_labels)`):
            Stacked intermediate logits (logits of each layer of the decoder).
        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
            Stacked intermediate reference points (reference points of each layer of the decoder).
        intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
            Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
        initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
            Stacked initial reference points (initial reference points of each layer of the decoder).
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
            plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Initial reference points sent through the Transformer decoder.
        enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Logits of predicted bounding box classes in the encoder stage.
        enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Logits of predicted bounding boxes coordinates in the encoder.
        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
            foreground and background).
        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Logits of predicted bounding boxes coordinates in the first stage.
        denoising_meta_values (`dict`):
            Extra dictionary for the denoising related values.
    """
)
class DFineObjectDetectionOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~DFineImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized (absolute) bounding boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
        Stacked intermediate hidden states (output of each layer of the decoder).
    intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, config.num_labels)`):
        Stacked intermediate logits (logits of each layer of the decoder).
    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
        Stacked intermediate reference points (reference points of each layer of the decoder).
    intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
        Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
    initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
        Stacked initial reference points (initial reference points of each layer of the decoder).
    init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Initial reference points sent through the Transformer decoder.
    enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
        Logits of predicted bounding box classes in the encoder stage.
    enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
        Logits of predicted bounding boxes coordinates in the encoder.
    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
        picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
        foreground and background).
    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
        Logits of predicted bounding boxes coordinates in the first stage.
    denoising_meta_values (`dict`):
        Extra dictionary for the denoising related values.
    """

    loss: Optional[torch.FloatTensor] = None
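The hunk above establishes the pattern that repeats through the rest of this commit: the hand-written `Args:` block is deleted, the one-line intro moves into `@auto_docstring(custom_intro=...)`, and only fields whose descriptions cannot be generated automatically stay behind in a trimmed `r"""` docstring. A minimal sketch of the resulting shape, assuming a transformers version that exports `auto_docstring` and `ModelOutput` from `transformers.utils` (the toy class and its fields are illustrative, not part of this diff):

from dataclasses import dataclass
from typing import Optional

import torch
from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `ToyForObjectDetection` model.
    """
)
class ToyObjectDetectionOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
        Total detection loss.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    """

    # Common fields such as hidden states and attentions need no inline text;
    # the decorator is meant to fill them in from the shared descriptions.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None

The decorator is meant to compose `custom_intro`, the inline field docs, and the standard descriptions into the final `__doc__` when the class is created.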
@ -1008,38 +972,30 @@ class DFineIntegral(nn.Module):


@dataclass
class DFineDecoderOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of the DFineDecoder. This class adds two attributes to
    BaseModelOutputWithCrossAttentions, namely:
    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
    - a stacked tensor of intermediate reference points.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
            Stacked intermediate hidden states (output of each layer of the decoder).
        intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
            Stacked intermediate logits (logits of each layer of the decoder).
        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
            Stacked intermediate reference points (reference points of each layer of the decoder).
        intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
            Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
        initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
            Stacked initial reference points (initial reference points of each layer of the decoder).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
    """
)
class DFineDecoderOutput(ModelOutput):
    r"""
    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
        Stacked intermediate hidden states (output of each layer of the decoder).
    intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
        Stacked intermediate logits (logits of each layer of the decoder).
    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
        Stacked intermediate reference points (reference points of each layer of the decoder).
    intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
        Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
    initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
        Stacked initial reference points (initial reference points of each layer of the decoder).
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
@ -39,34 +39,26 @@ logger = logging.get_logger(__name__)


@dataclass
# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrDecoderOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
class DabDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of the Conditional DETR decoder. This class adds one attribute to
    BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output
    of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary
    decoding losses.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
        reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
            Reference points (reference points of each layer of the decoder).
    """
)
# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrDecoderOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
class DabDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
    r"""
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
        layernorm.
    reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
        Reference points (reference points of each layer of the decoder).
    """

    intermediate_hidden_states: Optional[torch.FloatTensor] = None
@ -74,44 +66,24 @@ class DabDetrDecoderOutput(BaseModelOutputWithCrossAttentions):


@dataclass
# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrModelOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
class DabDetrModelOutput(Seq2SeqModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to
    Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder
    layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding
    losses.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
        reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
            Reference points (reference points of each layer of the decoder).
    """
)
# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrModelOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points)
class DabDetrModelOutput(Seq2SeqModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
        layernorm.
    reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
        Reference points (reference points of each layer of the decoder).
    """

    intermediate_hidden_states: Optional[torch.FloatTensor] = None
@ -119,53 +91,33 @@ class DabDetrModelOutput(Seq2SeqModelOutput):


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`DabDetrForObjectDetection`].
    """
)
# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->DabDetr
class DabDetrObjectDetectionOutput(ModelOutput):
    """
    Output type of [`DabDetrForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~DabDetrImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~DabDetrImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    """

    loss: Optional[torch.FloatTensor] = None
@ -29,19 +29,19 @@ from .configuration_dac import DacConfig


@dataclass
@auto_docstring
class DacOutput(ModelOutput):
    """
    Args:
        loss (`torch.Tensor`):
            Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
        audio_values (`torch.Tensor` of shape `(batch_size, input_length)`):
            Reconstructed audio data.
        quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
            Quantized continuous representation of input.
        audio_codes (`torch.LongTensor` of shape `(batch_size, num_codebooks, time_steps)`):
            Codebook indices for each codebook (quantized discrete representation of input).
        projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
            Projected latents (continuous representation of input before quantization).
    r"""
    loss (`torch.Tensor`):
        Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
    audio_values (`torch.Tensor` of shape `(batch_size, input_length)`):
        Reconstructed audio data.
    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
        Quantized continuous representation of input.
    audio_codes (`torch.LongTensor` of shape `(batch_size, num_codebooks, time_steps)`):
        Codebook indices for each codebook (quantized discrete representation of input).
    projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
        Projected latents (continuous representation of input before quantization).
    """

    loss: Optional[torch.FloatTensor] = None
@ -52,17 +52,17 @@ class DacOutput(ModelOutput):


@dataclass
@auto_docstring
class DacEncoderOutput(ModelOutput):
    """
    Args:
        loss (`torch.Tensor`):
            Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
        quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*):
            Quantized continuous representation of input.
        audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
            Codebook indices for each codebook (quantized discrete representation of input).
        projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`, *optional*):
            Projected latents (continuous representation of input before quantization).
    r"""
    loss (`torch.Tensor`):
        Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*):
        Quantized continuous representation of input.
    audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
        Codebook indices for each codebook (quantized discrete representation of input).
    projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`, *optional*):
        Projected latents (continuous representation of input before quantization).
    """

    loss: Optional[torch.FloatTensor] = None
@ -72,12 +72,12 @@ class DacEncoderOutput(ModelOutput):


@dataclass
@auto_docstring
# Copied from transformers.models.encodec.modeling_encodec.EncodecDecoderOutput with Encodec->Dac, segment_length->input_length
class DacDecoderOutput(ModelOutput):
    """
    Args:
        audio_values (`torch.FloatTensor` of shape `(batch_size, input_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Dac.
    r"""
    audio_values (`torch.FloatTensor` of shape `(batch_size, input_length)`, *optional*):
        Decoded audio values, obtained using the decoder part of Dac.
    """

    audio_values: Optional[torch.FloatTensor] = None
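The DAC outputs use the decorator bare, with no `custom_intro`: the old docstrings were nothing but an `Args:` list, so only the per-field descriptions survive in the `r"""` block. A quick, illustrative way to inspect what the decorator produced (the exact rendering depends on the installed transformers version):

from transformers.models.dac.modeling_dac import DacDecoderOutput

# auto_docstring rewrites the class docstring when the module is imported,
# so the merged documentation is visible on __doc__ without instantiating.
print(DacDecoderOutput.__doc__)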
@ -43,29 +43,18 @@ logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`Data2VecVisionModel`].
    """
)
# Copied from transformers.models.beit.modeling_beit.BeitModelOutputWithPooling with Beit->Data2VecVision
class Data2VecVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    """
    Class for outputs of [`Data2VecVisionModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """

@ -711,30 +711,19 @@ class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):


@dataclass
class DecisionTransformerOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`):
            Environment state predictions.
        action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`):
            Model action predictions.
        return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`):
            Predicted returns for each state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class DecisionTransformerOutput(ModelOutput):
    r"""
    state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`):
        Environment state predictions.
    action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`):
        Model action predictions.
    return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`):
        Predicted returns for each state.
    """

    state_preds: Optional[torch.FloatTensor] = None
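However the docstring is produced, these classes stay ordinary `ModelOutput` containers: fields are reachable by attribute, by string key, or positionally through `to_tuple()`, which skips `None`-valued fields. A small sketch with made-up tensor shapes:

import torch
from transformers.models.decision_transformer.modeling_decision_transformer import (
    DecisionTransformerOutput,
)

out = DecisionTransformerOutput(
    state_preds=torch.zeros(1, 4, 17),
    action_preds=torch.zeros(1, 4, 6),
    return_preds=torch.zeros(1, 4, 1),
)

# Attribute and key access resolve to the same tensor, and the first
# populated field leads the tuple view.
assert out.action_preds is out["action_preds"]
assert out.to_tuple()[0] is out.state_preds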
@ -108,32 +108,24 @@ class MultiScaleDeformableAttention(nn.Module):


@dataclass
class DeformableDetrDecoderOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of the DeformableDetrDecoder. This class adds two attributes to
    BaseModelOutputWithCrossAttentions, namely:
    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
    - a stacked tensor of intermediate reference points.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
            Stacked intermediate hidden states (output of each layer of the decoder).
        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
            Stacked intermediate reference points (reference points of each layer of the decoder).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
    """
)
class DeformableDetrDecoderOutput(ModelOutput):
    r"""
    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
        Stacked intermediate hidden states (output of each layer of the decoder).
    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
        Stacked intermediate reference points (reference points of each layer of the decoder).
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
@ -145,47 +137,27 @@ class DeformableDetrDecoderOutput(ModelOutput):


@dataclass
class DeformableDetrModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of the Deformable DETR encoder-decoder model.

    Args:
        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Initial reference points sent through the Transformer decoder.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
            Stacked intermediate hidden states (output of each layer of the decoder).
        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
            Stacked intermediate reference points (reference points of each layer of the decoder).
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
            plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
            foreground and background).
        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Logits of predicted bounding boxes coordinates in the first stage.
    """
)
class DeformableDetrModelOutput(ModelOutput):
    r"""
    init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Initial reference points sent through the Transformer decoder.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
        Stacked intermediate hidden states (output of each layer of the decoder).
    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
        Stacked intermediate reference points (reference points of each layer of the decoder).
    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
        picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
        foreground and background).
    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
        Logits of predicted bounding boxes coordinates in the first stage.
    """

    init_reference_points: Optional[torch.FloatTensor] = None
@ -203,64 +175,44 @@ class DeformableDetrModelOutput(ModelOutput):


@dataclass
class DeformableDetrObjectDetectionOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`DeformableDetrForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~DeformableDetrProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
            plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
            4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
            in the self-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
            Stacked intermediate hidden states (output of each layer of the decoder).
        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
            Stacked intermediate reference points (reference points of each layer of the decoder).
        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Initial reference points sent through the Transformer decoder.
        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
            foreground and background).
        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Logits of predicted bounding boxes coordinates in the first stage.
    """
)
class DeformableDetrObjectDetectionOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~DeformableDetrProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Initial reference points sent through the Transformer decoder.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
        Stacked intermediate hidden states (output of each layer of the decoder).
    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
        Stacked intermediate reference points (reference points of each layer of the decoder).
    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
        picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
        foreground and background).
    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
        Logits of predicted bounding boxes coordinates in the first stage.
    """

    loss: Optional[torch.FloatTensor] = None
@ -807,27 +807,21 @@ class DeiTForImageClassification(DeiTPreTrainedModel):


@dataclass
class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`DeiTForImageClassificationWithTeacher`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores as the average of the cls_logits and distillation logits.
        cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
            class token).
        distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
            distillation token).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """
)
class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores as the average of the cls_logits and distillation logits.
    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
        class token).
    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
        distillation token).
    """

    logits: Optional[torch.FloatTensor] = None
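Every hunk in this diff follows the same pattern: the boilerplate sections (`hidden_states`, `attentions`, ...) leave the hand-written class docstring, and the class keeps only a `custom_intro` on the decorator plus an `r"""` block for its model-specific fields. A minimal sketch of that pattern on an invented output class (`ToyModelOutput` is hypothetical; the imports follow what the modeling files in this diff appear to use):

```python
from dataclasses import dataclass
from typing import Optional

import torch
from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `ToyModel` (illustration only).
    """
)
class ToyModelOutput(ModelOutput):
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification scores (before SoftMax).
    """

    # Only the non-standard field is documented by hand; the standard
    # hidden_states/attentions entries are filled in by auto_docstring.
    logits: Optional[torch.FloatTensor] = None
```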
@ -32,26 +32,17 @@ logger = logging.get_logger(__name__)


@dataclass
class DepthProOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for DepthPro's outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        features (`Union[torch.FloatTensor, list[torch.FloatTensor]]`, *optional*):
            Features from encoders. Can be a single feature or a list of features.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer and the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, n_patches_per_batch, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class DepthProOutput(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    features (`Union[torch.FloatTensor, List[torch.FloatTensor]]`, *optional*):
        Features from encoders. Can be a single feature or a list of features.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None

@ -61,28 +52,17 @@ class DepthProOutput(ModelOutput):


@dataclass
class DepthProDepthEstimatorOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for DepthProForDepthEstimation's output.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Predicted depth for each pixel.
        field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided):
            Field of View Scaler.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer and the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, n_patches_per_batch, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class DepthProDepthEstimatorOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided):
        Field of View Scaler.
    """

    loss: Optional[torch.FloatTensor] = None
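`predicted_depth` above is a dense `(batch_size, height, width)` map at the model's working resolution, so consumers typically resize it back to the source image size. A minimal sketch of that step under that assumption (random data stands in for real model output; this is not the library's official post-processing):

```python
import torch
import torch.nn.functional as F


def resize_depth(predicted_depth: torch.Tensor, size: tuple[int, int]) -> torch.Tensor:
    """Resize a (batch, height, width) depth map to `size` = (height, width)."""
    return F.interpolate(
        predicted_depth.unsqueeze(1),  # add a channel dim: (B, 1, H, W)
        size=size,
        mode="bicubic",
        align_corners=False,
    ).squeeze(1)


# Example with a dummy tensor standing in for `outputs.predicted_depth`:
depth = resize_depth(torch.rand(1, 384, 384), size=(768, 1024))
print(depth.shape)  # torch.Size([1, 768, 1024])
```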
@ -45,122 +45,74 @@ logger = logging.get_logger(__name__)


@dataclass
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
    """
)
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
    r"""
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
        layernorm.
    """

    intermediate_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class DetrModelOutput(Seq2SeqModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
    """
)
class DetrModelOutput(Seq2SeqModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
        layernorm.
    """

    intermediate_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class DetrObjectDetectionOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`DetrForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
    """
)
class DetrObjectDetectionOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    """

    loss: Optional[torch.FloatTensor] = None

@ -178,58 +130,38 @@ class DetrObjectDetectionOutput(ModelOutput):


@dataclass
class DetrSegmentationOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`DetrForSegmentation`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
            Segmentation masks logits for all queries. See also
            [`~DetrImageProcessor.post_process_semantic_segmentation`] or
            [`~DetrImageProcessor.post_process_instance_segmentation`]
            [`~DetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
            segmentation masks respectively.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
    """
)
class DetrSegmentationOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
        Segmentation masks logits for all queries. See also
        [`~DetrImageProcessor.post_process_semantic_segmentation`] or
        [`~DetrImageProcessor.post_process_instance_segmentation`]
        [`~DetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
        segmentation masks respectively.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    """

    loss: Optional[torch.FloatTensor] = None
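The `pred_masks` documented above are per-query mask logits at a quarter of the input resolution; the image processor methods named in the docstring turn them into usable segmentation maps. A minimal sketch for the semantic case (checkpoint and image path are illustrative):

```python
import torch
from PIL import Image
from transformers import DetrForSegmentation, DetrImageProcessor

processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic")
model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")

image = Image.open("scene.jpg")  # any RGB image; path is hypothetical
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)  # a DetrSegmentationOutput with logits + pred_masks

# Combine class logits and mask logits into one (height, width) class-id map.
semantic_map = processor.post_process_semantic_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]
print(semantic_map.shape, semantic_map.unique())
```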
@ -57,30 +57,19 @@ logger = logging.get_logger(__name__)


@dataclass
class DinatEncoderOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Dinat encoder's outputs, with potential hidden states and attentions.
    """
)
class DinatEncoderOutput(ModelOutput):
    r"""
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None

@ -90,32 +79,21 @@ class DinatEncoderOutput(ModelOutput):


@dataclass
class DinatModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Dinat model's outputs that also contains a pooling of the last hidden states.
    """
)
class DinatModelOutput(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None

@ -126,32 +104,23 @@ class DinatModelOutput(ModelOutput):


@dataclass
class DinatImageClassifierOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Dinat outputs for image classification.
    """
)
class DinatImageClassifierOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
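The `reshaped_hidden_states` documented in these hunks are the same activations as `hidden_states`, just laid out channels-first with explicit spatial dimensions. A sketch of that tensor bookkeeping, assuming `sequence_length == height * width` at a given stage (pure tensor arithmetic, not the model's internal code):

```python
import torch

batch, height, width, channels = 2, 7, 7, 96
hidden_states = torch.rand(batch, height * width, channels)  # (B, seq, C)

# (B, seq, C) -> (B, H, W, C) -> (B, C, H, W), the documented reshaped layout.
reshaped = hidden_states.view(batch, height, width, channels).permute(0, 3, 1, 2)
print(reshaped.shape)  # torch.Size([2, 96, 7, 7])
```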
@ -38,31 +38,20 @@ logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    DonutSwin encoder's outputs, with potential hidden states and attentions.
    """
)
# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->DonutSwin
class DonutSwinEncoderOutput(ModelOutput):
    """
    DonutSwin encoder's outputs, with potential hidden states and attentions.
    r"""
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None

@ -72,33 +61,22 @@ class DonutSwinEncoderOutput(ModelOutput):


@dataclass
@auto_docstring(
    custom_intro="""
    DonutSwin model's outputs that also contains a pooling of the last hidden states.
    """
)
# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->DonutSwin
class DonutSwinModelOutput(ModelOutput):
    """
    DonutSwin model's outputs that also contains a pooling of the last hidden states.
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None

@ -109,33 +87,24 @@ class DonutSwinModelOutput(ModelOutput):


@dataclass
@auto_docstring(
    custom_intro="""
    DonutSwin outputs for image classification.
    """
)
# Copied from transformers.models.swin.modeling_swin.SwinImageClassifierOutput with Swin->DonutSwin
class DonutSwinImageClassifierOutput(ModelOutput):
    """
    DonutSwin outputs for image classification.
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
@ -40,26 +40,17 @@ logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`DPRQuestionEncoder`].
    """
)
class DPRContextEncoderOutput(ModelOutput):
    """
    Class for outputs of [`DPRQuestionEncoder`].

    Args:
        pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
            The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
            hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
            This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
        The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
        hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
        This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
    """

    pooler_output: torch.FloatTensor

@ -68,26 +59,17 @@ class DPRContextEncoderOutput(ModelOutput):


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`DPRQuestionEncoder`].
    """
)
class DPRQuestionEncoderOutput(ModelOutput):
    """
    Class for outputs of [`DPRQuestionEncoder`].

    Args:
        pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
            The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
            hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
            This output is to be used to embed questions for nearest neighbors queries with context embeddings.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
        The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
        hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
        This output is to be used to embed questions for nearest neighbors queries with context embeddings.
    """

    pooler_output: torch.FloatTensor

@ -96,29 +78,20 @@ class DPRQuestionEncoderOutput(ModelOutput):


@dataclass
class DPRReaderOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Class for outputs of [`DPRQuestionEncoder`].

    Args:
        start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
            Logits of the start index of the span for each passage.
        end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
            Logits of the end index of the span for each passage.
        relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
            Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
            question, compared to all the other passages.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class DPRReaderOutput(ModelOutput):
    r"""
    start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
        Logits of the start index of the span for each passage.
    end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
        Logits of the end index of the span for each passage.
    relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
        Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
        question, compared to all the other passages.
    """

    start_logits: torch.FloatTensor
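As the docstrings above say, the two DPR `pooler_output` embeddings live in a shared space and are compared with a dot product for nearest-neighbor retrieval. A minimal sketch of that usage (checkpoints and the toy passages are illustrative):

```python
import torch
from transformers import (
    DPRContextEncoder, DPRContextEncoderTokenizer,
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
)

q_tok = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_enc = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
c_tok = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
c_enc = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

with torch.no_grad():
    question = q_enc(**q_tok("who wrote hamlet?", return_tensors="pt")).pooler_output
    contexts = c_enc(**c_tok(
        ["Shakespeare wrote Hamlet.", "Paris is in France."],
        padding=True, return_tensors="pt",
    )).pooler_output

# Nearest-neighbor scoring is a dot product between the two embedding spaces.
scores = question @ contexts.T
print(scores.argmax(dim=-1))  # index of the best-matching passage
```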
@ -42,16 +42,18 @@ logger = logging.get_logger(__name__)


@dataclass
class BaseModelOutputWithIntermediateActivations(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
    """
)
class BaseModelOutputWithIntermediateActivations(ModelOutput):
    r"""
    last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    """

    last_hidden_states: Optional[torch.FloatTensor] = None

@ -59,32 +61,21 @@ class BaseModelOutputWithIntermediateActivations(ModelOutput):


@dataclass
class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) after further processing
            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
            the classification token after processing through a linear layer and a tanh activation function. The linear
            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
    """
)
class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
@ -667,26 +667,17 @@ class ElectraPreTrainedModel(PreTrainedModel):


@dataclass
class ElectraForPreTrainingOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`ElectraForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss of the ELECTRA objective.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Prediction scores of the head (scores for each token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class ElectraForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss of the ELECTRA objective.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Prediction scores of the head (scores for each token before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
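The `logits` above carry one score per token: the ELECTRA discriminator predicts whether each token was replaced by the generator. A minimal sketch of reading those predictions (checkpoint is illustrative; the deliberately corrupted word "fake" should score as replaced):

```python
import torch
from transformers import ElectraForPreTraining, ElectraTokenizerFast

tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")
model = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")

inputs = tokenizer("The quick brown fox fake over the lazy dog", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # (batch_size, sequence_length)

# A positive logit means the token is predicted as "replaced".
predictions = (logits > 0).long()
print(predictions)
```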
@ -38,13 +38,13 @@ logger = logging.get_logger(__name__)


@dataclass
@auto_docstring
class EncodecOutput(ModelOutput):
    """
    Args:
        audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_values (`torch.FlaotTensor` of shape `(batch_size, sequence_length)`, *optional*)
            Decoded audio values, obtained using the decoder part of Encodec.
    r"""
    audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
        Discrete code embeddings computed using `model.encode`.
    audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
        Decoded audio values, obtained using the decoder part of Encodec.
    """

    audio_codes: Optional[torch.LongTensor] = None
@ -52,13 +52,13 @@ class EncodecOutput(ModelOutput):


@dataclass
@auto_docstring
class EncodecEncoderOutput(ModelOutput):
    """
    Args:
        audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
    r"""
    audio_codes (`torch.LongTensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
        Discrete code embeddings computed using `model.encode`.
    audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
        Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
    """

    audio_codes: Optional[torch.LongTensor] = None
@ -66,11 +66,11 @@ class EncodecEncoderOutput(ModelOutput):


@dataclass
@auto_docstring
class EncodecDecoderOutput(ModelOutput):
    """
    Args:
        audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    r"""
    audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
        Decoded audio values, obtained using the decoder part of Encodec.
    """

    audio_values: Optional[torch.FloatTensor] = None

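A hedged usage sketch for these outputs, following the documented Encodec example; the checkpoint and dataset names are the ones used in the model docs, not part of this diff.

```python
from datasets import load_dataset

from transformers import AutoProcessor, EncodecModel

model = EncodecModel.from_pretrained("facebook/encodec_24khz")
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")

audio = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")[0]["audio"]
inputs = processor(raw_audio=audio["array"], sampling_rate=processor.sampling_rate, return_tensors="pt")

# encode() returns an EncodecEncoderOutput with audio_codes and audio_scales;
# decode() consumes them and yields the reconstructed waveform.
encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"])[0]
```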
@ -647,31 +647,22 @@ class ErniePreTrainedModel(PreTrainedModel):


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`ErnieForPreTraining`].
    """
)
# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->Ernie
class ErnieForPreTrainingOutput(ModelOutput):
    """
    Output type of [`ErnieForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None

@ -53,59 +53,61 @@ logger = logging.get_logger(__name__)


@dataclass
class EsmForProteinFoldingOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`EsmForProteinFolding`].

    Args:
        frames (`torch.FloatTensor`):
            Output frames.
        sidechain_frames (`torch.FloatTensor`):
            Output sidechain frames.
        unnormalized_angles (`torch.FloatTensor`):
            Predicted unnormalized backbone and side chain torsion angles.
        angles (`torch.FloatTensor`):
            Predicted backbone and side chain torsion angles.
        positions (`torch.FloatTensor`):
            Predicted positions of the backbone and side chain atoms.
        states (`torch.FloatTensor`):
            Hidden states from the protein folding trunk.
        s_s (`torch.FloatTensor`):
            Per-residue embeddings derived by concatenating the hidden states of each layer of the ESM-2 LM stem.
        s_z (`torch.FloatTensor`):
            Pairwise residue embeddings.
        distogram_logits (`torch.FloatTensor`):
            Input logits to the distogram used to compute residue distances.
        lm_logits (`torch.FloatTensor`):
            Logits output by the ESM-2 protein language model stem.
        aatype (`torch.FloatTensor`):
            Input amino acids (AlphaFold2 indices).
        atom14_atom_exists (`torch.FloatTensor`):
            Whether each atom exists in the atom14 representation.
        residx_atom14_to_atom37 (`torch.FloatTensor`):
            Mapping between atoms in the atom14 and atom37 representations.
        residx_atom37_to_atom14 (`torch.FloatTensor`):
            Mapping between atoms in the atom37 and atom14 representations.
        atom37_atom_exists (`torch.FloatTensor`):
            Whether each atom exists in the atom37 representation.
        residue_index (`torch.FloatTensor`):
            The index of each residue in the protein chain. Unless internal padding tokens are used, this will just be
            a sequence of integers from 0 to `sequence_length`.
        lddt_head (`torch.FloatTensor`):
            Raw outputs from the lddt head used to compute plddt.
        plddt (`torch.FloatTensor`):
            Per-residue confidence scores. Regions of low confidence may indicate areas where the model's prediction is
            uncertain, or where the protein structure is disordered.
        ptm_logits (`torch.FloatTensor`):
            Raw logits used for computing ptm.
        ptm (`torch.FloatTensor`):
            TM-score output representing the model's high-level confidence in the overall structure.
        aligned_confidence_probs (`torch.FloatTensor`):
            Per-residue confidence scores for the aligned structure.
        predicted_aligned_error (`torch.FloatTensor`):
            Predicted error between the model's prediction and the ground truth.
        max_predicted_aligned_error (`torch.FloatTensor`):
            Per-sample maximum predicted error.
    """
)
class EsmForProteinFoldingOutput(ModelOutput):
    r"""
    frames (`torch.FloatTensor`):
        Output frames.
    sidechain_frames (`torch.FloatTensor`):
        Output sidechain frames.
    unnormalized_angles (`torch.FloatTensor`):
        Predicted unnormalized backbone and side chain torsion angles.
    angles (`torch.FloatTensor`):
        Predicted backbone and side chain torsion angles.
    positions (`torch.FloatTensor`):
        Predicted positions of the backbone and side chain atoms.
    states (`torch.FloatTensor`):
        Hidden states from the protein folding trunk.
    s_s (`torch.FloatTensor`):
        Per-residue embeddings derived by concatenating the hidden states of each layer of the ESM-2 LM stem.
    s_z (`torch.FloatTensor`):
        Pairwise residue embeddings.
    distogram_logits (`torch.FloatTensor`):
        Input logits to the distogram used to compute residue distances.
    lm_logits (`torch.FloatTensor`):
        Logits output by the ESM-2 protein language model stem.
    aatype (`torch.FloatTensor`):
        Input amino acids (AlphaFold2 indices).
    atom14_atom_exists (`torch.FloatTensor`):
        Whether each atom exists in the atom14 representation.
    residx_atom14_to_atom37 (`torch.FloatTensor`):
        Mapping between atoms in the atom14 and atom37 representations.
    residx_atom37_to_atom14 (`torch.FloatTensor`):
        Mapping between atoms in the atom37 and atom14 representations.
    atom37_atom_exists (`torch.FloatTensor`):
        Whether each atom exists in the atom37 representation.
    residue_index (`torch.FloatTensor`):
        The index of each residue in the protein chain. Unless internal padding tokens are used, this will just be
        a sequence of integers from 0 to `sequence_length`.
    lddt_head (`torch.FloatTensor`):
        Raw outputs from the lddt head used to compute plddt.
    plddt (`torch.FloatTensor`):
        Per-residue confidence scores. Regions of low confidence may indicate areas where the model's prediction is
        uncertain, or where the protein structure is disordered.
    ptm_logits (`torch.FloatTensor`):
        Raw logits used for computing ptm.
    ptm (`torch.FloatTensor`):
        TM-score output representing the model's high-level confidence in the overall structure.
    aligned_confidence_probs (`torch.FloatTensor`):
        Per-residue confidence scores for the aligned structure.
    predicted_aligned_error (`torch.FloatTensor`):
        Predicted error between the model's prediction and the ground truth.
    max_predicted_aligned_error (`torch.FloatTensor`):
        Per-sample maximum predicted error.
    """

    frames: Optional[torch.FloatTensor] = None

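A hedged sketch of reading a few of these fields, following the documented ESMFold example (checkpoint name from the ESM docs; the input sequence is arbitrary).

```python
import torch

from transformers import AutoTokenizer, EsmForProteinFolding

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")

# ESMFold expects raw amino-acid sequences without special tokens.
inputs = tokenizer(["MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"], return_tensors="pt", add_special_tokens=False)
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.positions.shape)  # predicted backbone/side chain atom positions
print(outputs.plddt.shape)      # per-residue confidence scores
```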
@ -492,24 +492,19 @@ class FalconMambaPreTrainedModel(PreTrainedModel):


@dataclass
@auto_docstring(
    custom_intro="""
    Class for the FALCONMAMBA model outputs.
    """
)
# Copied from transformers.models.mamba.modeling_mamba.MambaOutput with MAMBA->FALCONMAMBA,Mamba->FalconMamba,FalconMambaCache->MambaCache
class FalconMambaOutput(ModelOutput):
    """
    Class for the FALCONMAMBA model outputs.
    r"""
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
@ -518,26 +513,23 @@ class FalconMambaOutput(ModelOutput):


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for causal language model (or autoregressive) outputs.
    """
)
# Copied from transformers.models.mamba.modeling_mamba.MambaCausalLMOutput with Mamba->FalconMamba,FalconMambaCache->MambaCache
class FalconMambaCausalLMOutput(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    """

    loss: Optional[torch.FloatTensor] = None

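A rough sketch of the `cache_params` round-trip described above. The checkpoint name follows the FalconMamba docs; whether `cache_position` is required alongside `cache_params` depends on the transformers version, so treat the exact kwargs as an assumption.

```python
import torch

from transformers import AutoTokenizer, FalconMambaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
model = FalconMambaForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b")

inputs = tokenizer("Hello", return_tensors="pt")
outputs = model(**inputs, use_cache=True)

# Decode step: feed only the newly chosen token plus the cached SSM/conv states,
# instead of re-running the full prefix.
next_token = outputs.logits[:, -1:].argmax(dim=-1)
step = model(
    input_ids=next_token,
    cache_params=outputs.cache_params,
    use_cache=True,
    cache_position=torch.tensor([inputs["input_ids"].shape[1]]),
)
```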
@ -35,46 +35,21 @@ logger = logging.get_logger(__name__)


@dataclass
class FastSpeech2ConformerModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`FastSpeech2ConformerModel`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Spectrogram generation loss.
        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
            The predicted spectrogram.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
            Outputs of the duration predictor.
        pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
            Outputs of the pitch predictor.
        energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
            Outputs of the energy predictor.

    """
)
class FastSpeech2ConformerModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Spectrogram generation loss.
    duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
        Outputs of the duration predictor.
    pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
        Outputs of the pitch predictor.
    energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
        Outputs of the energy predictor.
    """

    loss: Optional[torch.FloatTensor] = None
@ -90,47 +65,23 @@ class FastSpeech2ConformerModelOutput(ModelOutput):


@dataclass
class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`FastSpeech2ConformerWithHifiGan`].

    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Speech output as a result of passing the predicted mel spectrogram through the vocoder.
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Spectrogram generation loss.
        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
            The predicted spectrogram.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
            Outputs of the duration predictor.
        pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
            Outputs of the pitch predictor.
        energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
            Outputs of the energy predictor.
    """
)
class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Spectrogram generation loss.
    duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
        Outputs of the duration predictor.
    pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
        Outputs of the pitch predictor.
    energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
        Outputs of the energy predictor.
    waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
        Speech output as a result of passing the predicted mel spectrogram through the vocoder.
    """

    waveform: Optional[torch.FloatTensor] = None

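A hedged sketch of producing the `waveform` field described above, following the documented FastSpeech2Conformer example (checkpoint ids from the model docs).

```python
from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGan

tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")

inputs = tokenizer("Hello, my dog is cute.", return_tensors="pt")
outputs = model(inputs["input_ids"], return_dict=True)

# The predicted mel spectrogram has already been passed through the HiFi-GAN vocoder.
waveform = outputs.waveform  # shape (batch_size, audio_length)
```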
@ -246,27 +246,28 @@ class FlaubertPredLayer(nn.Module):


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for outputs of question answering models using a [`~modeling_utils.FlaubertSQuADHead`].
    """
)
# Copied from transformers.models.xlm.modeling_xlm.XLMSquadHeadOutput with XLM->Flaubert
class FlaubertSquadHeadOutput(ModelOutput):
    """
    Base class for outputs of question answering models using a [`~modeling_utils.FlaubertSQuADHead`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
            (beam-search).
        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the `is_impossible` label of the answers.

    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
        Classification loss as the sum of start token, end token (and is_impossible if provided) classification
        losses.
    start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the top config.start_n_top start token possibilities (beam-search).
    start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Indices for the top config.start_n_top start token possibilities (beam-search).
    end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
        (beam-search).
    end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
    cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the `is_impossible` label of the answers.
    """

    loss: Optional[torch.FloatTensor] = None
@ -815,6 +816,14 @@ class FlaubertModel(FlaubertPreTrainedModel):
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
@ -824,14 +833,6 @@ class FlaubertModel(FlaubertPreTrainedModel):
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding. The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@ -1040,6 +1041,14 @@ class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedLMOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
@ -1053,14 +1062,6 @@ class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@ -1413,37 +1414,28 @@ class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for outputs of question answering models using a `SquadHead`.
    """
)
# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput with XLM->Flaubert
class FlaubertForQuestionAnsweringOutput(ModelOutput):
    """
    Base class for outputs of question answering models using a `SquadHead`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
            (beam-search).
        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the `is_impossible` label of the answers.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
        Classification loss as the sum of start token, end token (and is_impossible if provided) classification
        losses.
    start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the top config.start_n_top start token possibilities (beam-search).
    start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Indices for the top config.start_n_top start token possibilities (beam-search).
    end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
        (beam-search).
    end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
    cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the `is_impossible` label of the answers.
    """

    loss: Optional[torch.FloatTensor] = None

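A hedged sketch of building the `langs` tensor documented in the forward docstrings above, using the `config.lang2id` mapping they describe. The checkpoint id follows the multilingual-XLM docs; Flaubert checkpoints may not expose `lang2id`, so this uses an XLM model for illustration.

```python
import torch

from transformers import AutoTokenizer, XLMModel

model = XLMModel.from_pretrained("FacebookAI/xlm-mlm-enfr-1024")
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-enfr-1024")

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])

# One language id per token, looked up via the string-to-int mapping in the config.
language_id = model.config.lang2id["en"]
langs = torch.full_like(input_ids, language_id)

outputs = model(input_ids, langs=langs)
```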
@ -49,27 +49,29 @@ FlavaPossibleConfigs = Union[FlavaTextConfig, FlavaImageConfig, FlavaMultimodalConfig]


@dataclass
class FlavaModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output from FlavaModel containing embeddings and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.

    Args:
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`].
        image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`].
        text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
            The output of the [`FlavaTextModel`].
        multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
            The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
        multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
            The output of the [`FlavaMultimodalModel`].
    """
)
class FlavaModelOutput(ModelOutput):
    r"""
    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`].
    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
        The output of the [`FlavaTextModel`].
    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
        The output of the [`FlavaMultimodalModel`].
    """

    image_embeddings: Optional[torch.FloatTensor] = None
@ -87,24 +89,27 @@ class FlavaModelOutput(ModelOutput):


@dataclass
@auto_docstring(
    custom_intro="""
    Class representing pretraining losses from FLAVA model
    """
)
class FlavaLosses(ModelOutput):
    """Class representing pretraining losses from FLAVA model

    Args:
        mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.:
            Masked Image Modeling loss as used in BeIT calculated only for unimodal image data.
        mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.:
            Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
        itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.:
            Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
            masked pairs in FLAVA.
        global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.:
            Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
            data. This is calculated on unmasked images and texts.
        mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.:
            Masked Multimodal Modeling loss's image component calculated on paired image-text data.
        mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.:
            Masked Multimodal Modeling loss's text component calculated on paired image-text data.
    r"""
    mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.):
        Masked Image Modeling loss as used in BeIT calculated only for unimodal image data.
    mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.):
        Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
    itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.):
        Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
        masked pairs in FLAVA.
    global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.):
        Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
        data. This is calculated on unmasked images and texts.
    mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.):
        Masked Multimodal Modeling loss's image component calculated on paired image-text data.
    mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.):
        Masked Multimodal Modeling loss's text component calculated on paired image-text data.
    """

    mim: Optional[torch.FloatTensor] = None
@ -124,69 +129,69 @@ class FlavaLosses(ModelOutput):


@dataclass
class FlavaForPreTrainingOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.

    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
            Total loss calculated for this model.
        loss_info (`FlavaLosses`):
            Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
            the keys.
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`].
        image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`].
        text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
            The output of the [`FlavaTextModel`].
        multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
            The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
        multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
            The output of the [`FlavaMultimodalModel`].

        image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
            to create masked images.
        image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
        text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
            The output of the [`FlavaTextModel`].
        multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
            The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
        multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
            The output of the [`FlavaMultimodalModel`].

        mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
            The logits for MIM unimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened output is
            returned when `bool_masked_pos` has some of the patches masked.
        mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
            The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
            the tokens masked.
        itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
            The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
        mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
            The logits for MMM image multimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened
            output is returned when `bool_masked_pos` has some of the patches masked.
        mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
            The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
            some of the tokens masked.
        contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
            `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
            scores. This is calculated on unmasked images and texts.
        contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
            `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
            texts.
    """
)
class FlavaForPreTrainingOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
        Total loss calculated for this model.
    loss_info (`FlavaLosses`):
        Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
        the keys.
    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`].
    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
        The output of the [`FlavaTextModel`].
    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
        The output of the [`FlavaMultimodalModel`].
    image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
        to create masked images.
    image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
    text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
        The output of the [`FlavaTextModel`].
    multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
    multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
        The output of the [`FlavaMultimodalModel`].
    mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
        The logits for MIM unimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened output is
        returned when `bool_masked_pos` has some of the patches masked.
    mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
        The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
        the tokens masked.
    itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
        The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
    contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
        `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
        scores. This is calculated on unmasked images and texts.
    contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
        `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
        texts.
    mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
        The logits for MMM image multimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened
        output is returned when `bool_masked_pos` has some of the patches masked.
    mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
        The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
        some of the tokens masked.
    """

    loss: Optional[torch.FloatTensor] = None

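A hedged sketch of inspecting the weight-gated losses above; it assumes `outputs` is a `FlavaForPreTrainingOutput` obtained from a forward pass with `return_loss=True`, and only reads documented attributes.

```python
# Each sub-loss is None when its inputs are absent or its weight gates it off.
losses = outputs.loss_info
for name in ("mim", "mlm", "itm", "global_contrastive", "mmm_image", "mmm_text"):
    value = getattr(losses, name)
    print(name, "skipped" if value is None else float(value))
print("total:", float(outputs.loss))
```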
@ -1207,12 +1212,12 @@ class FlavaModel(FlavaPreTrainedModel):
            [What are token type IDs?](../glossary#token-type-ids)
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        skip_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
        image_attention_mask (`torch.Tensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Mask to avoid performing attention on padding pixel values for image inputs. Mask values selected in `[0, 1]`:
            - 1 for pixel values that are real (i.e., **not masked**),
            - 0 for pixel values that are padding (i.e., **masked**).
        skip_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.

        Examples:

@ -1681,6 +1686,8 @@ class FlavaForPreTraining(FlavaPreTrainedModel):
            to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
            [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
        codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*):
            Pixel values for image patches that are used to compute the image codebook labels for masked image modeling.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
@ -1714,8 +1721,6 @@ class FlavaForPreTraining(FlavaPreTrainedModel):
            The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.
        return_loss (`bool`, *optional*, defaults to `None`):
            Whether to return the calculated loss.
        codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*):
            Pixel values for image patches that are used to compute the image codebook labels for masked image modeling.

        Examples:
        ```python
@ -409,23 +409,21 @@ class FNetPreTrainedModel(PreTrainedModel):


@dataclass
class FNetForPreTrainingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`FNetForPreTraining`].

Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
"""
)
class FNetForPreTrainingOutput(ModelOutput):
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
"""

loss: Optional[torch.FloatTensor] = None
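The FNet hunk above shows the full shape of the refactor that repeats through the rest of this diff: the intro sentence moves into `@auto_docstring(custom_intro=...)`, standard arguments such as `hidden_states` and `attentions` are deleted so the decorator can fill them from its shared registry, and only model-specific arguments remain in an `r"""` block on the class. A minimal sketch of the resulting pattern, using a hypothetical `XxxForPreTrainingOutput` (the field list is illustrative, not taken from any real model):

```python
from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`XxxForPreTraining`].
    """
)
class XxxForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total pretraining loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    # Standard fields need no docstring here; auto_docstring documents them
    # from its shared registry.
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
```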
@ -37,25 +37,19 @@ logger = logging.get_logger(__name__)


@dataclass
class FocalNetEncoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
FocalNet encoder's outputs, with potential hidden states.
"""
)
class FocalNetEncoderOutput(ModelOutput):
r"""
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.

reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -64,26 +58,21 @@ class FocalNetEncoderOutput(ModelOutput):


@dataclass
class FocalNetModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
FocalNet model's outputs that also contains a pooling of the last hidden states.
"""
)
class FocalNetModelOutput(ModelOutput):
r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -93,26 +82,23 @@ class FocalNetModelOutput(ModelOutput):


@dataclass
class FocalNetMaskedImageModelingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
FocalNet masked image model outputs.
"""
)
class FocalNetMaskedImageModelingOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
Masked image modeling (MIM) loss.
reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Reconstructed pixel values.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
Masked image modeling (MIM) loss.
reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Reconstructed pixel values.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""

loss: Optional[torch.FloatTensor] = None
@ -122,26 +108,23 @@ class FocalNetMaskedImageModelingOutput(ModelOutput):


@dataclass
class FocalNetImageClassifierOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
FocalNet outputs for image classification.
"""
)
class FocalNetImageClassifierOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""

loss: Optional[torch.FloatTensor] = None
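For the four FocalNet outputs above, the only model-specific field is `reshaped_hidden_states`. A short usage sketch (the checkpoint name is an assumption; any FocalNet checkpoint behaves the same):

```python
import torch
from PIL import Image
from transformers import AutoImageProcessor, FocalNetModel

processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny")
model = FocalNetModel.from_pretrained("microsoft/focalnet-tiny")

inputs = processor(images=Image.new("RGB", (224, 224)), return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# hidden_states keep the flattened token layout per stage ...
print(outputs.hidden_states[0].shape)           # (batch_size, sequence_length, hidden_size)
# ... while reshaped_hidden_states restore the spatial dimensions.
print(outputs.reshaped_hidden_states[0].shape)  # (batch_size, hidden_size, height, width)
```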
@ -804,26 +804,17 @@ class FunnelClassificationHead(nn.Module):


@dataclass
class FunnelForPreTrainingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`FunnelForPreTraining`].

Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss of the ELECTRA-style objective.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
)
class FunnelForPreTrainingOutput(ModelOutput):
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss of the ELECTRA-style objective.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
"""

loss: Optional[torch.FloatTensor] = None
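The ELECTRA-style head documented above yields one replaced-token score per position. A usage sketch, mirroring the standard Funnel pretraining example (checkpoint name assumed):

```python
import torch
from transformers import AutoTokenizer, FunnelForPreTraining

tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One ELECTRA-style score per token: shape (batch_size, sequence_length).
print(outputs.logits.shape)
```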
@ -290,12 +290,12 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
hidden size of the model.
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Tensor of indices of the image patches in the input_ids tensor.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Tensor of indices of the image patches in the input_ids tensor.

Examples:
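The `image_patches` / `image_patches_indices` pair above is produced by the processor rather than supplied by hand. A hedged usage sketch (the blank image and prompt are placeholders; "adept/fuyu-8b" is the released checkpoint):

```python
from PIL import Image
from transformers import FuyuForCausalLM, FuyuProcessor

processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

image = Image.new("RGB", (512, 512))
inputs = processor(text="Generate a coco-style caption.\n", images=image, return_tensors="pt")

# The processor emits image_patches and image_patches_indices alongside
# input_ids; the indices mark where each flattened patch embedding is
# scattered into the text sequence.
print(sorted(inputs.keys()))

generated = model.generate(**inputs, max_new_tokens=8)
print(processor.batch_decode(generated, skip_special_tokens=True))
```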
@ -48,68 +48,48 @@ logger = logging.get_logger(__name__)


@dataclass
class Gemma3ModelOutputWithPast(BaseModelOutputWithPast):
"""
@auto_docstring(
custom_intro="""
Base class for Gemma3 outputs, with hidden states and attentions.
"""
)
class Gemma3ModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
"""

image_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class Gemma3CausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Gemma3 causal language model (or autoregressive) outputs.
"""
)
class Gemma3CausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder after projecting the last hidden state.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder after projecting the last hidden state.
"""

loss: Optional[torch.FloatTensor] = None
@ -15,7 +15,6 @@
# limitations under the License.
import copy
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any, Optional, Union

import torch
@ -346,12 +345,10 @@ class Gemma3Config(PretrainedConfig):
super().__init__(**kwargs)


@dataclass
class Gemma3ModelOutputWithPast(PaligemmaModelOutputWithPast):
pass


@dataclass
class Gemma3CausalLMOutputWithPast(PaligemmaCausalLMOutputWithPast):
pass
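The modular file can stay this short because the modular converter expands these bare subclasses into the full definitions shown in the modeling file above, carrying over the parents' fields and their `auto_docstring`. One way to check what the decorator composed is to inspect the rendered docstring at runtime; a hedged sketch (assumes the generated classes set a non-`None` `__doc__`):

```python
from transformers.models.gemma3.modeling_gemma3 import (
    Gemma3CausalLMOutputWithPast,
    Gemma3ModelOutputWithPast,
)

# custom_intro, the r""" overrides and the registry entries for the standard
# fields are merged into one docstring on the generated class.
print(Gemma3ModelOutputWithPast.__doc__[:300])
print(Gemma3CausalLMOutputWithPast.__doc__[:300])
```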
@ -49,27 +49,16 @@ logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
custom_intro="""
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
"""
)
# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Git
class GitVisionModelOutput(ModelOutput):
"""
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

Args:
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
"""

image_embeds: Optional[torch.FloatTensor] = None
@ -289,27 +289,16 @@ class GotOcr2VisionLayer(GradientCheckpointingLayer):


@dataclass
class GotOcr2VisionEncoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for got_ocr2 vision model's outputs that also contains image embeddings obtained by applying the projection
layer to the pooler_output.

Args:
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
)
class GotOcr2VisionEncoderOutput(ModelOutput):
r"""
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The image embeddings obtained by applying the projection layer to the pooler_output.
"""

image_embeds: Optional[torch.FloatTensor] = None
@ -505,35 +494,26 @@ class GotOcr2MultiModalProjector(nn.Module):


@dataclass
class GotOcr2CausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for GotOcr2 causal language model (or autoregressive) outputs.
"""
)
class GotOcr2CausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
"""

loss: Optional[torch.FloatTensor] = None
@ -545,33 +525,22 @@ class GotOcr2CausalLMOutputWithPast(ModelOutput):


@dataclass
class GotOcr2ModelOutputWithPast(BaseModelOutputWithPast):
"""
@auto_docstring(
custom_intro="""
Base class for GotOcr2 outputs, with hidden states and attentions.
"""
)
class GotOcr2ModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
"""

image_hidden_states: Optional[torch.FloatTensor] = None
@ -597,36 +597,27 @@ class GPT2PreTrainedModel(PreTrainedModel):


@dataclass
class GPT2DoubleHeadsModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of models predicting if two sentences are consecutive or not.
"""
)
class GPT2DoubleHeadsModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
Multiple choice classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
past_key_values (`tuple[tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads,
sequence_length, embed_size_per_head)`).

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
Multiple choice classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
past_key_values (`tuple[tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads,
sequence_length, embed_size_per_head)`).

Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
"""

loss: Optional[torch.FloatTensor] = None
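The `logits` / `mc_logits` pair above comes from the two heads of the double-heads model. A usage sketch following the canonical double-heads recipe (checkpoint and `[CLS]` setup are the usual ones):

```python
import torch
from transformers import AutoTokenizer, GPT2DoubleHeadsModel

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")

# The multiple-choice head reads the hidden state at a classification token.
tokenizer.add_special_tokens({"cls_token": "[CLS]"})
model.resize_token_embeddings(len(tokenizer))

choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded = [tokenizer.encode(c) for c in choices]
input_ids = torch.tensor(encoded).unsqueeze(0)  # (batch_size=1, num_choices=2, seq_len)
mc_token_ids = torch.tensor([[ids.index(tokenizer.cls_token_id) for ids in encoded]])

outputs = model(input_ids, mc_token_ids=mc_token_ids)
print(outputs.logits.shape)     # (1, 2, seq_len, vocab_size)
print(outputs.mc_logits.shape)  # (1, 2)
```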
@ -33,32 +33,23 @@ logger = logging.get_logger(__name__)


@dataclass
class GraniteSpeechCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for GraniteSpeech causal language model (or autoregressive) outputs.
"""
)
class GraniteSpeechCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
"""

loss: Optional[torch.FloatTensor] = None
@ -382,12 +373,12 @@ class GraniteSpeechForConditionalGeneration(GraniteSpeechPreTrainedModel, Genera
The tensors corresponding to the input audio. Input features can be obtained using
[`AutoFeatureExtractor`]. See [`GraniteSpeechFeatureExtractor.__call__`] for details.
[`GraniteSpeechProcessor`] uses [`GraniteSpeechFeatureExtractor`] for processing audio.
input_features_mask (`torch.Tensor`, *optional*):
Mask to be applied to audio features prior to scattering into the language embeddings.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
input_features_mask (`torch.Tensor`, *optional*):
Mask to be applied to audio features prior to scattering into the language embeddings.
"""
# TODO (@alex-jw-brooks) add an example to this docstring once models are released
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@ -102,28 +102,20 @@ class MultiScaleDeformableAttention(nn.Module):


@dataclass
class GroundingDinoDecoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the GroundingDinoDecoder. This class adds two attributes to
BaseModelOutputWithCrossAttentions, namely:
- a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
- a stacked tensor of intermediate reference points.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
"""
)
class GroundingDinoDecoderOutput(ModelOutput):
r"""
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -134,30 +126,27 @@ class GroundingDinoDecoderOutput(ModelOutput):


@dataclass
class GroundingDinoEncoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the GroundingDinoEncoder. This class extends BaseModelOutput, due to:
- vision and text last hidden states
- vision and text intermediate hidden states

Args:
last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the vision encoder.
last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the text encoder.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
output of each layer plus the initial embedding outputs.
text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
each layer plus the initial embedding outputs.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
multi-scale deformable attention heads.
"""
)
class GroundingDinoEncoderOutput(ModelOutput):
r"""
last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the vision encoder.
last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the text encoder.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
output of each layer plus the initial embedding outputs.
text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
each layer plus the initial embedding outputs.
"""

last_hidden_state_vision: Optional[torch.FloatTensor] = None
@ -168,55 +157,49 @@ class GroundingDinoEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GroundingDinoModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the Grounding DINO encoder-decoder model.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||
Initial reference points sent through the Transformer decoder.
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
||||
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
||||
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
|
||||
encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
|
||||
layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
|
||||
output of each layer plus the initial embedding outputs.
|
||||
encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
|
||||
of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
|
||||
each layer plus the initial embedding outputs.
|
||||
encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
|
||||
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
|
||||
weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
|
||||
multi-scale deformable attention heads. attention softmax, used to compute the weighted average in the
|
||||
bi-attention heads.
|
||||
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
|
||||
Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
|
||||
region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
|
||||
background).
|
||||
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
|
||||
Logits of predicted bounding boxes coordinates in the first stage.
|
||||
encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
|
||||
Logits of top `config.num_queries` scoring bounding boxes in the first stage.
|
||||
encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
|
||||
Coordinates of top `config.num_queries` scoring bounding boxes in the first stage.
|
||||
"""
|
||||
)
|
||||
class GroundingDinoModelOutput(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points sent through the Transformer decoder.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
output of each layer plus the initial embedding outputs.
encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
each layer plus the initial embedding outputs.
encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
multi-scale deformable attention heads.
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
background).
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
Logits of predicted bounding boxes coordinates in the first stage.
encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
Logits of top `config.num_queries` scoring bounding boxes in the first stage.
encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
Coordinates of top `config.num_queries` scoring bounding boxes in the first stage.
"""

last_hidden_state: Optional[torch.FloatTensor] = None

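For reference, a minimal sketch of the pattern this diff applies throughout (class and field names here are hypothetical; `ModelOutput` and `auto_docstring` are assumed to be importable from `transformers.utils`):

from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `MyModelForTask`.
    """
)
class MyTaskOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Task loss. Only model-specific fields are documented here; standard fields
        (`hidden_states`, `attentions`, ...) are filled in by `auto_docstring`.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
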
@ -237,73 +220,62 @@ class GroundingDinoModelOutput(ModelOutput):


@dataclass
class GroundingDinoObjectDetectionOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`GroundingDinoForObjectDetection`].

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~GroundingDinoProcessor.post_process_grounded_object_detection`] to retrieve the
unnormalized bounding boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
plus the initial embedding outputs.
decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
output of each layer plus the initial embedding outputs.
encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
each layer plus the initial embedding outputs.
encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
multi-scale deformable attention heads.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points sent through the Transformer decoder.
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
background).
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
Logits of predicted bounding boxes coordinates in the first stage.
encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
Logits of top `config.num_queries` scoring bounding boxes in the first stage.
encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
Coordinates of top `config.num_queries` scoring bounding boxes in the first stage.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Encoded candidate labels sequence. Used by the processor to post-process object detection results.
"""
)
class GroundingDinoObjectDetectionOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~GroundingDinoProcessor.post_process_grounded_object_detection`] to retrieve the
unnormalized bounding boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points sent through the Transformer decoder.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
output of each layer plus the initial embedding outputs.
encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
each layer plus the initial embedding outputs.
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
background).
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
Logits of predicted bounding boxes coordinates in the first stage.
encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
Logits of top `config.num_queries` scoring bounding boxes in the first stage.
encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
Coordinates of top `config.num_queries` scoring bounding boxes in the first stage.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Encoded candidate labels sequence. Used by the processor to post-process object detection results.
"""

loss: Optional[torch.FloatTensor] = None

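As the `pred_boxes` description above notes, boxes come back normalized; a rough post-processing sketch (`processor`, `inputs`, and `image` are assumed to come from the usual preprocessing step, and exact keyword names may vary between transformers versions):

results = processor.post_process_grounded_object_detection(
    outputs,
    input_ids=inputs.input_ids,
    target_sizes=[(image.height, image.width)],  # one (height, width) pair per image
)
boxes = results[0]["boxes"]  # absolute (x_min, y_min, x_max, y_max) coordinates
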
@ -259,38 +259,37 @@ class GroupViTTokenAssign(nn.Module):


@dataclass
@auto_docstring
class GroupViTModelOutput(ModelOutput):
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.

<Tip warning={true}>
<Tip warning={true}>

The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
original image size as post-processing. You should always check your logits shape and resize as needed.
The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
original image size as post-processing. You should always check your logits shape and resize as needed.

</Tip>

text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of
[`GroupViTTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of
[`GroupViTVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
The output of the [`GroupViTTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`GroupViTVisionModel`].
</Tip>
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of
[`GroupViTTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of
[`GroupViTVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
The output of the [`GroupViTTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`GroupViTVisionModel`].
"""

loss: Optional[torch.FloatTensor] = None

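Following the Tip, a minimal resizing sketch for `segmentation_logits` (bilinear upsampling back to the input resolution; `outputs`, `original_height`, and `original_width` are assumed context):

import torch.nn.functional as F

# segmentation_logits: (batch_size, num_labels, logits_height, logits_width)
upsampled = F.interpolate(
    outputs.segmentation_logits,
    size=(original_height, original_width),  # target (H, W) of the original image
    mode="bilinear",
    align_corners=False,
)
label_map = upsampled.argmax(dim=1)  # (batch_size, H, W) per-pixel labels
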
@ -42,30 +42,19 @@ logger = logging.get_logger(__name__)


@dataclass
class HieraEncoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Hiera encoder's outputs, with potential hidden states and attentions.
"""
)
class HieraEncoderOutput(ModelOutput):
r"""
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""

last_hidden_state: Optional[torch.FloatTensor] = None

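The unrolled and reshaped views above carry the same activations in different layouts; a small shape-relation sketch (`outputs` is assumed; the re-roll permutes tokens, so only sizes are compared here):

for unrolled, reshaped in zip(outputs.hidden_states, outputs.reshaped_hidden_states):
    batch, height, width, channels = reshaped.shape  # (B, H, W, C)
    assert unrolled.shape == (batch, height * width, channels)  # (B, H*W, C)
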
@ -75,36 +64,25 @@ class HieraEncoderOutput(ModelOutput):


@dataclass
class HieraModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Hiera model's outputs that also contains a pooling of the last hidden states.
"""
)
class HieraModelOutput(ModelOutput):
r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
Tensor indicating which patches are masked (0) and which are not (1).
ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Tensor containing the original index of the (shuffled) masked patches.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
Tensor indicating which patches are masked (0) and which are not (1).
ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Tensor containing the original index of the (shuffled) masked patches.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""

last_hidden_state: Optional[torch.FloatTensor] = None

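Given the `pooler_output` description above, the pooled vector is the sequence-mean of the final hidden state; a rough equivalence sketch (any final normalization the implementation applies is ignored here):

pooled = outputs.last_hidden_state.mean(dim=1)  # (batch_size, hidden_size)
# Expected to track outputs.pooler_output when add_pooling_layer=True (up to normalization).
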
@ -117,32 +95,34 @@ class HieraModelOutput(ModelOutput):


@dataclass
class HieraForImageClassificationOutput(ImageClassifierOutput):
"""
@auto_docstring(
custom_intro="""
Hiera image classification outputs.
"""
)
class HieraForImageClassificationOutput(ImageClassifierOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, `optional`):
Loss value for the training task.
logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
Prediction scores of the classification head (logits of the output layer).
hidden_states (`tuple(torch.FloatTensor)`, `optional`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, `optional`):
Loss value for the training task.
logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
Prediction scores of the classification head (logits of the output layer).
hidden_states (`tuple(torch.FloatTensor)`, `optional`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, `optional`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, `optional`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, `optional`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, `optional`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""

loss: Optional[torch.FloatTensor] = None

@ -153,31 +133,25 @@ class HieraForImageClassificationOutput(ImageClassifierOutput):


@dataclass
class HieraForPreTrainingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Class for HieraForPreTraining's outputs, with potential hidden states and attentions.

Args:
loss (`torch.FloatTensor` of shape `(1,)`):
Pixel reconstruction loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
Pixel reconstruction logits.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
Tensor indicating which patches are masked (0) and which are not (1).
ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Tensor containing the original index of the (shuffled) masked patches.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, height, width, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs reshaped to include the spatial dimensions.
"""
)
class HieraForPreTrainingOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`):
Pixel reconstruction loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
Pixel reconstruction logits.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
Tensor indicating which patches are masked (0) and which are not (1).
ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Tensor containing the original index of the (shuffled) masked patches.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, height, width, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs reshaped to include the spatial dimensions.
"""

loss: Optional[torch.FloatTensor] = None

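Given the field semantics above (`bool_masked_pos` uses 0 for masked patches), selecting the reconstructed patches is a one-liner; an illustrative sketch only, since the model already returns `loss` itself:

mask = outputs.bool_masked_pos        # (batch, seq); 0 marks a masked patch
masked_pred = outputs.logits[~mask]   # (num_masked, patch_size**2 * num_channels)
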
@ -52,41 +52,32 @@ logger = logging.get_logger(__name__)


@dataclass
class IdeficsBaseModelOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
"""
)
class IdeficsBaseModelOutputWithPast(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.

If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.

Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.

image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
"""

last_hidden_state: Optional[torch.FloatTensor] = None

@ -97,37 +88,28 @@ class IdeficsBaseModelOutputWithPast(ModelOutput):


@dataclass
class IdeficsCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Idefics causal language model (or autoregressive) outputs.
"""
)
class IdeficsCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.

image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
"""

loss: Optional[torch.FloatTensor] = None

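The `past_key_values` contract described above enables the standard incremental-decoding loop; a generic greedy sketch (`model`, `input_ids`, and `max_new_tokens` are assumed context, and in practice `model.generate(...)` wraps this, including the extra image inputs these models take):

outputs = model(input_ids=input_ids, use_cache=True)
past = outputs.past_key_values
next_token = outputs.logits[:, -1].argmax(dim=-1, keepdim=True)
for _ in range(max_new_tokens - 1):
    # Feed only the newly generated token; cached keys/values cover the prefix.
    outputs = model(input_ids=next_token, past_key_values=past, use_cache=True)
    past = outputs.past_key_values
    next_token = outputs.logits[:, -1].argmax(dim=-1, keepdim=True)
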
@ -1445,16 +1427,16 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, IdeficsCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
image_encoder_embeddings (`torch.FloatTensor`, *optional*):
The output of the image encoder.
perceiver_embeddings (`torch.FloatTensor`, *optional*):
The output of the perceiver resampler.
image_attention_mask (`torch.LongTensor`, *optional*):
The attention mask for the image encoder.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

@ -39,35 +39,29 @@ logger = logging.get_logger(__name__)


@dataclass
class Idefics2BaseModelOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Idefics2 model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
"""
)
class Idefics2BaseModelOutputWithPast(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
"""

last_hidden_state: Optional[torch.FloatTensor] = None

@ -78,33 +72,27 @@ class Idefics2BaseModelOutputWithPast(ModelOutput):


@dataclass
@auto_docstring(
custom_intro="""
Base class for Idefics2 causal language model (or autoregressive) outputs.
"""
)
# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Idefics2
class Idefics2CausalLMOutputWithPast(ModelOutput):
"""
Base class for Idefics2 causal language model (or autoregressive) outputs.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
"""

loss: Optional[torch.FloatTensor] = None

@ -39,35 +39,29 @@ logger = logging.get_logger(__name__)


@dataclass
class Idefics3BaseModelOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Idefics3 model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder
"""
)
class Idefics3BaseModelOutputWithPast(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder
"""

last_hidden_state: Optional[torch.FloatTensor] = None

@ -78,33 +72,26 @@ class Idefics3BaseModelOutputWithPast(ModelOutput):


@dataclass
class Idefics3CausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Idefics causal language model (or autoregressive) outputs.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder
"""
)
class Idefics3CausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder
"""

loss: Optional[torch.FloatTensor] = None

@ -44,22 +44,24 @@ logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
custom_intro="""
Class defining the outputs of [`InstructBlipForConditionalGeneration`].
"""
)
# Copied from transformers.models.blip_2.modeling_blip_2.Blip2ForConditionalGenerationModelOutput with Blip2->InstructBlip
class InstructBlipForConditionalGenerationModelOutput(ModelOutput):
"""
Class defining the outputs of [`InstructBlipForConditionalGeneration`].

Args:
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
Outputs of the language model.
r"""
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
Outputs of the language model.
"""

loss: Optional[tuple[torch.FloatTensor]] = None

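Because these outputs nest the three sub-model outputs, callers can inspect each stage directly; a small access sketch (`model` and `inputs` are assumed context):

outputs = model(**inputs)
image_feats = outputs.vision_outputs.last_hidden_state    # vision encoder states
query_feats = outputs.qformer_outputs.last_hidden_state   # Q-Former query states
lm_logits = outputs.language_model_outputs.logits         # language-model logits
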
@ -1147,21 +1147,23 @@ class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):


@dataclass
class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].

Args:
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
Outputs of the language model.
"""
)
class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
Outputs of the language model.
"""

loss: Optional[tuple[torch.FloatTensor]] = None

@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Optional, Union

import torch
@ -189,7 +188,6 @@ class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
    pass


@dataclass
class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
    pass
@ -209,28 +209,17 @@ class InternVLVisionPreTrainedModel(PreTrainedModel):


@dataclass
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    """
@auto_docstring(
    custom_intro="""
    Class for outputs of [`InternVLVisionModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """
@ -569,33 +558,22 @@ class InternVLMultiModalProjector(nn.Module):


@dataclass
class InternVLModelOutputWithPast(BaseModelOutputWithPast):
    """
@auto_docstring(
    custom_intro="""
    Base class for InternVL outputs, with hidden states and attentions.
    """
)
class InternVLModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None
@ -805,35 +783,26 @@ class InternVLModel(InternVLPreTrainedModel):


@dataclass
class InternVLCausalLMOutputWithPast(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for InternVL causal language model (or autoregressive) outputs.
    """
)
class InternVLCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
@ -171,28 +171,17 @@ class InternVLVisionPreTrainedModel(PreTrainedModel):


@dataclass
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    """
@auto_docstring(
    custom_intro="""
    Class for outputs of [`InternVLVisionModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """
@ -87,14 +87,17 @@ class JanusPreTrainedModel(PreTrainedModel):


@dataclass
class JanusVQVAEOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for Janus VQ-VAE model outputs.
    Args:
        decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            Reconstructed pixel values after encoding and decoding the input.
        embedding_loss (`torch.FloatTensor`):
            Embedding loss.
    """
)
class JanusVQVAEOutput(ModelOutput):
    r"""
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    """

    decoded_pixel_values: Optional[torch.FloatTensor] = None
@ -102,41 +105,32 @@ class JanusVQVAEOutput(ModelOutput):


@dataclass
class JanusBaseModelOutputWithPast(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    """
)
class JanusBaseModelOutputWithPast(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.
        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
@ -147,37 +141,28 @@ class JanusBaseModelOutputWithPast(ModelOutput):


@dataclass
class JanusCausalLMOutputWithPast(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for Janus causal language model (or autoregressive) outputs.
    """
)
class JanusCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    loss: Optional[torch.FloatTensor] = None
@ -408,26 +408,27 @@ class JanusPreTrainedModel(PreTrainedModel):


@dataclass
class JanusVQVAEOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for Janus VQ-VAE model outputs.
    Args:
        decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            Reconstructed pixel values after encoding and decoding the input.
        embedding_loss (`torch.FloatTensor`):
            Embedding loss.
    """
)
class JanusVQVAEOutput(ModelOutput):
    r"""
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    """

    decoded_pixel_values: Optional[torch.FloatTensor] = None
    embedding_loss: torch.FloatTensor = None


@dataclass
class JanusBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast):
    pass


@dataclass
class JanusCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast):
    pass
@ -90,43 +90,32 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l


@dataclass
class Kosmos2ModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class Kosmos2ModelOutput(ModelOutput):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
        vision_model_output(`BaseModelOutputWithPooling`, *optional*):
            The output of the [`Kosmos2VisionModel`].
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
            Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
            the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
@ -145,45 +134,36 @@ class Kosmos2ModelOutput(ModelOutput):


@dataclass
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Model output class for `Kosmos2ForConditionalGeneration`.
    """
)
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
        vision_model_output(`BaseModelOutputWithPooling`, *optional*):
            The output of the [`Kosmos2VisionModel`].
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
            Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
            the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
@ -1332,6 +1312,8 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:
@ -1343,8 +1325,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        """
        return self.model(
            input_ids=input_ids,
@ -1423,6 +1403,8 @@ class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:
@ -1438,8 +1420,6 @@ class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1794,12 +1774,12 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:
@ -1132,41 +1132,36 @@ class LEDPreTrainedModel(PreTrainedModel):


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for LEDEncoder's outputs, with potential hidden states, local and global attentions.
    """
)
# Copied from transformers.models.longformer.modeling_longformer.LongformerBaseModelOutput with Longformer->LEDEncoder
class LEDEncoderBaseModelOutput(ModelOutput):
    """
    Base class for LEDEncoder's outputs, with potential hidden states, local and global attentions.
    r"""
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
        attention_window + 1)`, where `x` is the number of tokens with global attention mask.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
        Local attentions weights after the attention softmax, used to compute the weighted average in the
        self-attention heads. Those are the attention weights from every token in the sequence to every token with
        global attention (first `x` values) and to every token in the attention window (remaining `attention_window
        + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
        remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
        token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
        (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
        If the attention window contains a token with global attention, the attention weight at the corresponding
        index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
        attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be
        accessed from `global_attentions`.
    global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
        where `x` is the number of tokens with global attention mask.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
            attention_window + 1)`, where `x` is the number of tokens with global attention mask.

            Local attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token in the sequence to every token with
            global attention (first `x` values) and to every token in the attention window (remaining `attention_window
            + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
            remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
            token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
            (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
            If the attention window contains a token with global attention, the attention weight at the corresponding
            index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
            attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be
            accessed from `global_attentions`.
        global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

        Global attentions weights after the attention softmax, used to compute the weighted average in the
        self-attention heads. Those are the attention weights from every token with global attention to every token
        in the sequence.
            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    """

    last_hidden_state: torch.FloatTensor
@ -1176,60 +1171,32 @@ class LEDEncoderBaseModelOutput(ModelOutput):


@dataclass
class LEDSeq2SeqModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for model encoder's outputs that also contains: pre-computed hidden states that can speed up sequential
    decoding.
    """
)
class LEDSeq2SeqModelOutput(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_heads, sequence_length, embed_size_per_head)`).
        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
        where `x` is the number of tokens with global attention mask.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
        Global attentions weights after the attention softmax, used to compute the weighted average in the
        self-attention heads. Those are the attention weights from every token with global attention to every token
        in the sequence.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
@ -1244,58 +1211,30 @@ class LEDSeq2SeqModelOutput(ModelOutput):


@dataclass
class LEDSeq2SeqLMOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for sequence-to-sequence language models outputs.
    """
)
class LEDSeq2SeqLMOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_heads, sequence_length, embed_size_per_head)`).

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).
        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
        where `x` is the number of tokens with global attention mask.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
        Global attentions weights after the attention softmax, used to compute the weighted average in the
        self-attention heads. Those are the attention weights from every token with global attention to every token
        in the sequence.
    """

    loss: Optional[torch.FloatTensor] = None
@ -1311,58 +1250,30 @@ class LEDSeq2SeqLMOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):
|
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of sequence-to-sequence sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    """
)
class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `label` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_heads, sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
        where `x` is the number of tokens with global attention mask.

        Global attentions weights after the attention softmax, used to compute the weighted average in the
        self-attention heads. Those are the attention weights from every token with global attention to every token
        in the sequence.
    """

    loss: Optional[torch.FloatTensor] = None
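The same mechanical rewrite repeats for every output class in this commit: the hand-written intro moves into `@auto_docstring(custom_intro=...)`, and the `r"""` docstring keeps only the fields the decorator cannot document on its own. A minimal sketch of the pattern, using a hypothetical `MyTaskOutput` (the `transformers.utils` import path is an assumption; model files in the repo import these names relatively):

from dataclasses import dataclass
from typing import Optional

import torch

from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `MyTaskModel`.
    """
)
class MyTaskOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Task loss; common fields such as `hidden_states` and `attentions` are documented automatically.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None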
@ -1378,60 +1289,28 @@ class LEDSeq2SeqSequenceClassifierOutput(ModelOutput):


@dataclass
class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of sequence-to-sequence question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    """
)
class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_heads, sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
        where `x` is the number of tokens with global attention mask.

        Global attentions weights after the attention softmax, used to compute the weighted average in the
        self-attention heads. Those are the attention weights from every token with global attention to every token
        in the sequence.
    """

    loss: Optional[torch.FloatTensor] = None
@ -38,23 +38,21 @@ logger = logging.get_logger(__name__)


@dataclass
class LevitForImageClassificationWithTeacherOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`LevitForImageClassificationWithTeacher`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores as the average of the `cls_logits` and `distillation_logits`.
        cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
            class token).
        distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
            distillation token).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
    """
)
class LevitForImageClassificationWithTeacherOutput(ModelOutput):
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores as the average of the `cls_logits` and `distillation_logits`.
    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
        class token).
    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
        distillation token).
    """

    logits: Optional[torch.FloatTensor] = None
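As a reminder of the relationship the Levit docstring above states, the convenience `logits` field is simply the mean of the two heads; a toy sketch with random tensors, not the model's actual forward pass:

import torch

cls_logits = torch.randn(4, 1000)                  # classification-head scores
distillation_logits = torch.randn(4, 1000)         # distillation-head scores
logits = (cls_logits + distillation_logits) / 2    # the averaged `logits` field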
@ -36,36 +36,38 @@ from .configuration_lightglue import LightGlueConfig


@dataclass
class LightGlueKeypointMatchingOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of LightGlue keypoint matching models. Due to the nature of keypoint detection and matching,
    the number of keypoints is not fixed and can vary from image to image, which makes batching non-trivial. In the
    batch of images, the maximum number of matches is set as the dimension of the matches and matching scores. The mask
    tensor is used to indicate which values in the keypoints, matches, matching_scores and prune tensors are keypoint
    matching information.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
            Loss computed during training.
        matches (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
            Index of keypoint matched in the other image.
        matching_scores (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
            Scores of predicted matches.
        keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`):
            Absolute (x, y) coordinates of predicted keypoints in a given image.
        prune (`torch.IntTensor` of shape `(batch_size, num_keypoints)`):
            Pruning mask indicating which keypoints are removed and at which layer.
        mask (`torch.BoolTensor` of shape `(batch_size, num_keypoints)`):
            Mask indicating which values in matches, matching_scores, keypoints and prune are keypoint matching
            information.
        hidden_states (`Tuple[torch.FloatTensor, ...]`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, 2, num_channels,
            num_keypoints)` returned when `output_hidden_states=True` is passed or when
            `config.output_hidden_states=True`
        attentions (`Tuple[torch.FloatTensor, ...]`, *optional*):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, 2, num_heads, num_keypoints,
            num_keypoints)` returned when `output_attentions=True` is passed or when
            `config.output_attentions=True`
    """
)
class LightGlueKeypointMatchingOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
        Loss computed during training.
    matches (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
        Index of keypoint matched in the other image.
    matching_scores (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
        Scores of predicted matches.
    keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`):
        Absolute (x, y) coordinates of predicted keypoints in a given image.
    prune (`torch.IntTensor` of shape `(batch_size, num_keypoints)`):
        Pruning mask indicating which keypoints are removed and at which layer.
    mask (`torch.BoolTensor` of shape `(batch_size, num_keypoints)`):
        Mask indicating which values in matches, matching_scores, keypoints and prune are keypoint matching
        information.
    hidden_states (`Tuple[torch.FloatTensor, ...]`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, 2, num_channels,
        num_keypoints)` returned when `output_hidden_states=True` is passed or when
        `config.output_hidden_states=True`
    attentions (`Tuple[torch.FloatTensor, ...]`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, 2, num_heads, num_keypoints,
        num_keypoints)` returned when `output_attentions=True` is passed or when
        `config.output_attentions=True`
    """

    loss: Optional[torch.FloatTensor] = None
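The padding convention described in the LightGlue intro above (pad to the per-batch maximum, flag real entries with `mask`) can be made concrete with a small sketch; the shapes follow the docstring, and the exact pairing layout inside the real model may differ:

import torch

batch_size, num_keypoints = 2, 5
matches = torch.full((batch_size, 2, num_keypoints), -1)         # -1 marks "no match"/padding
mask = torch.zeros(batch_size, num_keypoints, dtype=torch.bool)
mask[0, :3] = True                          # only 3 of the 5 slots are real keypoints
valid_matches = matches[0, 0][mask[0]]      # drop padded entries for the first pair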
@ -37,9 +37,6 @@ from ..superpoint import SuperPointConfig

logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "LightGlueConfig"
_CHECKPOINT_FOR_DOC = "ETH-CVG/lightglue_superpoint"


class LightGlueConfig(PretrainedConfig):
    r"""
@ -158,36 +155,38 @@ class LightGlueConfig(PretrainedConfig):
@dataclass
class LightGlueKeypointMatchingOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of LightGlue keypoint matching models. Due to the nature of keypoint detection and matching,
    the number of keypoints is not fixed and can vary from image to image, which makes batching non-trivial. In the
    batch of images, the maximum number of matches is set as the dimension of the matches and matching scores. The mask
    tensor is used to indicate which values in the keypoints, matches, matching_scores and prune tensors are keypoint
    matching information.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
            Loss computed during training.
        matches (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
            Index of keypoint matched in the other image.
        matching_scores (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
            Scores of predicted matches.
        keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`):
            Absolute (x, y) coordinates of predicted keypoints in a given image.
        prune (`torch.IntTensor` of shape `(batch_size, num_keypoints)`):
            Pruning mask indicating which keypoints are removed and at which layer.
        mask (`torch.BoolTensor` of shape `(batch_size, num_keypoints)`):
            Mask indicating which values in matches, matching_scores, keypoints and prune are keypoint matching
            information.
        hidden_states (`Tuple[torch.FloatTensor, ...]`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, 2, num_channels,
            num_keypoints)` returned when `output_hidden_states=True` is passed or when
            `config.output_hidden_states=True`
        attentions (`Tuple[torch.FloatTensor, ...]`, *optional*):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, 2, num_heads, num_keypoints,
            num_keypoints)` returned when `output_attentions=True` is passed or when
            `config.output_attentions=True`
    """
)
class LightGlueKeypointMatchingOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
        Loss computed during training.
    matches (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
        Index of keypoint matched in the other image.
    matching_scores (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`):
        Scores of predicted matches.
    keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`):
        Absolute (x, y) coordinates of predicted keypoints in a given image.
    prune (`torch.IntTensor` of shape `(batch_size, num_keypoints)`):
        Pruning mask indicating which keypoints are removed and at which layer.
    mask (`torch.BoolTensor` of shape `(batch_size, num_keypoints)`):
        Mask indicating which values in matches, matching_scores, keypoints and prune are keypoint matching
        information.
    hidden_states (`Tuple[torch.FloatTensor, ...]`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, 2, num_channels,
        num_keypoints)` returned when `output_hidden_states=True` is passed or when
        `config.output_hidden_states=True`
    attentions (`Tuple[torch.FloatTensor, ...]`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, 2, num_heads, num_keypoints,
        num_keypoints)` returned when `output_attentions=True` is passed or when
        `config.output_attentions=True`
    """

    loss: Optional[torch.FloatTensor] = None
@ -719,35 +719,26 @@ class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):


@dataclass
class Llama4CausalLMOutputWithPast(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for Llava causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """
)
class Llama4CausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
@ -36,68 +36,48 @@ logger = logging.get_logger(__name__)


@dataclass
class LlavaModelOutputWithPast(BaseModelOutputWithPast):
    """
@auto_docstring(
    custom_intro="""
    Base class for Llava outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """
)
class LlavaModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class LlavaCausalLMOutputWithPast(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for Llava causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """
)
class LlavaCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
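Every `past_key_values` field in these Llava-family outputs serves the same purpose; a hedged sketch of the cached decoding loop it enables, where `model` stands in for any of these causal LMs and `input_ids` is a prepared prompt:

out = model(input_ids=input_ids, use_cache=True)
next_token = out.logits[:, -1].argmax(dim=-1, keepdim=True)
# Feed only the new token; the cache supplies the keys/values of the earlier positions.
out = model(input_ids=next_token, past_key_values=out.past_key_values, use_cache=True)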
@ -145,68 +145,48 @@ def unpad_image(tensor, original_size):


@dataclass
class LlavaNextModelOutputWithPast(BaseModelOutputWithPast):
    """
@auto_docstring(
    custom_intro="""
    Base class for Llava outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """
)
class LlavaNextModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class LlavaNextCausalLMOutputWithPast(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for LlavaNext causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """
)
class LlavaNextCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
@ -43,37 +43,25 @@ logger = logging.get_logger(__name__)


@dataclass
class LlavaNextVideoModelOutputWithPast(BaseModelOutputWithPast):
    """
@auto_docstring(
    custom_intro="""
    Base class for Llava outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

        video_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
            video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """
)
class LlavaNextVideoModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    video_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
        video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None
@ -82,39 +70,29 @@ class LlavaNextVideoModelOutputWithPast(BaseModelOutputWithPast):


@dataclass
class LlavaNextVideoCausalLMOutputWithPast(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for LlavaNextVideo causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

        video_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
            video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """
)
class LlavaNextVideoCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    video_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
        video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
@ -14,7 +14,6 @@
# limitations under the License.

import math
from dataclasses import dataclass
from typing import Optional, Union

import torch
@ -182,9 +181,17 @@ class LlavaNextVideoConfig(PretrainedConfig):
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


@dataclass
class LlavaNextVideoModelOutputWithPast(LlavaNextModelOutputWithPast):
    """
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    video_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
        video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
@ -193,9 +200,21 @@ class LlavaNextVideoModelOutputWithPast(LlavaNextModelOutputWithPast):
    video_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class LlavaNextVideoCausalLMOutputWithPast(LlavaNextCausalLMOutputWithPast):
    """
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    video_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
        video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
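A hedged usage sketch for the image/video fields documented above (`processor` and `model` are assumed to be a matching LlavaNextVideo pair; the call signature is an assumption, not taken from this diff):

inputs = processor(text=prompt, videos=video, return_tensors="pt")
out = model(**inputs)
image_feats = out.image_hidden_states    # None unless images were passed
video_feats = out.video_hidden_states    # projected vision-encoder states for the frames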
@ -49,37 +49,25 @@ logger = logging.get_logger(__name__)


@dataclass
class LlavaOnevisionModelOutputWithPast(BaseModelOutputWithPast):
    """
@auto_docstring(
    custom_intro="""
    Base class for Llava outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

        video_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
            video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """
)
class LlavaOnevisionModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    video_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
        video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None
@ -88,39 +76,29 @@ class LlavaOnevisionModelOutputWithPast(BaseModelOutputWithPast):


@dataclass
class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for LlavaOnevision causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

        video_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
            video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """
)
class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    video_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
        video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
@ -35,40 +35,35 @@ logger = logging.get_logger(__name__)


@dataclass
class LongformerBaseModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Longformer's outputs, with potential hidden states, local and global attentions.
"""
)
class LongformerBaseModelOutput(ModelOutput):
r"""
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
"""

last_hidden_state: torch.FloatTensor
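
The `x + attention_window + 1` layout documented above is easiest to check with concrete numbers. A minimal
sketch (the window size and global-token count below are hypothetical, not part of this diff):

# With attention_window = 512 and x = 1 token flagged in global_attention_mask:
x = 1
attention_window = 512
row_length = x + attention_window + 1   # 514 local-attention weights per query token
self_index = x + attention_window // 2  # a token's weight on itself sits at index 257
print(row_length, self_index)           # 514 257
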
@ -78,44 +73,39 @@ class LongformerBaseModelOutput(ModelOutput):


@dataclass
class LongformerBaseModelOutputWithPooling(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for Longformer's outputs that also contains a pooling of the last hidden states.
"""
)
class LongformerBaseModelOutputWithPooling(ModelOutput):
r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
"""

last_hidden_state: torch.FloatTensor
@ -126,42 +116,39 @@ class LongformerBaseModelOutputWithPooling(ModelOutput):


@dataclass
class LongformerMaskedLMOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for masked language model outputs.
"""
)
class LongformerMaskedLMOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked language modeling (MLM) loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked language modeling (MLM) loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
"""

loss: Optional[torch.FloatTensor] = None
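
A usage sketch for the fields documented above (the checkpoint, inputs, and printed shapes are illustrative,
not part of this diff):

import torch
from transformers import AutoTokenizer, LongformerForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")

inputs = tokenizer("Paris is the <mask> of France.", return_tensors="pt")
global_attention_mask = torch.zeros_like(inputs["input_ids"])
global_attention_mask[:, 0] = 1  # give the first token global attention, so x = 1

outputs = model(**inputs, global_attention_mask=global_attention_mask, output_attentions=True)
outputs.logits.shape                # (batch_size, sequence_length, vocab_size)
outputs.attentions[0].shape         # (batch_size, num_heads, sequence_length, x + attention_window + 1)
outputs.global_attentions[0].shape  # (batch_size, num_heads, sequence_length, x)
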
@ -172,44 +159,37 @@ class LongformerMaskedLMOutput(ModelOutput):


@dataclass
class LongformerQuestionAnsweringModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of question answering Longformer models.
"""
)
class LongformerQuestionAnsweringModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
"""

loss: Optional[torch.FloatTensor] = None
@ -221,42 +201,39 @@ class LongformerQuestionAnsweringModelOutput(ModelOutput):


@dataclass
class LongformerSequenceClassifierOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of sentence classification models.
"""
)
class LongformerSequenceClassifierOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
"""

loss: Optional[torch.FloatTensor] = None
@ -267,44 +244,41 @@ class LongformerSequenceClassifierOutput(ModelOutput):


@dataclass
class LongformerMultipleChoiceModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of multiple choice Longformer models.
"""
)
class LongformerMultipleChoiceModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
*num_choices* is the second dimension of the input tensors (see *input_ids* above).

Args:
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
*num_choices* is the second dimension of the input tensors (see *input_ids* above).
Classification scores (before SoftMax).
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Classification scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
"""

loss: Optional[torch.FloatTensor] = None
@ -315,42 +289,39 @@ class LongformerMultipleChoiceModelOutput(ModelOutput):


@dataclass
class LongformerTokenClassifierOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of token classification models.
"""
)
class LongformerTokenClassifierOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
attention_window + 1)`, where `x` is the number of tokens with global attention mask.

Local attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token in the sequence to every token with
global attention (first `x` values) and to every token in the attention window (remaining `attention_window
+ 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
(succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
If the attention window contains a token with global attention, the attention weight at the corresponding
index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
attention, the attention weights to all other tokens in `attentions` are set to 0; the values should be
accessed from `global_attentions`.
global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
where `x` is the number of tokens with global attention mask.

Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
Global attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads. Those are the attention weights from every token with global attention to every token
in the sequence.
"""

loss: Optional[torch.FloatTensor] = None
@ -36,30 +36,22 @@ logger = logging.get_logger(__name__)


@dataclass
class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the LUKE model.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
Sequence of entity hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length +
entity_length, sequence_length + entity_length)`. Attention weights after the attention softmax, used to
compute the weighted average in the self-attention heads.
"""
)
class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling):
r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function.
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
Sequence of entity hidden-states at the output of the last layer of the model.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

entity_last_hidden_state: Optional[torch.FloatTensor] = None
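
A usage sketch showing where the entity-specific fields above come from (the checkpoint and entity span are
illustrative, not part of this diff):

from transformers import LukeModel, LukeTokenizer

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
model = LukeModel.from_pretrained("studio-ousia/luke-base")

text = "Beyoncé lives in Los Angeles."
inputs = tokenizer(text, entity_spans=[(0, 7)], return_tensors="pt")  # character span of "Beyoncé"
outputs = model(**inputs)

outputs.last_hidden_state.shape         # (batch_size, sequence_length, hidden_size)
outputs.entity_last_hidden_state.shape  # (batch_size, entity_length, hidden_size), here entity_length = 1
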
@ -67,30 +59,19 @@ class BaseLukeModelOutputWithPooling(BaseModelOutputWithPooling):


@dataclass
@auto_docstring(
custom_intro="""
Base class for model's outputs, with potential hidden states and attentions.
"""
)
class BaseLukeModelOutput(BaseModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
Sequence of entity hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
r"""
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
Sequence of entity hidden-states at the output of the last layer of the model.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

entity_last_hidden_state: Optional[torch.FloatTensor] = None
@ -98,36 +79,27 @@ class BaseLukeModelOutput(BaseModelOutput):


@dataclass
class LukeMaskedLMOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for model's outputs, with potential hidden states and attentions.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
The sum of masked language modeling (MLM) loss and entity prediction loss.
mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked language modeling (MLM) loss.
mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked entity prediction (MEP) loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
)
class LukeMaskedLMOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
The sum of masked language modeling (MLM) loss and entity prediction loss.
mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked language modeling (MLM) loss.
mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Masked entity prediction (MEP) loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

loss: Optional[torch.FloatTensor] = None
@ -141,27 +113,21 @@ class LukeMaskedLMOutput(ModelOutput):


@dataclass
class EntityClassificationOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Outputs of entity classification models.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
)
class EntityClassificationOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

loss: Optional[torch.FloatTensor] = None
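
A usage sketch for an entity classification head that returns this output type (the fine-tuned checkpoint is
illustrative, not part of this diff):

from transformers import LukeForEntityClassification, LukeTokenizer

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")

text = "Beyoncé lives in Los Angeles."
inputs = tokenizer(text, entity_spans=[(0, 7)], return_tensors="pt")
outputs = model(**inputs)  # outputs.logits: (batch_size, config.num_labels)

predicted_class = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_class])  # e.g. "person"
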
@ -172,27 +138,21 @@ class EntityClassificationOutput(ModelOutput):


@dataclass
class EntityPairClassificationOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Outputs of entity pair classification models.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
)
class EntityPairClassificationOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

loss: Optional[torch.FloatTensor] = None
@ -203,27 +163,21 @@ class EntityPairClassificationOutput(ModelOutput):


@dataclass
class EntitySpanClassificationOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Outputs of entity span classification models.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
)
class EntitySpanClassificationOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
Classification scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

loss: Optional[torch.FloatTensor] = None
@ -234,30 +188,21 @@ class EntitySpanClassificationOutput(ModelOutput):


@dataclass
class LukeSequenceClassifierOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Outputs of sentence classification models.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
)
class LukeSequenceClassifierOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

loss: Optional[torch.FloatTensor] = None
@ -268,30 +213,21 @@ class LukeSequenceClassifierOutput(ModelOutput):


@dataclass
class LukeTokenClassifierOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of token classification models.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) :
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
)
class LukeTokenClassifierOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

loss: Optional[torch.FloatTensor] = None
@ -302,32 +238,19 @@ class LukeTokenClassifierOutput(ModelOutput):


@dataclass
class LukeQuestionAnsweringModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Outputs of question answering models.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
)
class LukeQuestionAnsweringModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

loss: Optional[torch.FloatTensor] = None
@ -339,32 +262,23 @@ class LukeQuestionAnsweringModelOutput(ModelOutput):


@dataclass
class LukeMultipleChoiceModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Outputs of multiple choice models.
"""
)
class LukeMultipleChoiceModelOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
*num_choices* is the second dimension of the input tensors. (see *input_ids* above).

Args:
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
*num_choices* is the second dimension of the input tensors. (see *input_ids* above).

Classification scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Classification scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
layer plus the initial entity embedding outputs.
"""

loss: Optional[torch.FloatTensor] = None
@ -42,39 +42,40 @@ class GeLU(nn.Module):


@dataclass
class LxmertModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
visual, and cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship
encoder")


Args:
language_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the language encoder.
vision_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the visual encoder.
pooled_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification, CLS, token) further processed
by a Linear layer and a Tanh activation function. The Linear
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
)
class LxmertModelOutput(ModelOutput):
r"""
language_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the language encoder.
vision_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the visual encoder.
pooled_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification, CLS, token) further processed
by a Linear layer and a Tanh activation function. The Linear
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""

language_output: Optional[torch.FloatTensor] = None
@ -88,34 +89,36 @@ class LxmertModelOutput(ModelOutput):


@dataclass
class LxmertForQuestionAnsweringOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`LxmertForQuestionAnswering`].

Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
)
class LxmertForQuestionAnsweringOutput(ModelOutput):
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""

loss: Optional[torch.FloatTensor] = None
@ -128,40 +131,41 @@ class LxmertForQuestionAnsweringOutput(ModelOutput):


@dataclass
class LxmertForPreTrainingOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Output type of [`LxmertForPreTraining`].

Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cross_relationship_score (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.

"""
)
class LxmertForPreTrainingOutput(ModelOutput):
r"""
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cross_relationship_score (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
shape `(batch_size, sequence_length, hidden_size)`.
language_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""

loss: Optional[torch.FloatTensor] = None
@ -429,23 +429,18 @@ class MambaPreTrainedModel(PreTrainedModel):


@dataclass
class MambaOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Class for the MAMBA model outputs.
"""
)
class MambaOutput(ModelOutput):
r"""
cache_params (`MambaCache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
cache_params (`MambaCache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.

Includes both the State space model state matrices after the selective scan, and the Convolutional states
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Includes both the State space model state matrices after the selective scan, and the Convolutional states
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -454,25 +449,22 @@ class MambaOutput(ModelOutput):


@dataclass
class MambaCausalLMOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Base class for causal language model (or autoregressive) outputs.
"""
)
class MambaCausalLMOutput(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cache_params (`MambaCache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cache_params (`MambaCache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.

Includes both the State space model state matrices after the selective scan, and the Convolutional states
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Includes both the State space model state matrices after the selective scan, and the Convolutional states
"""

loss: Optional[torch.FloatTensor] = None
@ -763,24 +763,19 @@ class Mamba2PreTrainedModel(PreTrainedModel):


@dataclass
@auto_docstring(
custom_intro="""
Class for the MAMBA2 model outputs.
"""
)
# Copied from transformers.models.mamba.modeling_mamba.MambaOutput with MAMBA->MAMBA2,Mamba->Mamba2
class Mamba2Output(ModelOutput):
"""
Class for the MAMBA2 model outputs.
r"""
cache_params (`Mamba2Cache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
cache_params (`Mamba2Cache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.

Includes both the State space model state matrices after the selective scan, and the Convolutional states
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Includes both the State space model state matrices after the selective scan, and the Convolutional states
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -789,26 +784,23 @@ class Mamba2Output(ModelOutput):


@dataclass
@auto_docstring(
custom_intro="""
Base class for causal language model (or autoregressive) outputs.
"""
)
# Copied from transformers.models.mamba.modeling_mamba.MambaCausalLMOutput with Mamba->Mamba2
class Mamba2CausalLMOutput(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cache_params (`Mamba2Cache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cache_params (`Mamba2Cache`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.

Includes both the State space model state matrices after the selective scan, and the Convolutional states
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Includes both the State space model state matrices after the selective scan, and the Convolutional states
"""

loss: Optional[torch.FloatTensor] = None
@ -44,22 +44,24 @@ logger = logging.get_logger(__name__)


@dataclass
class Mask2FormerPixelDecoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Mask2Former's pixel decoder module output, practically a Multi-Scale Deformable Attention based decoder. It returns
the mask features and the multiscale features.

Args:
multi_scale_features (`tuple(torch.FloatTensor)`):
Tuple of multi-scale features of scales [1/8, 1/16, 1/32] and shape `(batch_size, num_channels, height,
width)` from the Multi-Scale Deformable Attention based Pixel Decoder.
mask_features (`torch.FloatTensor`):
Tensor of shape `(batch_size, num_channels, height, width)`, 1/4 scale features from the last Pixel Decoder
Layer.
attentions (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights from pixel decoder. Returned when `output_attentions=True` is passed
or when `config.output_attentions=True`
"""
)
class Mask2FormerPixelDecoderOutput(ModelOutput):
r"""
multi_scale_features (`tuple(torch.FloatTensor)`):
Tuple of multi-scale features of scales [1/8, 1/16, 1/32] and shape `(batch_size, num_channels, height,
width)` from the Multi-Scale Deformable Attention based Pixel Decoder.
mask_features (`torch.FloatTensor`):
Tensor of shape `(batch_size, num_channels, height, width)`, 1/4 scale features from the last Pixel Decoder
Layer.
attentions (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights from pixel decoder. Returned when `output_attentions=True` is passed
or when `config.output_attentions=True`
"""

multi_scale_features: tuple[torch.FloatTensor] = None
@ -68,28 +70,28 @@ class Mask2FormerPixelDecoderOutput(ModelOutput):


@dataclass
class Mask2FormerMaskedAttentionDecoderOutput(BaseModelOutputWithCrossAttentions):
"""
@auto_docstring(
custom_intro="""
Base class for outputs of the Transformer decoder. This class adds two attributes to
BaseModelOutputWithCrossAttentions for mask predictions logits and a tuple of intermediate decoder activations,
i.e. the output of each decoder layer, each of them gone through a layernorm.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs. Returned when `output_hidden_states=True`.
attentions (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads. Returned when `output_attentions=True`.
masks_queries_logits (`tuple(torch.FloatTensor)` of shape `(batch_size, num_queries, height, width)`):
Tuple of mask predictions from all layers of the transformer decoder.
intermediate_hidden_states (`tuple(torch.FloatTensor)` of shape `(num_queries, 1, hidden_size)`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
"""
)
class Mask2FormerMaskedAttentionDecoderOutput(BaseModelOutputWithCrossAttentions):
r"""
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs. Returned when `output_hidden_states=True`.
attentions (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads. Returned when `output_attentions=True`.
masks_queries_logits (`tuple(torch.FloatTensor)` of shape `(batch_size, num_queries, height, width)`):
Tuple of mask predictions from all layers of the transformer decoder.
intermediate_hidden_states (`tuple(torch.FloatTensor)` of shape `(num_queries, 1, hidden_size)`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
@ -100,28 +102,30 @@ class Mask2FormerMaskedAttentionDecoderOutput(BaseModelOutputWithCrossAttentions


@dataclass
class Mask2FormerPixelLevelModuleOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Mask2Former's pixel level module output. It returns the output of the encoder (optional) and all hidden states
(multi-scale features) from the `decoder`. By default, the `encoder` is a Swin Backbone and the `decoder` is a
Multi-Scale Deformable Attention based decoder.

The `decoder_last_hidden_state` is the **per-pixel embeddings** while `decoder_hidden_states` refer to multi-scale
feature maps produced using **multi-scaling strategy** defined in the paper.

Args:
encoder_last_hidden_state (`torch.FloatTensor`):
Last hidden states (final feature map of shape `(batch_size, num_channels, height, width)`) of the last
stage of the encoder.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`. Hidden states (also
called feature maps) of the model at the output of each stage. Returned if output_hidden_states is set to
True.
decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
1/4 scale features from the last Pixel Decoder Layer.
decoder_hidden_states (`tuple(torch.FloatTensor)`):
Tuple of `torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`. Hidden states (also
called feature maps) of the model at the output of each stage.
"""
)
class Mask2FormerPixelLevelModuleOutput(ModelOutput):
r"""
encoder_last_hidden_state (`torch.FloatTensor`):
Last hidden states (final feature map of shape `(batch_size, num_channels, height, width)`) of the last
stage of the encoder.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`. Hidden states (also
called feature maps) of the model at the output of each stage. Returned if output_hidden_states is set to
True.
decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
1/4 scale features from the last Pixel Decoder Layer.
decoder_hidden_states (`tuple(torch.FloatTensor)`):
Tuple of `torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`. Hidden states (also
called feature maps) of the model at the output of each stage.
"""

encoder_last_hidden_state: Optional[torch.FloatTensor] = None
@ -131,38 +135,40 @@ class Mask2FormerPixelLevelModuleOutput(ModelOutput):


@dataclass
class Mask2FormerModelOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Class for outputs of [`Mask2FormerModel`]. This class returns all the needed hidden states to compute the logits.

Args:
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
Last hidden states (final feature map) of the last stage of the encoder model (backbone). Returned when
`output_hidden_states=True` is passed.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
model at the output of each stage. Returned when `output_hidden_states=True` is passed.
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
Last hidden states (final feature map) of the last stage of the pixel decoder model.
pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
decoder model at the output of each stage. Returned when `output_hidden_states=True` is passed.
transformer_decoder_last_hidden_state (`tuple(torch.FloatTensor)`):
Final output of the transformer decoder `(batch_size, sequence_length, hidden_size)`.
transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called feature maps) of the
transformer decoder at the output of each stage. Returned when `output_hidden_states=True` is passed.
transformer_decoder_intermediate_states (`tuple(torch.FloatTensor)` of shape `(num_queries, 1, hidden_size)`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
masks_queries_logits (`tuple(torch.FloatTensor)` of shape `(batch_size, num_queries, height, width)`):
Mask Predictions from each layer in the transformer decoder.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed):
Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Self attentions weights from transformer decoder.
"""
)
class Mask2FormerModelOutput(ModelOutput):
r"""
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
Last hidden states (final feature map) of the last stage of the encoder model (backbone). Returned when
`output_hidden_states=True` is passed.
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
Last hidden states (final feature map) of the last stage of the pixel decoder model.
transformer_decoder_last_hidden_state (`tuple(torch.FloatTensor)`):
Final output of the transformer decoder `(batch_size, sequence_length, hidden_size)`.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
model at the output of each stage. Returned when `output_hidden_states=True` is passed.
pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
decoder model at the output of each stage. Returned when `output_hidden_states=True` is passed.
transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called feature maps) of the
transformer decoder at the output of each stage. Returned when `output_hidden_states=True` is passed.
transformer_decoder_intermediate_states (`tuple(torch.FloatTensor)` of shape `(num_queries, 1, hidden_size)`):
Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
layernorm.
masks_queries_logits (`tuple(torch.FloatTensor)` of shape `(batch_size, num_queries, height, width)`):
Mask Predictions from each layer in the transformer decoder.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed):
Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Self attentions weights from transformer decoder.
"""

encoder_last_hidden_state: Optional[torch.FloatTensor] = None
@ -177,47 +183,49 @@ class Mask2FormerModelOutput(ModelOutput):


@dataclass
class Mask2FormerForUniversalSegmentationOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Class for outputs of [`Mask2FormerForUniversalSegmentation`].

This output can be directly passed to [`~Mask2FormerImageProcessor.post_process_semantic_segmentation`] or
[`~Mask2FormerImageProcessor.post_process_instance_segmentation`] or
[`~Mask2FormerImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please see
[`~Mask2FormerImageProcessor`] for details regarding usage.

Args:
loss (`torch.Tensor`, *optional*):
The computed loss, returned when labels are present.
class_queries_logits (`torch.FloatTensor`):
A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
query. Note the `+ 1` is needed because we incorporate the null class.
masks_queries_logits (`torch.FloatTensor`):
A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
query.
auxiliary_logits (`list[Dict(str, torch.FloatTensor)]`, *optional*):
List of class and mask predictions from each layer of the transformer decoder.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Last hidden states (final feature map) of the last stage of the encoder model (backbone).
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
model at the output of each stage.
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Last hidden states (final feature map) of the last stage of the pixel decoder model.
pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
decoder model at the output of each stage.
transformer_decoder_last_hidden_state (`tuple(torch.FloatTensor)`):
Final output of the transformer decoder `(batch_size, sequence_length, hidden_size)`.
transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called feature maps) of the
transformer decoder at the output of each stage.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Self and Cross Attentions weights from transformer decoder.
"""
)
class Mask2FormerForUniversalSegmentationOutput(ModelOutput):
r"""
loss (`torch.Tensor`, *optional*):
The computed loss, returned when labels are present.
class_queries_logits (`torch.FloatTensor`):
A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
query. Note the `+ 1` is needed because we incorporate the null class.
masks_queries_logits (`torch.FloatTensor`):
A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
query.
auxiliary_logits (`list[Dict(str, torch.FloatTensor)]`, *optional*):
List of class and mask predictions from each layer of the transformer decoder.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Last hidden states (final feature map) of the last stage of the encoder model (backbone).
pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Last hidden states (final feature map) of the last stage of the pixel decoder model.
transformer_decoder_last_hidden_state (`tuple(torch.FloatTensor)`):
Final output of the transformer decoder `(batch_size, sequence_length, hidden_size)`.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
model at the output of each stage.
pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
decoder model at the output of each stage.
transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called feature maps) of the
transformer decoder at the output of each stage.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Self and Cross Attentions weights from transformer decoder.
"""

loss: Optional[torch.FloatTensor] = None
|
@@ -53,59 +53,53 @@ logger = logging.get_logger(__name__)


@dataclass
# Copied from transformers.models.detr.modeling_detr.DetrDecoderOutput
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
    """
@auto_docstring(
    custom_intro="""
    Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
    """
)
# Copied from transformers.models.detr.modeling_detr.DetrDecoderOutput
class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
    r"""
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
        layernorm.
    """

    intermediate_hidden_states: Optional[torch.FloatTensor] = None

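Aside (not part of the diff): the pattern repeated throughout these hunks is that hand-written `Args:` blocks are deleted because `@auto_docstring` fills in the standard entries (`last_hidden_state`, `hidden_states`, `attentions`, ...) automatically; only the custom intro and the model-specific fields stay on the class. A minimal sketch of the convention, with a hypothetical class and fields:

import torch
from dataclasses import dataclass
from typing import Optional

from transformers.utils import ModelOutput, auto_docstring


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of a hypothetical `MyModel`.
    """
)
class MyModelOutput(ModelOutput):
    r"""
    extra_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*):
        A model-specific field the decorator cannot infer, so it keeps an explicit entry here.
    """

    loss: Optional[torch.FloatTensor] = None
    extra_logits: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None  # documented automatically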
@dataclass
class MaskFormerPixelLevelModuleOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    MaskFormer's pixel level module output. It returns both the last and (optionally) the hidden states from the
    `encoder` and `decoder`. By default, the `encoder` is a MaskFormerSwin Transformer and the `decoder` is a Feature
    Pyramid Network (FPN).

    The `encoder_last_hidden_state` is referred to in the paper as **image features**, while the
    `decoder_last_hidden_state` is referred to as **pixel embeddings**.

    Args:
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Last hidden states (final feature map) of the last stage of the encoder.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the model at
            the output of each stage.
        decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Last hidden states (final feature map) of the last stage of the decoder.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the model at
            the output of each stage.
    """
)
class MaskFormerPixelLevelModuleOutput(ModelOutput):
    r"""
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last stage of the encoder.
    decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last stage of the decoder.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the model at
        the output of each stage.
    decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the model at
        the output of each stage.
    """

    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
@@ -115,22 +109,16 @@ class MaskFormerPixelLevelModuleOutput(ModelOutput):


@dataclass
class MaskFormerPixelDecoderOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    MaskFormer's pixel decoder module output, practically a Feature Pyramid Network. It returns the last hidden state
    and (optionally) the hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Last hidden states (final feature map) of the last stage of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, num_channels, height, width)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights from Detr's decoder after the attention softmax, used to compute the
            weighted average in the self-attention heads.
    """
)
class MaskFormerPixelDecoderOutput(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last stage of the model.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
@@ -139,36 +127,34 @@ class MaskFormerPixelDecoderOutput(ModelOutput):


@dataclass
class MaskFormerModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Class for outputs of [`MaskFormerModel`]. This class returns all the needed hidden states to compute the logits.

    Args:
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Last hidden states (final feature map) of the last stage of the encoder model (backbone).
        pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Last hidden states (final feature map) of the last stage of the pixel decoder model (FPN).
        transformer_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Last hidden states (final feature map) of the last stage of the transformer decoder model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
            model at the output of each stage.
        pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
            decoder model at the output of each stage.
        transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called feature maps) of the
            transformer decoder at the output of each stage.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` containing `encoder_hidden_states`, `pixel_decoder_hidden_states` and
            `decoder_hidden_states`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights from Detr's decoder after the attention softmax, used to compute the
            weighted average in the self-attention heads.
    """
)
class MaskFormerModelOutput(ModelOutput):
    r"""
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last stage of the encoder model (backbone).
    pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last stage of the pixel decoder model (FPN).
    transformer_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Last hidden states (final feature map) of the last stage of the transformer decoder model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
        model at the output of each stage.
    pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
        decoder model at the output of each stage.
    transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called feature maps) of the
        transformer decoder at the output of each stage.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` containing `encoder_hidden_states`, `pixel_decoder_hidden_states` and
        `decoder_hidden_states`.
    """

    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
@@ -182,49 +168,49 @@ class MaskFormerModelOutput(ModelOutput):


@dataclass
class MaskFormerForInstanceSegmentationOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Class for outputs of [`MaskFormerForInstanceSegmentation`].

    This output can be directly passed to [`~MaskFormerImageProcessor.post_process_semantic_segmentation`] or
    [`~MaskFormerImageProcessor.post_process_instance_segmentation`] or
    [`~MaskFormerImageProcessor.post_process_panoptic_segmentation`] depending on the task. Please see
    [`~MaskFormerImageProcessor`] for details regarding usage.

    Args:
        loss (`torch.Tensor`, *optional*):
            The computed loss, returned when labels are present.
        class_queries_logits (`torch.FloatTensor`):
            A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
            query. Note the `+ 1` is needed because we incorporate the null class.
        masks_queries_logits (`torch.FloatTensor`):
            A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
            query.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Last hidden states (final feature map) of the last stage of the encoder model (backbone).
        pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Last hidden states (final feature map) of the last stage of the pixel decoder model (FPN).
        transformer_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Last hidden states (final feature map) of the last stage of the transformer decoder model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
            model at the output of each stage.
        pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
            decoder model at the output of each stage.
        transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the transformer decoder at the output
            of each stage.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` containing `encoder_hidden_states`, `pixel_decoder_hidden_states` and
            `decoder_hidden_states`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights from Detr's decoder after the attention softmax, used to compute the
            weighted average in the self-attention heads.
    """
)
class MaskFormerForInstanceSegmentationOutput(ModelOutput):
    r"""
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    auxiliary_logits (`Dict[str, torch.FloatTensor]`, *optional*, returned when `output_auxiliary_logits=True`):
        Dictionary containing auxiliary predictions for each decoder layer when auxiliary losses are enabled.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last stage of the encoder model (backbone).
    pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last stage of the pixel decoder model (FPN).
    transformer_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Last hidden states (final feature map) of the last stage of the transformer decoder model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
        model at the output of each stage.
    pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
        decoder model at the output of each stage.
    transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the transformer decoder at the output
        of each stage.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` containing `encoder_hidden_states`, `pixel_decoder_hidden_states` and
        `decoder_hidden_states`.
    """

    loss: Optional[torch.FloatTensor] = None
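Aside (not part of the diff): as the intro above says, this output feeds the image processor's post-processing directly. A sketch of the panoptic path, assuming the real facebook/maskformer-swin-base-coco checkpoint and a hypothetical local image:

import torch
from PIL import Image
from transformers import MaskFormerForInstanceSegmentation, MaskFormerImageProcessor

ckpt = "facebook/maskformer-swin-base-coco"
processor = MaskFormerImageProcessor.from_pretrained(ckpt)
model = MaskFormerForInstanceSegmentation.from_pretrained(ckpt)

image = Image.open("street.jpg")  # hypothetical input image
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)  # MaskFormerForInstanceSegmentationOutput

result = processor.post_process_panoptic_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]
segmentation = result["segmentation"]    # (height, width) map of segment ids
segments_info = result["segments_info"]  # per-segment label ids and scores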
@@ -30,36 +30,25 @@ from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import torch_int
from ...utils import auto_docstring, torch_int
from ...utils.backbone_utils import BackboneMixin
from .configuration_maskformer_swin import MaskFormerSwinConfig


@dataclass
class MaskFormerSwinModelOutputWithPooling(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Class for MaskFormerSwinModel's outputs that also contains the spatial dimensions of the hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state after a mean pooling operation.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*):
            A tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to
            `batch, channels, height, width`. Due to padding, their spatial size cannot be inferred before the
            `forward` method.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class MaskFormerSwinModelOutputWithPooling(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state after a mean pooling operation.
    hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*):
        A tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to
        `batch, channels, height, width`. Due to padding, their spatial size cannot be inferred before the
        `forward` method.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
@@ -70,28 +59,17 @@ class MaskFormerSwinModelOutputWithPooling(ModelOutput):


@dataclass
class MaskFormerSwinBaseModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Class for SwinEncoder's outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*):
            A tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to
            `batch, channels, height, width`. Due to padding, their spatial size cannot be inferred before the `forward`
            method.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class MaskFormerSwinBaseModelOutput(ModelOutput):
    r"""
    hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*):
        A tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to
        `batch, channels, height, width`. Due to padding, their spatial size cannot be inferred before the `forward`
        method.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
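Aside (not part of the diff): `hidden_states_spatial_dimensions` exists so callers can undo the `(batch, height * width, hidden)` flattening of each Swin stage, since padding makes the spatial size data-dependent. A minimal sketch of that reshape (function name hypothetical):

def unflatten_stage(hidden_state, spatial_dims):
    # hidden_state: (batch_size, height * width, hidden_size) from one stage
    # spatial_dims: the matching (height, width) entry of hidden_states_spatial_dimensions
    batch_size, _, hidden_size = hidden_state.shape
    height, width = spatial_dims
    # restore the (batch_size, hidden_size, height, width) feature-map layout
    return hidden_state.view(batch_size, height, width, hidden_size).permute(0, 3, 1, 2)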
@@ -759,12 +737,8 @@ class MaskFormerSwinEncoder(nn.Module):
        )


@auto_docstring
class MaskFormerSwinPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = MaskFormerSwinConfig
    base_model_prefix = "model"
    main_input_name = "pixel_values"
@@ -700,31 +700,22 @@ class MegatronBertPreTrainedModel(PreTrainedModel):


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`MegatronBertForPreTraining`].
    """
)
# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->MegatronBert
class MegatronBertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`MegatronBertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
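Aside (not part of the diff): these output dataclasses all inherit `ModelOutput` behavior: attribute, key, and positional access all work, and fields left as `None` are skipped in the tuple view. A small self-contained illustration (the toy class is hypothetical):

import torch
from dataclasses import dataclass
from typing import Optional

from transformers.utils import ModelOutput


@dataclass
class ToyPreTrainingOutput(ModelOutput):
    loss: Optional[torch.Tensor] = None
    prediction_logits: Optional[torch.Tensor] = None


out = ToyPreTrainingOutput(prediction_logits=torch.zeros(2, 4))
assert out.prediction_logits is out["prediction_logits"]
# `loss` was not set, so it is dropped from the tuple view and out[0]
# is `prediction_logits`, not the loss:
assert out[0] is out.prediction_logits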
@@ -69,35 +69,26 @@ class MgpstrDropPath(nn.Module):


@dataclass
class MgpstrModelOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class MgpstrModelOutput(ModelOutput):
    r"""
    logits (`tuple(torch.FloatTensor)` of shape `(batch_size, config.num_character_labels)`):
        Tuple of `torch.FloatTensor` (one for the output of character of shape `(batch_size,
        config.max_token_length, config.num_character_labels)`, + one for the output of bpe of shape `(batch_size,
        config.max_token_length, config.num_bpe_labels)`, + one for the output of wordpiece of shape `(batch_size,
        config.max_token_length, config.num_wordpiece_labels)`).

    Args:
        logits (`tuple(torch.FloatTensor)` of shape `(batch_size, config.num_character_labels)`):
            Tuple of `torch.FloatTensor` (one for the output of character of shape `(batch_size,
            config.max_token_length, config.num_character_labels)`, + one for the output of bpe of shape `(batch_size,
            config.max_token_length, config.num_bpe_labels)`, + one for the output of wordpiece of shape `(batch_size,
            config.max_token_length, config.num_wordpiece_labels)`).
        Classification scores (before SoftMax) of character, bpe and wordpiece.
    a3_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_a3_attentions=True` is passed or when `config.output_a3_attentions=True`):
        Tuple of `torch.FloatTensor` (one for the attention of character, + one for the attention of bpe, + one
        for the attention of wordpiece) of shape `(batch_size, config.max_token_length, sequence_length)`.

            Classification scores (before SoftMax) of character, bpe and wordpiece.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, config.max_token_length,
            sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        a3_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_a3_attentions=True` is passed or when `config.output_a3_attentions=True`):
            Tuple of `torch.FloatTensor` (one for the attention of character, + one for the attention of bpe, + one
            for the attention of wordpiece) of shape `(batch_size, config.max_token_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    """

    logits: tuple[torch.FloatTensor] = None
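Aside (not part of the diff): the three-way `logits` tuple above (character, bpe and wordpiece heads) is what `MgpstrProcessor.batch_decode` fuses into final strings. A sketch, assuming the real alibaba-damo/mgp-str-base checkpoint and a hypothetical cropped-word image:

import torch
from PIL import Image
from transformers import MgpstrForSceneTextRecognition, MgpstrProcessor

processor = MgpstrProcessor.from_pretrained("alibaba-damo/mgp-str-base")
model = MgpstrForSceneTextRecognition.from_pretrained("alibaba-damo/mgp-str-base")

image = Image.open("word_crop.png").convert("RGB")  # hypothetical input crop
pixel_values = processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    outputs = model(pixel_values)  # MgpstrModelOutput

# batch_decode merges the character, bpe and wordpiece heads into text
text = processor.batch_decode(outputs.logits)["generated_text"]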
@@ -47,29 +47,29 @@ logger = logging.get_logger(__name__)


@dataclass
@auto_docstring
class MimiOutput(ModelOutput):
    """
    Args:
        audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*)
            Decoded audio values, obtained using the decoder part of Mimi.
        encoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
            This typically consists of the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
    r"""
    audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
        Discrete code embeddings computed using `model.encode`.
    audio_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Decoded audio values, obtained using the decoder part of Mimi.
    encoder_past_key_values (`Cache`, *optional*):
        Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
        This typically consists of the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.
        The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
            have their past key value states given to this model).
        decoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
            This typically consists of the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
        If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
        have their past key value states given to this model).
    decoder_past_key_values (`Cache`, *optional*):
        Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
        This typically consists of the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.
        The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
            have their past key value states given to this model).
        If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
        have their past key value states given to this model).
    """

    audio_codes: Optional[torch.LongTensor] = None
@@ -79,19 +79,19 @@ class MimiOutput(ModelOutput):


@dataclass
@auto_docstring
class MimiEncoderOutput(ModelOutput):
    """
    Args:
        audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        encoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
            This typically consists of the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
    r"""
    audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
        Discrete code embeddings computed using `model.encode`.
    encoder_past_key_values (`Cache`, *optional*):
        Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
        This typically consists of the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.
        The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
            have their past key value states given to this model).
        If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
        have their past key value states given to this model).
    """

    audio_codes: Optional[torch.LongTensor] = None
@@ -99,19 +99,19 @@ class MimiEncoderOutput(ModelOutput):


@dataclass
@auto_docstring
class MimiDecoderOutput(ModelOutput):
    """
    Args:
        audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Mimi.
        decoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
            This typically consists of the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
    r"""
    audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
        Decoded audio values, obtained using the decoder part of Mimi.
    decoder_past_key_values (`Cache`, *optional*):
        Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
        This typically consists of the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.
        The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
            have their past key value states given to this model).
        If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
        have their past key value states given to this model).
    """

    audio_values: Optional[torch.FloatTensor] = None
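Aside (not part of the diff): the three Mimi outputs above mirror the encode/decode round trip. A sketch against the real kyutai/mimi checkpoint, using a second of silence as input:

import torch
from transformers import AutoFeatureExtractor, MimiModel

model = MimiModel.from_pretrained("kyutai/mimi")
feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")

raw_audio = torch.zeros(24000).numpy()  # 1 s at the model's 24 kHz sampling rate
inputs = feature_extractor(raw_audio=raw_audio,
                           sampling_rate=feature_extractor.sampling_rate,
                           return_tensors="pt")

encoder_out = model.encode(inputs["input_values"])  # MimiEncoderOutput
codes = encoder_out.audio_codes                     # (batch, num_quantizers, codes_length)
decoder_out = model.decode(codes)                   # MimiDecoderOutput
audio = decoder_out.audio_values                    # decoded waveform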
@@ -1209,17 +1209,6 @@ class MiniMaxForQuestionAnswering(MiniMaxPreTrainedModel):
        output_hidden_states: Optional[bool] = None,
        **kwargs,
    ) -> QuestionAnsweringModelOutput:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """

        outputs: BaseModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
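Aside (not part of the diff): the span-label docs removed here are now injected by `auto_docstring`; the arguments themselves are unchanged. A sketch of how `start_positions`/`end_positions` drive the loss (the answer-span indices are made up, and the QA head on a plain Mistral checkpoint is freshly initialized):

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForQuestionAnswering.from_pretrained(name)  # qa_outputs head is newly initialized

inputs = tokenizer("Who wrote it? Ada wrote it.", return_tensors="pt")
start_positions = torch.tensor([4])  # hypothetical start token index of the answer span
end_positions = torch.tensor([5])    # hypothetical end token index

outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
loss = outputs.loss  # cross-entropy over start and end logits, positions clamped as documented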
@@ -760,17 +760,6 @@ class MistralForQuestionAnswering(MistralPreTrainedModel):
        output_hidden_states: Optional[bool] = None,
        **kwargs,
    ) -> QuestionAnsweringModelOutput:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """

        outputs: BaseModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
@@ -29,8 +29,6 @@ from .configuration_mistral import MistralConfig

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1"


class MistralMLP(LlamaMLP):
    def __init__(self, config):
@@ -247,17 +245,6 @@ class MistralForQuestionAnswering(LlamaForQuestionAnswering):
        output_hidden_states: Optional[bool] = None,
        **kwargs,
    ) -> QuestionAnsweringModelOutput:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """

        outputs: BaseModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
@@ -123,35 +123,26 @@ class Mistral3MultiModalProjector(nn.Module):


@dataclass
class Mistral3CausalLMOutputWithPast(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Base class for Mistral3 causal language model (or autoregressive) outputs.
    """
)
class Mistral3CausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
@@ -163,33 +154,22 @@ class Mistral3CausalLMOutputWithPast(ModelOutput):


@dataclass
class Mistral3ModelOutputWithPast(BaseModelOutputWithPast):
    """
@auto_docstring(
    custom_intro="""
    Base class for Mistral3 outputs, with hidden states and attentions.
    """
)
class Mistral3ModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None
|
@ -992,17 +992,6 @@ class MixtralForQuestionAnswering(MixtralPreTrainedModel):
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
**kwargs,
|
||||
) -> QuestionAnsweringModelOutput:
|
||||
r"""
|
||||
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for position (index) of the start of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
|
||||
are not taken into account for computing the loss.
|
||||
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for position (index) of the end of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
|
||||
are not taken into account for computing the loss.
|
||||
"""
|
||||
|
||||
outputs: BaseModelOutputWithPast = self.model(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
|
@@ -1531,10 +1531,6 @@ class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin):
            For each text token (in seq_length):
            - 1 indicates the token **should attend** to the corresponding image tile
            - 0 indicates the token **should not attend** to the corresponding image tile
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        full_text_row_masked_out_mask (`tuple[torch.Tensor, torch.Tensor]`, *optional*):
            A tuple containing two tensors that mask out rows in the cross-attention mechanism:
            - The first tensor has shape `(batch_size, 1, seq_length, 1)` and contains values of 0 or 1.
@@ -1544,6 +1540,10 @@ class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin):
            the forward pass of cross-attention layers.
            This mask is derived from the cross_attention_mask and is used to handle cases where a text token
            should not attend to any image token.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:
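Aside (not part of the diff, which cuts off just before the docstring's own example): the `-100` convention documented for `labels` above is what makes loss masking work. A tiny illustration with made-up token ids:

import torch

input_ids = torch.tensor([[101, 7592, 2088, 102]])  # hypothetical token ids
labels = input_ids.clone()
prompt_length = 2
labels[:, :prompt_length] = -100  # ignored by the loss; only the remaining tokens are scored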
@@ -678,30 +678,21 @@ class MobileBertPreTrainedModel(PreTrainedModel):


@dataclass
class MobileBertForPreTrainingOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Output type of [`MobileBertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
)
class MobileBertForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
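Aside (not part of the diff): a short sketch of how the two logits fields of this pre-training output are consumed, using the real google/mobilebert-uncased checkpoint:

import torch
from transformers import AutoTokenizer, MobileBertForPreTraining

tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)  # MobileBertForPreTrainingOutput

prediction_logits = outputs.prediction_logits              # (batch, seq_len, vocab_size), MLM head
seq_relationship_logits = outputs.seq_relationship_logits  # (batch, 2), next-sentence head
# `loss` is None here because no labels were passed.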
@ -50,41 +50,43 @@ logger = logging.get_logger(__name__)


@dataclass
class MoshiConditionalGenerationGenerateOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Outputs of [`MoshiForConditionalGeneration.generate`].

Args:
audio_sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, 1, sequence_length)`, *optional*):
The generated audio waveforms.
sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
The generated text sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
if all batches finished early due to the `eos_token_id`.
sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
Final beam scores of the generated `sequences`.
scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True`):
Beam transition scores for each vocabulary token at each generation step, consisting
of the log probabilities of tokens conditioned on the log softmax of previously generated tokens in this beam.
Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_logits=True`):
Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
Beam indices of the generated token id at each generation step. `torch.LongTensor` of shape
`(batch_size*num_return_sequences, sequence_length)`.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
`torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
`torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True`):
Returns the model cache, used to speed up decoding. Different models have a different cache format; check
the model's documentation. Usually a [`~cache_utils.Cache`] instance.
audio_codes (`torch.LongTensor` of shape `(batch_size*num_return_sequences, num_codebooks, sequence_length)`, *optional*):
The generated audio codes. Returned if `return_audio_codes=True`. Intermediate audio "tokens" which transform
into `audio_sequences` once passed through the audio decoder.
"""
)
class MoshiConditionalGenerationGenerateOutput(ModelOutput):
r"""
audio_sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, 1, sequence_length)`, *optional*):
The generated audio waveforms.
sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
The generated text sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
if all batches finished early due to the `eos_token_id`.
sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
Final beam scores of the generated `sequences`.
scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True`):
Beam transition scores for each vocabulary token at each generation step, consisting
of the log probabilities of tokens conditioned on the log softmax of previously generated tokens in this beam.
Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_logits=True`):
Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
Beam indices of the generated token id at each generation step. `torch.LongTensor` of shape
`(batch_size*num_return_sequences, sequence_length)`.
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
`torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
`torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True`):
Contains the model cache, used to speed up decoding. Different models have a different cache format; check
the model's documentation. Usually a [`~cache_utils.Cache`] instance.
audio_codes (`torch.LongTensor` of shape `(batch_size*num_return_sequences, num_codebooks, sequence_length)`, *optional*):
The generated audio codes. Returned if `return_audio_codes=True`. Intermediate audio "tokens" which transform
into `audio_sequences` once passed through the audio decoder.
"""

audio_sequences: Optional[torch.Tensor] = None

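Since `MoshiConditionalGenerationGenerateOutput` is a `ModelOutput`, it supports attribute, dict-style, and tuple-style access, and `None` fields are skipped when converting to a tuple. A small usage sketch with dummy tensors standing in for a real `generate` call (the shapes are illustrative assumptions):

import torch
from transformers.models.moshi.modeling_moshi import MoshiConditionalGenerationGenerateOutput

# Dummy stand-ins for a real call to `MoshiForConditionalGeneration.generate`.
out = MoshiConditionalGenerationGenerateOutput(
    audio_sequences=torch.zeros(1, 1, 24000),        # (batch, 1, audio_length)
    sequences=torch.zeros(1, 12, dtype=torch.long),  # (batch, sequence_length)
)
print(out.audio_sequences.shape)  # attribute access
print(out["sequences"].shape)     # dict-style access
audio, text = out.to_tuple()      # tuple conversion drops the None fields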
@ -100,34 +102,23 @@ class MoshiConditionalGenerationGenerateOutput(ModelOutput):


@dataclass
class MoshiCausalLMOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
`MoshiForCausalLM` outputs.
"""
)
class MoshiCausalLMOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`.

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
"""

loss: Optional[torch.FloatTensor] = None

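The `past_key_values` field documented above exists so the cache from one forward pass can be fed back into the next, avoiding recomputation over the prefix. A runnable sketch of that flow; `gpt2` stands in here for any decoder-only LM in the library, since the cache contract is the same:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

input_ids = tokenizer("Hello", return_tensors="pt").input_ids
out = model(input_ids, use_cache=True)   # first pass builds the cache
next_id = out.logits[:, -1:].argmax(-1)  # greedy pick of the next token

# Second pass: feed only the new token, plus the cache from the first pass.
out2 = model(next_id, past_key_values=out.past_key_values, use_cache=True)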
@ -139,45 +130,34 @@ class MoshiCausalLMOutputWithPast(ModelOutput):


@dataclass
class MoshiConditionalGenerationOutputWithPast(ModelOutput):
"""
@auto_docstring(
custom_intro="""
`MoshiForConditionalGeneration` outputs.
"""
)
class MoshiConditionalGenerationOutputWithPast(ModelOutput):
r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `text_labels` is provided):
Text language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the text language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`.

Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `text_labels` is provided):
Text language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the text language modeling head (scores for each vocabulary token before SoftMax).
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`.

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
depth_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `audio_labels` is provided):
Audio language modeling loss (for next-token prediction).
audio_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the audio language modeling heads.
depth_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Past key-values of the depth decoder.
depth_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Hidden states of the depth decoder.
depth_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Depth decoder's attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
depth_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `audio_labels` is provided):
Audio language modeling loss (for next-token prediction).
audio_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the audio language modeling heads.
depth_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Past key-values of the depth decoder.
depth_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Hidden states of the depth decoder.
depth_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Depth decoder's attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""

loss: Optional[torch.FloatTensor] = None

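`MoshiConditionalGenerationOutputWithPast` carries two parallel streams: text logits from the main decoder and audio logits from the depth decoder, each with its own loss and cache. A construction sketch with dummy tensors (the vocabulary sizes below are invented for illustration; in a real checkpoint the text and audio heads have different, config-defined vocabularies):

import torch
from transformers.models.moshi.modeling_moshi import MoshiConditionalGenerationOutputWithPast

out = MoshiConditionalGenerationOutputWithPast(
    logits=torch.zeros(1, 10, 32000),       # text head: (batch, seq_len, text_vocab)
    audio_logits=torch.zeros(1, 10, 2048),  # depth decoder head: (batch, seq_len, audio_vocab)
)
# Text and audio predictions travel together in one output object.
print(out.logits.shape, out.audio_logits.shape)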
@ -194,18 +174,18 @@ class MoshiConditionalGenerationOutputWithPast(ModelOutput):


@dataclass
@auto_docstring
class MoshiUnconditionalInput(ModelOutput):
"""
Args:
input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
The sequence used as a text prompt for the generation.
user_audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, sequence_length)`, *optional*):
The audio codes used as the user's audio prompt for the generation. They take priority over `user_input_values` and represent the audio "tokens" of `user_input_values` once passed through the audio encoder.
moshi_audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, sequence_length)`, *optional*):
The audio codes used as Moshi's audio prompt for the generation. They take priority over `moshi_input_values` and represent the audio "tokens" of `moshi_input_values` once passed through the audio encoder.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Attention mask to avoid performing attention on padding token indices. Mask values selected in `[0,
1]`: 1 for tokens that are **not masked**, 0 for tokens that are **masked**.
r"""
input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
The sequence used as a text prompt for the generation.
user_audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, sequence_length)`, *optional*):
The audio codes used as the user's audio prompt for the generation. They take priority over `user_input_values` and represent the audio "tokens" of `user_input_values` once passed through the audio encoder.
moshi_audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, sequence_length)`, *optional*):
The audio codes used as Moshi's audio prompt for the generation. They take priority over `moshi_input_values` and represent the audio "tokens" of `moshi_input_values` once passed through the audio encoder.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Attention mask to avoid performing attention on padding token indices. Mask values selected in `[0,
1]`: 1 for tokens that are **not masked**, 0 for tokens that are **masked**.
"""

input_ids: Optional[torch.LongTensor] = None

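A construction sketch for `MoshiUnconditionalInput` with dummy prompts; `num_codebooks=8` and the sequence lengths are assumptions made for illustration, and in practice the codes come from the audio encoder and the model config:

import torch
from transformers.models.moshi.modeling_moshi import MoshiUnconditionalInput

unconditional = MoshiUnconditionalInput(
    input_ids=torch.zeros(1, 10, dtype=torch.long),            # text prompt tokens
    user_audio_codes=torch.zeros(1, 8, 10, dtype=torch.long),  # (batch, num_codebooks, seq_len)
    moshi_audio_codes=torch.zeros(1, 8, 10, dtype=torch.long),
    attention_mask=torch.ones(1, 10, dtype=torch.long),
)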
Some files were not shown because too many files have changed in this diff.