Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-04 05:10:06 +06:00)
Fix non-deterministic order in modular dependencies (#39005)

* sort correctly
* Update modeling_minimax.py
* Update modular_model_converter.py
This commit is contained in: parent bdf5fb70aa, commit e1e11b0299
@@ -14,6 +14,7 @@ class MyNewModelConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`MyNewModelModel`]. It is used to instantiate an MyNewModel
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the MyNewModel-7B.
+    e.g. [meta-my_new_model/MyNewModel-2-7b-hf](https://huggingface.co/meta-my_new_model/MyNewModel-2-7b-hf)
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -4,37 +4,25 @@
 # the file from the modular. If any change should be done, please apply the change to the
 # modular_dummy.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Union
+from typing import Callable, Optional
 
 import torch
 from torch import nn
 
 from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...cache_utils import Cache, DynamicCache
 from ...integrations import use_kernel_forward_from_hub
-from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import (
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    can_return_tuple,
-    is_torch_flex_attn_available,
-    logging,
-)
+from ...utils import auto_docstring, can_return_tuple, logging
 from .configuration_dummy import DummyConfig
 
 
-if is_torch_flex_attn_available():
-    from torch.nn.attention.flex_attention import BlockMask
-
-    from ...integrations.flex_attention import make_flex_block_causal_mask
-
-
 logger = logging.get_logger(__name__)
 
 
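The consolidated `from ...utils import auto_docstring, can_return_tuple, logging` line above pulls in the docstring decorator together with `can_return_tuple`, which lets `forward` honour `return_dict=False` without per-model boilerplate. A rough sketch of what a decorator of that kind does (a stand-in written for illustration, not the actual transformers implementation, which also consults `config.use_return_dict`):

```python
# Stand-in sketch of a `can_return_tuple`-style decorator: convert a
# ModelOutput-like mapping to a plain tuple when the caller passes return_dict=False.
import functools


def can_return_tuple_sketch(forward):
    @functools.wraps(forward)
    def wrapper(self, *args, return_dict=True, **kwargs):
        output = forward(self, *args, **kwargs)
        return output if return_dict else tuple(output.values())

    return wrapper


class TinyModel:
    @can_return_tuple_sketch
    def forward(self, x):
        return {"last_hidden_state": x}


print(TinyModel().forward(3, return_dict=False))  # -> (3,)
```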
@@ -232,15 +220,8 @@ class DummyAttention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
         attention_interface: Callable = eager_attention_forward
-
         if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
         attn_output, attn_weights = attention_interface(
             self,
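After this hunk the attention module no longer special-cases SDPA with `output_attentions=True`; it defaults to the eager implementation and otherwise looks the configured kernel up by name. A minimal sketch of that dispatch pattern, using a simplified stand-in registry rather than the real `ALL_ATTENTION_FUNCTIONS`:

```python
# Illustrative dispatch sketch; names and signatures here are simplified stand-ins.
from typing import Callable

import torch
import torch.nn.functional as F


def eager_attention_forward(query, key, value, scaling: float):
    # Plain softmax(QK^T * scaling) V attention that also returns the weights.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    attn_weights = F.softmax(attn_weights, dim=-1)
    return torch.matmul(attn_weights, value), attn_weights


def sdpa_attention_forward(query, key, value, scaling: float):
    # Fused kernel path (PyTorch >= 2.1 for the `scale` argument); no weights returned.
    return F.scaled_dot_product_attention(query, key, value, scale=scaling), None


ATTENTION_FUNCTIONS = {"sdpa": sdpa_attention_forward}


def pick_attention(attn_implementation: str) -> Callable:
    # Default to eager, otherwise look the kernel up by name, mirroring the
    # `ALL_ATTENTION_FUNCTIONS[...]` lookup in the diff above.
    if attn_implementation == "eager":
        return eager_attention_forward
    return ATTENTION_FUNCTIONS[attn_implementation]
```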
@@ -311,27 +292,7 @@ class DummyDecoderLayer(GradientCheckpointingLayer):
         return outputs
 
 
-DUMMY_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`DummyConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare Dummy Model outputting raw hidden-states without any specific head on top.",
-    DUMMY_START_DOCSTRING,
-)
+@auto_docstring
 class DummyPreTrainedModel(PreTrainedModel):
     config_class = DummyConfig
     base_model_prefix = "model"
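The hand-maintained `DUMMY_START_DOCSTRING` constant and its `@add_start_docstrings(...)` wrapper collapse into a single `@auto_docstring` decorator, which derives the standard intro and the `config` parameter description from the class itself. A hedged sketch of the idea behind such a decorator (a simplified stand-in, not the real implementation in `transformers.utils`):

```python
# Stand-in sketch: build the boilerplate intro from the class name and its
# `config_class` instead of hand-maintaining *_START_DOCSTRING constants.
def auto_docstring_sketch(cls):
    config_name = getattr(cls, "config_class", type(None)).__name__
    intro = (
        f"The bare {cls.__name__} outputting raw hidden-states without any specific head on top.\n\n"
        f"    Parameters:\n"
        f"        config (`{config_name}`): Model configuration class with all the parameters of the model."
    )
    cls.__doc__ = intro if cls.__doc__ is None else intro + "\n\n" + cls.__doc__
    return cls


@auto_docstring_sketch
class DummyPreTrainedModelExample:
    config_class = dict  # placeholder config class for the sketch
    base_model_prefix = "model"
```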
@@ -360,88 +321,8 @@ class DummyPreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)
 
 
-DUMMY_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
-            but you can also pass a `BlockMask` object directly here.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`Cache`, *optional*):
-            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
-            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
-            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
-
-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
-            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
-            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
-            the complete sequence length.
-"""
-
-
-@add_start_docstrings(
-    "The bare Dummy Model outputting raw hidden-states without any specific head on top.",
-    DUMMY_START_DOCSTRING,
-)
+@auto_docstring
 class DummyModel(DummyPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DummyDecoderLayer`]
-
-    Args:
-        config: DummyConfig
-    """
-
     def __init__(self, config: DummyConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
@@ -465,7 +346,7 @@ class DummyModel(DummyPreTrainedModel):
         self.embed_tokens = value
 
     @can_return_tuple
-    @add_start_docstrings_to_model_forward(DUMMY_INPUTS_DOCSTRING)
+    @auto_docstring
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -513,8 +394,12 @@ class DummyModel(DummyPreTrainedModel):
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
-        causal_mask = self._update_causal_mask(
-            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
         )
 
         hidden_states = inputs_embeds
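The per-model `_update_causal_mask` call is replaced by a single keyword-driven call to `create_causal_mask` from `...masking_utils`. For intuition, here is a rough sketch of the additive mask such a helper produces on the default eager/SDPA path, with a reduced, assumed signature (the real helper also returns `None` for flash-attention and a `BlockMask` for flex-attention, and handles padding):

```python
# Simplified sketch of an additive causal mask keyed off cache positions.
import torch


def causal_mask_sketch(seq_len: int, target_len: int, cache_position: torch.Tensor, dtype=torch.float32):
    min_dtype = torch.finfo(dtype).min
    # Start fully masked, then unmask every key position at or before each query's position.
    mask = torch.full((seq_len, target_len), fill_value=min_dtype, dtype=dtype)
    allowed = torch.arange(target_len)[None, :] <= cache_position[:, None]
    mask = mask.masked_fill(allowed, 0.0)
    return mask[None, None, :, :]  # shape (1, 1, seq_len, target_len)


# For a fresh 4-token prompt, each position attends only to itself and earlier positions:
print(causal_mask_sketch(4, 4, torch.arange(4)))
```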
@@ -559,126 +444,3 @@ class DummyModel(DummyPreTrainedModel):
             hidden_states=all_hidden_states,
             attentions=all_self_attns,
         )
-
-    def _update_causal_mask(
-        self,
-        attention_mask: Union[torch.Tensor, "BlockMask"],
-        input_tensor: torch.Tensor,
-        cache_position: torch.Tensor,
-        past_key_values: Cache,
-        output_attentions: bool = False,
-    ):
-        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and (attention_mask == 0.0).any():
-                return attention_mask
-            return None
-        if self.config._attn_implementation == "flex_attention":
-            if isinstance(attention_mask, torch.Tensor):
-                attention_mask = make_flex_block_causal_mask(attention_mask)
-            return attention_mask
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
-        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-        using_static_cache = isinstance(past_key_values, StaticCache)
-
-        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
-            if AttentionMaskConverter._ignore_causal_mask_sdpa(
-                attention_mask,
-                inputs_embeds=input_tensor,
-                past_key_values_length=past_seen_tokens,
-                is_training=self.training,
-            ):
-                return None
-
-        dtype = input_tensor.dtype
-        sequence_length = input_tensor.shape[1]
-        if using_static_cache:
-            target_length = past_key_values.get_max_cache_shape()
-        else:
-            target_length = (
-                attention_mask.shape[-1]
-                if isinstance(attention_mask, torch.Tensor)
-                else past_seen_tokens + sequence_length + 1
-            )
-
-        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
-            attention_mask,
-            sequence_length=sequence_length,
-            target_length=target_length,
-            dtype=dtype,
-            cache_position=cache_position,
-            batch_size=input_tensor.shape[0],
-        )
-
-        if (
-            self.config._attn_implementation == "sdpa"
-            and attention_mask is not None
-            and attention_mask.device.type in ["cuda", "xpu", "npu"]
-            and not output_attentions
-        ):
-            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
-            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
-            # Details: https://github.com/pytorch/pytorch/issues/110213
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
-
-        return causal_mask
-
-    @staticmethod
-    def _prepare_4d_causal_attention_mask_with_cache_position(
-        attention_mask: torch.Tensor,
-        sequence_length: int,
-        target_length: int,
-        dtype: torch.dtype,
-        cache_position: torch.Tensor,
-        batch_size: int,
-        **kwargs,
-    ):
-        """
-        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
-        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
-        Args:
-            attention_mask (`torch.Tensor`):
-                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
-                `(batch_size, 1, query_length, key_value_length)`.
-            sequence_length (`int`):
-                The sequence length being processed.
-            target_length (`int`):
-                The target length: when generating with static cache, the mask should be as long as the static cache,
-                to account for the 0 padding, the part of the cache that is not filled yet.
-            dtype (`torch.dtype`):
-                The dtype to use for the 4D attention mask.
-            cache_position (`torch.Tensor`):
-                Indices depicting the position of the input sequence tokens in the sequence.
-            batch_size (`torch.Tensor`):
-                Batch size.
-        """
-        if attention_mask is not None and attention_mask.dim() == 4:
-            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
-            causal_mask = attention_mask
-        else:
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = torch.full(
-                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
-            )
-            if sequence_length != 1:
-                causal_mask = torch.triu(causal_mask, diagonal=1)
-            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
-            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
-            if attention_mask is not None:
-                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
-                mask_length = attention_mask.shape[-1]
-                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
-                    causal_mask.device
-                )
-                padding_mask = padding_mask == 0
-                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
-                    padding_mask, min_dtype
-                )
-
-        return causal_mask
@@ -14,24 +14,16 @@ from torch import nn
 
 from ...activations import ACT2FN
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    get_torch_version,
-    logging,
-)
+from ...utils import auto_docstring, get_torch_version, logging
 from .configuration_dummy_bert import DummyBertConfig
 
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "google-dummy_bert/dummy_bert-base-uncased"
-_CONFIG_FOR_DOC = "DummyBertConfig"
-
 
 class DummyBertEmbeddings(nn.Module):
     """Construct the embeddings from word, position and token_type embeddings."""
@@ -432,7 +424,7 @@ class DummyBertOutput(nn.Module):
         return hidden_states
 
 
-class DummyBertLayer(nn.Module):
+class DummyBertLayer(GradientCheckpointingLayer):
     def __init__(self, config):
         super().__init__()
         self.chunk_size_feed_forward = config.chunk_size_feed_forward
@@ -557,27 +549,15 @@ class DummyBertEncoder(nn.Module):
             layer_head_mask = head_mask[i] if head_mask is not None else None
             past_key_value = past_key_values[i] if past_key_values is not None else None
 
-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    layer_module.__call__,
-                    hidden_states,
-                    attention_mask,
-                    layer_head_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    past_key_value,
-                    output_attentions,
-                )
-            else:
-                layer_outputs = layer_module(
-                    hidden_states,
-                    attention_mask,
-                    layer_head_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    past_key_value,
-                    output_attentions,
-                )
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+            )
 
             hidden_states = layer_outputs[0]
             if use_cache:
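Because `DummyBertLayer` now inherits from `GradientCheckpointingLayer`, the encoder loop no longer branches on `self.gradient_checkpointing and self.training`; the layer decides for itself whether to reroute its call through activation checkpointing. A minimal sketch of that pattern (an assumption-level illustration, not the actual `GradientCheckpointingLayer` implementation in transformers):

```python
# Sketch: a layer base class that transparently applies activation checkpointing
# when the surrounding model enables it, so call sites stay plain.
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint


class CheckpointingLayerSketch(nn.Module):
    gradient_checkpointing = False  # toggled by the surrounding model during training

    def __call__(self, *args, **kwargs):
        if self.gradient_checkpointing and self.training:
            # Recompute activations in the backward pass instead of storing them.
            return checkpoint(super().__call__, *args, use_reentrant=False, **kwargs)
        return super().__call__(*args, **kwargs)


class TinyLayer(CheckpointingLayerSketch):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(8, 8)

    def forward(self, hidden_states):
        return self.linear(hidden_states)
```

The `# as a positional argument for gradient checkpointing` comment in the new call above hints at the same constraint the sketch has: arguments that must reach the checkpointed function reliably are passed positionally.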
@@ -739,12 +719,8 @@ def load_tf_weights_in_dummy_bert(model, config, tf_checkpoint_path):
     return model
 
 
+@auto_docstring
 class DummyBertPreTrainedModel(PreTrainedModel):
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
     config_class = DummyBertConfig
     load_tf_weights = load_tf_weights_in_dummy_bert
     base_model_prefix = "dummy_bert"
@@ -770,79 +746,8 @@ class DummyBertPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
 
 
-DUMMY_BERT_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`DummyBertConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-DUMMY_BERT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `({0})`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.FloatTensor` of shape `({0})`or `(batch_size, sequence_length, target_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
-            1]`:
-
-            - 0 corresponds to a *sentence A* token,
-            - 1 corresponds to a *sentence B* token.
-
-            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.max_position_embeddings - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    "The bare DummyBert Model transformer outputting raw hidden-states without any specific head on top.",
-    DUMMY_BERT_START_DOCSTRING,
-)
-class DummyBertModel(DummyBertPreTrainedModel):
-    """
-
+@auto_docstring(
+    custom_intro="""
     The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
     cross-attention is added between the self-attention layers, following the architecture described in [Attention is
     all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
@@ -852,10 +757,15 @@ class DummyBertModel(DummyBertPreTrainedModel):
     to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
     `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
     """
-
+)
+class DummyBertModel(DummyBertPreTrainedModel):
     _no_split_modules = ["DummyBertEmbeddings", "DummyBertLayer"]
 
     def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
         super().__init__(config)
         self.config = config
 
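For `DummyBertModel`, the model-specific prose moves into `@auto_docstring(custom_intro="""...""")`, while an argument that is not part of the shared schema (`add_pooling_layer`) is documented directly in the method's own `r"""` docstring. A hedged sketch of that pattern with a stand-in decorator (placeholder names, not the generated code or the real `auto_docstring`):

```python
# Sketch of the `custom_intro` pattern: custom prose goes to the decorator,
# non-standard arguments are documented on the method itself.
def auto_docstring_sketch(custom_intro: str = ""):
    def wrap(cls):
        cls.__doc__ = custom_intro.strip() + "\n\n(Standard model intro would be appended here.)"
        return cls

    return wrap


@auto_docstring_sketch(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder.
    """
)
class TinyBertLikeModel:
    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        self.config = config
        self.add_pooling_layer = add_pooling_layer
```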
@@ -884,12 +794,7 @@ class DummyBertModel(DummyBertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    @add_start_docstrings_to_model_forward(DUMMY_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
+    @auto_docstring
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -906,26 +811,6 @@ class DummyBertModel(DummyBertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
-        r"""
-        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
-            the model is configured as a decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
-            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -10,6 +10,7 @@ import torch
 from torch import nn
 
 from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...utils import logging
 from .configuration_from_uppercase_model import FromUppercaseModelTextConfig, FromUppercaseModelVisionConfig
@@ -138,7 +139,7 @@ class FromUppercaseModelMLP(nn.Module):
         return hidden_states
 
 
-class FromUppercaseModelEncoderLayer(nn.Module):
+class FromUppercaseModelEncoderLayer(GradientCheckpointingLayer):
     def __init__(self, config: Union[FromUppercaseModelVisionConfig, FromUppercaseModelTextConfig]):
         super().__init__()
         self.embed_dim = config.hidden_size
@@ -4,37 +4,25 @@
 # the file from the modular. If any change should be done, please apply the change to the
 # modular_multimodal1.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Union
+from typing import Callable, Optional
 
 import torch
 from torch import nn
 
 from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...cache_utils import Cache, DynamicCache
 from ...integrations import use_kernel_forward_from_hub
-from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import (
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    can_return_tuple,
-    is_torch_flex_attn_available,
-    logging,
-)
+from ...utils import auto_docstring, can_return_tuple, logging
 from .configuration_multimodal1 import Multimodal1TextConfig
 
 
-if is_torch_flex_attn_available():
-    from torch.nn.attention.flex_attention import BlockMask
-
-    from ...integrations.flex_attention import make_flex_block_causal_mask
-
-
 logger = logging.get_logger(__name__)
 
 
@@ -232,15 +220,8 @@ class Multimodal1TextAttention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
         attention_interface: Callable = eager_attention_forward
-
         if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -311,27 +292,7 @@ class Multimodal1TextDecoderLayer(GradientCheckpointingLayer):
         return outputs
 
 
-MULTIMODAL1_TEXT_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`Multimodal1TextConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare Multimodal1Text Model outputting raw hidden-states without any specific head on top.",
-    MULTIMODAL1_TEXT_START_DOCSTRING,
-)
+@auto_docstring
 class Multimodal1TextPreTrainedModel(PreTrainedModel):
     config_class = Multimodal1TextConfig
     base_model_prefix = "model"
@@ -360,88 +321,8 @@ class Multimodal1TextPreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)
 
 
-MULTIMODAL1_TEXT_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
-            but you can also pass a `BlockMask` object directly here.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`Cache`, *optional*):
-            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
-            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
-            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
-
-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
-            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
-            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
-            the complete sequence length.
-"""
-
-
-@add_start_docstrings(
-    "The bare Multimodal1Text Model outputting raw hidden-states without any specific head on top.",
-    MULTIMODAL1_TEXT_START_DOCSTRING,
-)
+@auto_docstring
 class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Multimodal1TextDecoderLayer`]
-
-    Args:
-        config: Multimodal1TextConfig
-    """
-
     def __init__(self, config: Multimodal1TextConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
@@ -465,7 +346,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
         self.embed_tokens = value
 
     @can_return_tuple
-    @add_start_docstrings_to_model_forward(MULTIMODAL1_TEXT_INPUTS_DOCSTRING)
+    @auto_docstring
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -513,8 +394,12 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
-        causal_mask = self._update_causal_mask(
-            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
         )
 
         hidden_states = inputs_embeds
@@ -559,126 +444,3 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
             hidden_states=all_hidden_states,
             attentions=all_self_attns,
         )
-
-    def _update_causal_mask(
-        self,
-        attention_mask: Union[torch.Tensor, "BlockMask"],
-        input_tensor: torch.Tensor,
-        cache_position: torch.Tensor,
-        past_key_values: Cache,
-        output_attentions: bool = False,
-    ):
-        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and (attention_mask == 0.0).any():
-                return attention_mask
-            return None
-        if self.config._attn_implementation == "flex_attention":
-            if isinstance(attention_mask, torch.Tensor):
-                attention_mask = make_flex_block_causal_mask(attention_mask)
-            return attention_mask
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
-        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-        using_static_cache = isinstance(past_key_values, StaticCache)
-
-        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
-            if AttentionMaskConverter._ignore_causal_mask_sdpa(
-                attention_mask,
-                inputs_embeds=input_tensor,
-                past_key_values_length=past_seen_tokens,
-                is_training=self.training,
-            ):
-                return None
-
-        dtype = input_tensor.dtype
-        sequence_length = input_tensor.shape[1]
-        if using_static_cache:
-            target_length = past_key_values.get_max_cache_shape()
-        else:
-            target_length = (
-                attention_mask.shape[-1]
-                if isinstance(attention_mask, torch.Tensor)
-                else past_seen_tokens + sequence_length + 1
-            )
-
-        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
-            attention_mask,
-            sequence_length=sequence_length,
-            target_length=target_length,
-            dtype=dtype,
-            cache_position=cache_position,
-            batch_size=input_tensor.shape[0],
-        )
-
-        if (
-            self.config._attn_implementation == "sdpa"
-            and attention_mask is not None
-            and attention_mask.device.type in ["cuda", "xpu", "npu"]
-            and not output_attentions
-        ):
-            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
-            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
-            # Details: https://github.com/pytorch/pytorch/issues/110213
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
-
-        return causal_mask
-
-    @staticmethod
-    def _prepare_4d_causal_attention_mask_with_cache_position(
-        attention_mask: torch.Tensor,
-        sequence_length: int,
-        target_length: int,
-        dtype: torch.dtype,
-        cache_position: torch.Tensor,
-        batch_size: int,
-        **kwargs,
-    ):
-        """
-        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
-        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
-        Args:
-            attention_mask (`torch.Tensor`):
-                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
-                `(batch_size, 1, query_length, key_value_length)`.
-            sequence_length (`int`):
-                The sequence length being processed.
-            target_length (`int`):
-                The target length: when generating with static cache, the mask should be as long as the static cache,
-                to account for the 0 padding, the part of the cache that is not filled yet.
-            dtype (`torch.dtype`):
-                The dtype to use for the 4D attention mask.
-            cache_position (`torch.Tensor`):
-                Indices depicting the position of the input sequence tokens in the sequence.
-            batch_size (`torch.Tensor`):
-                Batch size.
-        """
-        if attention_mask is not None and attention_mask.dim() == 4:
-            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
-            causal_mask = attention_mask
-        else:
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = torch.full(
-                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
-            )
-            if sequence_length != 1:
-                causal_mask = torch.triu(causal_mask, diagonal=1)
-            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
-            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
-            if attention_mask is not None:
-                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
-                mask_length = attention_mask.shape[-1]
-                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
-                    causal_mask.device
-                )
-                padding_mask = padding_mask == 0
-                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
-                    padding_mask, min_dtype
-                )
-
-        return causal_mask
@ -13,15 +13,10 @@ from torch import nn
|
|||||||
from transformers.utils import add_start_docstrings
|
from transformers.utils import add_start_docstrings
|
||||||
|
|
||||||
from ...activations import ACT2FN
|
from ...activations import ACT2FN
|
||||||
|
from ...modeling_layers import GradientCheckpointingLayer
|
||||||
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
|
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
|
||||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||||
from ...utils import (
|
from ...utils import auto_docstring, can_return_tuple, logging, torch_int
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
can_return_tuple,
|
|
||||||
logging,
|
|
||||||
replace_return_docstrings,
|
|
||||||
torch_int,
|
|
||||||
)
|
|
||||||
from .configuration_multimodal2 import Multimodal2Config, Multimodal2TextConfig, Multimodal2VisionConfig
|
from .configuration_multimodal2 import Multimodal2Config, Multimodal2TextConfig, Multimodal2VisionConfig
|
||||||
|
|
||||||
|
|
||||||
@ -229,7 +224,7 @@ class Multimodal2Attention(nn.Module):
|
|||||||
return attn_output, attn_weights
|
return attn_output, attn_weights
|
||||||
|
|
||||||
|
|
||||||
class Multimodal2VisionEncoderLayer(nn.Module):
|
class Multimodal2VisionEncoderLayer(GradientCheckpointingLayer):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.embed_dim = config.hidden_size
|
self.embed_dim = config.hidden_size
|
||||||
@ -344,21 +339,12 @@ class Multimodal2VisionEncoder(nn.Module):
|
|||||||
for idx, encoder_layer in enumerate(self.layers):
|
for idx, encoder_layer in enumerate(self.layers):
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
encoder_states = encoder_states + (hidden_states,)
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
if self.gradient_checkpointing and self.training:
|
layer_outputs = encoder_layer(
|
||||||
layer_outputs = self._gradient_checkpointing_func(
|
hidden_states,
|
||||||
encoder_layer.__call__,
|
attention_mask,
|
||||||
hidden_states,
|
causal_attention_mask,
|
||||||
attention_mask,
|
output_attentions=output_attentions,
|
||||||
causal_attention_mask,
|
)
|
||||||
output_attentions,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
layer_outputs = encoder_layer(
|
|
||||||
hidden_states,
|
|
||||||
attention_mask,
|
|
||||||
causal_attention_mask,
|
|
||||||
output_attentions=output_attentions,
|
|
||||||
)
|
|
||||||
|
|
||||||
hidden_states = layer_outputs[0]
|
hidden_states = layer_outputs[0]
|
||||||
|
|
||||||
@ -458,24 +444,6 @@ class Multimodal2VisionEmbeddings(nn.Module):
|
|||||||
return embeddings
|
return embeddings
|
||||||
|
|
||||||
|
|
||||||
MULTIMODAL2_VISION_INPUTS_DOCSTRING = r"""
|
|
||||||
Args:
|
|
||||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
|
||||||
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
|
|
||||||
[`AutoImageProcessor`]. See [`Multimodal2ImageProcessor.__call__`] for details.
|
|
||||||
output_attentions (`bool`, *optional*):
|
|
||||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
|
||||||
tensors for more detail.
|
|
||||||
output_hidden_states (`bool`, *optional*):
|
|
||||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
|
||||||
more detail.
|
|
||||||
interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
|
|
||||||
Whether to interpolate the pre-trained position encodings.
|
|
||||||
return_dict (`bool`, *optional*):
|
|
||||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class Multimodal2VisionTransformer(nn.Module):
|
class Multimodal2VisionTransformer(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@ -488,8 +456,7 @@ class Multimodal2VisionTransformer(nn.Module):
|
|||||||
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||||
|
|
||||||
@can_return_tuple
|
@can_return_tuple
|
||||||
@add_start_docstrings_to_model_forward(MULTIMODAL2_VISION_INPUTS_DOCSTRING)
|
@auto_docstring
|
||||||
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Multimodal2VisionConfig)
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
pixel_values: Optional[torch.FloatTensor] = None,
|
pixel_values: Optional[torch.FloatTensor] = None,
|
||||||
@ -497,10 +464,6 @@ class Multimodal2VisionTransformer(nn.Module):
|
|||||||
output_hidden_states: Optional[bool] = None,
|
output_hidden_states: Optional[bool] = None,
|
||||||
interpolate_pos_encoding: Optional[bool] = False,
|
interpolate_pos_encoding: Optional[bool] = False,
|
||||||
) -> BaseModelOutputWithPooling:
|
) -> BaseModelOutputWithPooling:
|
||||||
r"""
|
|
||||||
Returns:
|
|
||||||
|
|
||||||
"""
|
|
||||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||||
output_hidden_states = (
|
output_hidden_states = (
|
||||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||||
@ -530,17 +493,15 @@ class Multimodal2VisionTransformer(nn.Module):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@auto_docstring
|
||||||
class Multimodal2VisionPreTrainedModel(PreTrainedModel):
|
class Multimodal2VisionPreTrainedModel(PreTrainedModel):
|
||||||
"""
|
|
||||||
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
|
||||||
models.
|
|
||||||
"""
|
|
||||||
|
|
||||||
config_class = Multimodal2Config
|
config_class = Multimodal2Config
|
||||||
base_model_prefix = "multimodal2_vision"
|
base_model_prefix = "multimodal2_vision"
|
||||||
supports_gradient_checkpointing = True
|
supports_gradient_checkpointing = True
|
||||||
_supports_sdpa = True
|
_supports_sdpa = True
|
||||||
_supports_flash_attn_2 = True
|
_supports_flash_attn_2 = True
|
||||||
|
_supports_flex_attn = True
|
||||||
|
_supports_attention_backend = True
|
||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
"""Initialize the weights"""
|
"""Initialize the weights"""
|
||||||
@ -567,8 +528,7 @@ class Multimodal2VisionModel(Multimodal2VisionPreTrainedModel):
|
|||||||
return self.vision_model.embeddings.patch_embedding
|
return self.vision_model.embeddings.patch_embedding
|
||||||
|
|
||||||
@can_return_tuple
|
@can_return_tuple
|
||||||
@add_start_docstrings_to_model_forward(MULTIMODAL2_VISION_INPUTS_DOCSTRING)
|
@auto_docstring
|
||||||
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Multimodal2VisionConfig)
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
pixel_values: Optional[torch.FloatTensor] = None,
|
pixel_values: Optional[torch.FloatTensor] = None,
|
||||||
@ -577,9 +537,7 @@ class Multimodal2VisionModel(Multimodal2VisionPreTrainedModel):
|
|||||||
interpolate_pos_encoding: bool = False,
|
interpolate_pos_encoding: bool = False,
|
||||||
) -> BaseModelOutputWithPooling:
|
) -> BaseModelOutputWithPooling:
|
||||||
r"""
|
r"""
|
||||||
Returns:
|
Example:
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
>>> from PIL import Image
|
>>> from PIL import Image
|
||||||
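The hunks above swap handwritten docstring constants and `add_start_docstrings_*` decorators for `auto_docstring`. A rough sketch of how a decorator can assemble argument documentation from the signature is shown below; this is a toy illustration under assumed names, not the actual implementation in `transformers.utils`:

```python
# Toy docstring-filling decorator; the real `auto_docstring` is driven by the library's
# own argument registry and model configuration.
import inspect

ARG_DOCS = {  # hypothetical central registry of argument descriptions
    "pixel_values": "Pixel values obtained from an image processor.",
    "output_attentions": "Whether to return attention weights for all layers.",
}


def auto_docstring_sketch(func):
    lines = [func.__doc__ or "", "", "Args:"]
    for name in inspect.signature(func).parameters:
        if name in ARG_DOCS:
            lines.append(f"    {name}: {ARG_DOCS[name]}")
    func.__doc__ = "\n".join(lines)
    return func


@auto_docstring_sketch
def forward(pixel_values=None, output_attentions=None):
    """Runs the vision transformer."""


print(forward.__doc__)
```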
@@ -4,36 +4,24 @@
 # the file from the modular. If any change should be done, please apply the change to the
 # modular_my_new_model2.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Union
+from typing import Callable, Optional

 import torch
 from torch import nn

 from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...cache_utils import Cache, DynamicCache
-from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, SequenceClassifierOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import (
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    can_return_tuple,
-    is_torch_flex_attn_available,
-    logging,
-)
+from ...utils import auto_docstring, can_return_tuple, logging
 from .configuration_my_new_model2 import MyNewModel2Config


-if is_torch_flex_attn_available():
-    from torch.nn.attention.flex_attention import BlockMask
-
-    from ...integrations.flex_attention import make_flex_block_causal_mask
-
-
 logger = logging.get_logger(__name__)


@@ -230,15 +218,8 @@ class MyNewModel2Attention(nn.Module):
         key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward

         if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
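The simplified branch above dispatches through a name-keyed registry of attention callables. A self-contained sketch of that dispatch pattern, with illustrative names (only `ALL_ATTENTION_FUNCTIONS` appears in the diff itself):

```python
# Registry-style dispatch: map an implementation name to a callable with a shared signature.
from types import SimpleNamespace
from typing import Callable


def eager_attention_forward(module, query, key, value, **kwargs):
    ...  # plain softmax(QK^T)V attention would go here


def sdpa_attention_forward(module, query, key, value, **kwargs):
    ...  # would wrap torch.nn.functional.scaled_dot_product_attention


ATTENTION_FUNCTIONS: dict[str, Callable] = {
    "eager": eager_attention_forward,
    "sdpa": sdpa_attention_forward,
}


def pick_attention(config) -> Callable:
    # Mirrors the hunk above: default to eager, otherwise look the backend up by name.
    if config._attn_implementation != "eager":
        return ATTENTION_FUNCTIONS[config._attn_implementation]
    return eager_attention_forward


print(pick_attention(SimpleNamespace(_attn_implementation="sdpa")).__name__)
```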
@@ -309,27 +290,7 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
         return outputs


-MY_NEW_MODEL2_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`MyNewModel2Config`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare MyNewModel2 Model outputting raw hidden-states without any specific head on top.",
-    MY_NEW_MODEL2_START_DOCSTRING,
-)
+@auto_docstring
 class MyNewModel2PreTrainedModel(PreTrainedModel):
     config_class = MyNewModel2Config
     base_model_prefix = "model"
@@ -358,88 +319,8 @@ class MyNewModel2PreTrainedModel(PreTrainedModel):
         module.weight.data.fill_(1.0)


-MY_NEW_MODEL2_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
-            but you can also pass a `BlockMask` object directly here.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`Cache`, *optional*):
-            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
-            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
-            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
-
-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
-            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
-            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
-            the complete sequence length.
-"""
-
-
-@add_start_docstrings(
-    "The bare MyNewModel2 Model outputting raw hidden-states without any specific head on top.",
-    MY_NEW_MODEL2_START_DOCSTRING,
-)
+@auto_docstring
 class MyNewModel2Model(MyNewModel2PreTrainedModel):
-    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MyNewModel2DecoderLayer`]
-
-    Args:
-        config: MyNewModel2Config
-    """
-
     def __init__(self, config: MyNewModel2Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
@@ -463,19 +344,19 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
         self.embed_tokens = value

     @can_return_tuple
-    @add_start_docstrings_to_model_forward(MY_NEW_MODEL2_INPUTS_DOCSTRING)
+    @auto_docstring
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs,  # NOOP kwarg for now
+        **kwargs: Unpack[FlashAttentionKwargs],
     ) -> BaseModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -507,8 +388,12 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)

-        causal_mask = self._update_causal_mask(
-            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
         )

         # embed positions
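The new `create_causal_mask` call replaces the hand-rolled `_update_causal_mask` path that is deleted further down. Conceptually the helper produces the same kind of 4D additive mask; a toy re-implementation of that construction is sketched below (illustrative only, not the real `transformers.masking_utils.create_causal_mask`, and it ignores caches and backend-specific shortcuts):

```python
# Build a (batch, 1, query_len, kv_len) float mask with the dtype's minimum on
# disallowed positions: future tokens and padding tokens.
import torch


def create_causal_mask_sketch(attention_mask: torch.Tensor, dtype=torch.float32) -> torch.Tensor:
    batch, seq_len = attention_mask.shape
    min_dtype = torch.finfo(dtype).min
    # Upper-triangular part (future positions) is masked out.
    causal = torch.triu(torch.full((seq_len, seq_len), min_dtype, dtype=dtype), diagonal=1)
    causal = causal[None, None, :, :].expand(batch, 1, -1, -1).clone()
    # Additionally mask padding positions from the 2D attention mask.
    causal.masked_fill_(attention_mask[:, None, None, :] == 0, min_dtype)
    return causal


mask = create_causal_mask_sketch(torch.tensor([[1, 1, 1, 0]]))
print(mask.shape)  # torch.Size([1, 1, 4, 4])
```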
@@ -540,6 +425,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
                 use_cache=use_cache,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
+                **kwargs,
             )

             hidden_states = layer_outputs[0]
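Together with the `**kwargs: Unpack[FlashAttentionKwargs]` signature change above, the decoder now simply threads attention keyword arguments through each layer. A small self-contained illustration of the `Unpack`-typed `**kwargs` pattern follows; the TypedDict fields are illustrative, not the exact contents of `FlashAttentionKwargs`, and `typing_extensions` is assumed to be available:

```python
from typing import TypedDict

import torch
from typing_extensions import Unpack


class AttnKwargs(TypedDict, total=False):
    # Illustrative fields; the real FlashAttentionKwargs carries flash-attention metadata.
    dropout: float
    scaling: float


def attention(query: torch.Tensor, **kwargs: Unpack[AttnKwargs]) -> torch.Tensor:
    scale = kwargs.get("scaling", 1.0)
    return query * scale


def decoder_layer(hidden_states: torch.Tensor, **kwargs: Unpack[AttnKwargs]) -> torch.Tensor:
    # The layer does not inspect the kwargs; it just forwards them, as in the hunk above.
    return attention(hidden_states, **kwargs)


print(decoder_layer(torch.ones(2, 3), scaling=0.5))
```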
@@ -560,132 +446,9 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
             attentions=all_self_attns,
         )

-    def _update_causal_mask(
-        self,
-        attention_mask: Union[torch.Tensor, "BlockMask"],
-        input_tensor: torch.Tensor,
-        cache_position: torch.Tensor,
-        past_key_values: Cache,
-        output_attentions: bool = False,
-    ):
-        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and (attention_mask == 0.0).any():
-                return attention_mask
-            return None
-        if self.config._attn_implementation == "flex_attention":
-            if isinstance(attention_mask, torch.Tensor):
-                attention_mask = make_flex_block_causal_mask(attention_mask)
-            return attention_mask
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
-        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-        using_static_cache = isinstance(past_key_values, StaticCache)
-
-        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
-            if AttentionMaskConverter._ignore_causal_mask_sdpa(
-                attention_mask,
-                inputs_embeds=input_tensor,
-                past_key_values_length=past_seen_tokens,
-                is_training=self.training,
-            ):
-                return None
-
-        dtype = input_tensor.dtype
-        sequence_length = input_tensor.shape[1]
-        if using_static_cache:
-            target_length = past_key_values.get_max_cache_shape()
-        else:
-            target_length = (
-                attention_mask.shape[-1]
-                if isinstance(attention_mask, torch.Tensor)
-                else past_seen_tokens + sequence_length + 1
-            )
-
-        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
-            attention_mask,
-            sequence_length=sequence_length,
-            target_length=target_length,
-            dtype=dtype,
-            cache_position=cache_position,
-            batch_size=input_tensor.shape[0],
-        )
-
-        if (
-            self.config._attn_implementation == "sdpa"
-            and attention_mask is not None
-            and attention_mask.device.type in ["cuda", "xpu", "npu"]
-            and not output_attentions
-        ):
-            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
-            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
-            # Details: https://github.com/pytorch/pytorch/issues/110213
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
-
-        return causal_mask
-
-    @staticmethod
-    def _prepare_4d_causal_attention_mask_with_cache_position(
-        attention_mask: torch.Tensor,
-        sequence_length: int,
-        target_length: int,
-        dtype: torch.dtype,
-        cache_position: torch.Tensor,
-        batch_size: int,
-        **kwargs,
-    ):
-        """
-        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
-        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
-        Args:
-            attention_mask (`torch.Tensor`):
-                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
-                `(batch_size, 1, query_length, key_value_length)`.
-            sequence_length (`int`):
-                The sequence length being processed.
-            target_length (`int`):
-                The target length: when generating with static cache, the mask should be as long as the static cache,
-                to account for the 0 padding, the part of the cache that is not filled yet.
-            dtype (`torch.dtype`):
-                The dtype to use for the 4D attention mask.
-            cache_position (`torch.Tensor`):
-                Indices depicting the position of the input sequence tokens in the sequence.
-            batch_size (`torch.Tensor`):
-                Batch size.
-        """
-        if attention_mask is not None and attention_mask.dim() == 4:
-            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
-            causal_mask = attention_mask
-        else:
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = torch.full(
-                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
-            )
-            if sequence_length != 1:
-                causal_mask = torch.triu(causal_mask, diagonal=1)
-            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
-            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
-            if attention_mask is not None:
-                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
-                mask_length = attention_mask.shape[-1]
-                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
-                    causal_mask.device
-                )
-                padding_mask = padding_mask == 0
-                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
-                    padding_mask, min_dtype
-                )
-
-        return causal_mask
-
-
-@add_start_docstrings(
-    """
+@auto_docstring(
+    custom_intro="""
     The MyNewModel2 Model transformer with a sequence classification head on top (linear layer).

     [`MyNewModel2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
@@ -696,8 +459,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
-    """,
-    MY_NEW_MODEL2_START_DOCSTRING,
+    """
 )
 class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
     def __init__(self, config):
@@ -716,7 +478,7 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
         self.model.embed_tokens = value

     @can_return_tuple
-    @add_start_docstrings_to_model_forward(MY_NEW_MODEL2_INPUTS_DOCSTRING)
+    @auto_docstring
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -22,68 +22,48 @@ from .configuration_new_task_model import NewTaskModelConfig


 @dataclass
-class NewTaskModelModelOutputWithPast(BaseModelOutputWithPast):
-    """
+@auto_docstring(
+    custom_intro="""
     Base class for NewTaskModel outputs, with hidden states and attentions.
+    """
+)
+class NewTaskModelModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        image_hidden_states (`torch.FloatTensor`, *optional*):
-            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
-            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
     """

     image_hidden_states: Optional[torch.FloatTensor] = None


 @dataclass
-class NewTaskModelCausalLMOutputWithPast(ModelOutput):
-    """
+@auto_docstring(
+    custom_intro="""
     Base class for NewTaskModel causal language model (or autoregressive) outputs.
+    """
+)
+class NewTaskModelCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Language modeling loss (for next-token prediction).
-        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        image_hidden_states (`torch.FloatTensor`, *optional*):
-            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
-            image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
     """

     loss: Optional[torch.FloatTensor] = None
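The output dataclasses above now combine `@auto_docstring(custom_intro=...)` with a class-level `r"""` block that documents only the model-specific fields. A toy stand-in for that pattern, purely illustrative of how an intro can be prepended to the field documentation (not the real decorator):

```python
from dataclasses import dataclass
from typing import Optional


def auto_docstring_sketch(custom_intro: str = ""):
    def wrap(cls):
        # Prepend the intro to whatever field documentation the class already carries.
        cls.__doc__ = custom_intro.strip() + "\n\n" + (cls.__doc__ or "")
        return cls

    return wrap


@dataclass
@auto_docstring_sketch(custom_intro="Base class for toy model outputs.")
class ToyOutput:
    r"""
    image_hidden_states (`FloatTensor`, *optional*):
        Vision-encoder features after projection.
    """

    image_hidden_states: Optional[object] = None


print(ToyOutput.__doc__.splitlines()[0])  # -> Base class for toy model outputs.
```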
@@ -157,6 +137,12 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
     def set_input_embeddings(self, value):
         self.language_model.set_input_embeddings(value)

+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
     def _update_causal_mask(
         self,
         attention_mask,
@@ -406,10 +392,13 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
         self.lm_head = new_embeddings

     def set_decoder(self, decoder):
-        self.model = decoder
+        self.model.set_decoder(decoder)

     def get_decoder(self):
-        return self.model
+        return self.model.get_decoder()

+    def get_image_features(self, pixel_values):
+        return self.model.get_image_features(pixel_values)
+
     # Make modules available throught conditional class for BC
     @property
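The hunks above switch the task head from rebinding `self.model` to delegating decoder access to the inner composite model. A minimal self-contained sketch of that delegation pattern (class names are illustrative):

```python
# The wrapper keeps its own structure intact and forwards decoder get/set calls inward.
from torch import nn


class InnerModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.language_model = nn.Linear(4, 4)  # stand-in for the text decoder

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model


class TaskHead(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = InnerModel()

    def set_decoder(self, decoder):
        # Delegate instead of `self.model = decoder`, so the composite model is preserved.
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()


head = TaskHead()
head.set_decoder(nn.Linear(4, 4))
assert isinstance(head.get_decoder(), nn.Linear)
```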
@@ -14,24 +14,16 @@ from packaging import version

 from ...activations import ACT2FN
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    get_torch_version,
-    logging,
-)
+from ...utils import auto_docstring, get_torch_version, logging
 from .configuration_roberta import RobertaConfig


 logger = logging.get_logger(__name__)

-_CHECKPOINT_FOR_DOC = "google-roberta/roberta-base-uncased"
-_CONFIG_FOR_DOC = "RobertaConfig"
-

 class RobertaEmbeddings(nn.Module):
     """Construct the embeddings from word, position and token_type embeddings."""
@@ -435,7 +427,7 @@ class RobertaOutput(nn.Module):
         return hidden_states


-class RobertaLayer(nn.Module):
+class RobertaLayer(GradientCheckpointingLayer):
     def __init__(self, config):
         super().__init__()
         self.chunk_size_feed_forward = config.chunk_size_feed_forward
@@ -560,27 +552,15 @@ class RobertaEncoder(nn.Module):
             layer_head_mask = head_mask[i] if head_mask is not None else None
             past_key_value = past_key_values[i] if past_key_values is not None else None

-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    layer_module.__call__,
-                    hidden_states,
-                    attention_mask,
-                    layer_head_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    past_key_value,
-                    output_attentions,
-                )
-            else:
-                layer_outputs = layer_module(
-                    hidden_states,
-                    attention_mask,
-                    layer_head_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    past_key_value,
-                    output_attentions,
-                )
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+            )

             hidden_states = layer_outputs[0]
             if use_cache:
@@ -742,12 +722,8 @@ def load_tf_weights_in_roberta(model, config, tf_checkpoint_path):
     return model


+@auto_docstring
 class RobertaPreTrainedModel(PreTrainedModel):
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
     config_class = RobertaConfig
     load_tf_weights = load_tf_weights_in_roberta
     base_model_prefix = "roberta"
@@ -773,79 +749,8 @@ class RobertaPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()


-ROBERTA_START_DOCSTRING = r"""
-
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`RobertaConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-ROBERTA_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `({0})`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.FloatTensor` of shape `({0})`or `(batch_size, sequence_length, target_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
-            1]`:
-
-            - 0 corresponds to a *sentence A* token,
-            - 1 corresponds to a *sentence B* token.
-
-            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.max_position_embeddings - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    "The bare Roberta Model transformer outputting raw hidden-states without any specific head on top.",
-    ROBERTA_START_DOCSTRING,
-)
-class RobertaModel(RobertaPreTrainedModel):
-    """
-
+@auto_docstring(
+    custom_intro="""
     The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
     cross-attention is added between the self-attention layers, following the architecture described in [Attention is
     all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
@@ -855,10 +760,15 @@ class RobertaModel(RobertaPreTrainedModel):
     to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
     `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
     """
+)
+class RobertaModel(RobertaPreTrainedModel):
     _no_split_modules = ["RobertaEmbeddings", "RobertaLayer"]

     def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
         super().__init__(config)
         self.config = config

@@ -887,12 +797,7 @@ class RobertaModel(RobertaPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)

-    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
-        config_class=_CONFIG_FOR_DOC,
-    )
+    @auto_docstring
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -909,26 +814,6 @@ class RobertaModel(RobertaPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
-        r"""
-        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
-            the model is configured as a decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
-            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -12,33 +12,17 @@ from torch import nn
 from transformers.modeling_outputs import CausalLMOutputWithPast

 from ...activations import ACT2FN
-from ...cache_utils import Cache, StaticCache
+from ...cache_utils import Cache
 from ...integrations import use_kernel_forward_from_hub
-from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import (
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    can_return_tuple,
-    is_torch_flex_attn_available,
-    logging,
-)
+from ...utils import auto_docstring, can_return_tuple
 from .configuration_super import SuperConfig


-if is_torch_flex_attn_available():
-    from torch.nn.attention.flex_attention import BlockMask
-
-    from ...integrations.flex_attention import make_flex_block_causal_mask
-
-
-logger = logging.get_logger(__name__)
-

 @use_kernel_forward_from_hub("RMSNorm")
 class SuperRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
@ -233,15 +217,8 @@ class SuperAttention(nn.Module):
|
|||||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||||
|
|
||||||
attention_interface: Callable = eager_attention_forward
|
attention_interface: Callable = eager_attention_forward
|
||||||
|
|
||||||
if self.config._attn_implementation != "eager":
|
if self.config._attn_implementation != "eager":
|
||||||
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
|
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||||
logger.warning_once(
|
|
||||||
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
|
|
||||||
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
|
||||||
|
|
||||||
attn_output, attn_weights = attention_interface(
|
attn_output, attn_weights = attention_interface(
|
||||||
self,
|
self,
|
||||||
@ -312,27 +289,7 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
|
|||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
SUPER_START_DOCSTRING = r"""
|
@auto_docstring
|
||||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
|
||||||
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
|
||||||
etc.)
|
|
||||||
|
|
||||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
|
||||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
|
||||||
and behavior.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
config ([`SuperConfig`]):
|
|
||||||
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
|
||||||
load the weights associated with the model, only the configuration. Check out the
|
|
||||||
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
|
||||||
"The bare Super Model outputting raw hidden-states without any specific head on top.",
|
|
||||||
SUPER_START_DOCSTRING,
|
|
||||||
)
|
|
||||||
class SuperPreTrainedModel(PreTrainedModel):
|
class SuperPreTrainedModel(PreTrainedModel):
|
||||||
config_class = SuperConfig
|
config_class = SuperConfig
|
||||||
base_model_prefix = "model"
|
base_model_prefix = "model"
|
||||||
@@ -361,88 +318,8 @@ class SuperPreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)


-SUPER_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
-            but you can also pass a `BlockMask` object directly here.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`Cache`, *optional*):
-            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
-            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
-            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
-
-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
-            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
-            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
-            the complete sequence length.
-"""
-
-
-@add_start_docstrings(
-    "The bare Super Model outputting raw hidden-states without any specific head on top.",
-    SUPER_START_DOCSTRING,
-)
+@auto_docstring
 class SuperModel(SuperPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SuperDecoderLayer`]
-
-    Args:
-        config: SuperConfig
-    """
-
     def __init__(self, config: SuperConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
@@ -466,7 +343,7 @@ class SuperModel(SuperPreTrainedModel):
         self.embed_tokens = value

     @can_return_tuple
-    @add_start_docstrings_to_model_forward(SUPER_INPUTS_DOCSTRING)
+    @auto_docstring
     def forward(
         self,
         input_ids: torch.LongTensor = None,
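
Across this commit, hand-maintained `*_START_DOCSTRING` / `*_INPUTS_DOCSTRING` constants and the `add_start_docstrings*` decorators are swapped for `@auto_docstring`. The snippet below is a stand-in sketch of the general idea (compose `__doc__` from a custom intro plus shared per-argument templates); it is not the real `transformers.utils.auto_docstring` implementation, and all names in it are placeholders.

# Stand-in sketch, not the actual transformers decorator: build a docstring from a
# custom intro plus shared argument templates, so generated files no longer need
# per-model *_DOCSTRING constants.
import inspect

ARG_TEMPLATES = {
    "pixel_values": "Pixel values. Padding will be ignored by default should you provide it.",
    "output_attentions": "Whether or not to return the attentions tensors of all attention layers.",
}


def auto_docstring_sketch(obj=None, *, custom_intro=""):
    def wrap(target):
        # Keep only arguments we have a shared template for.
        args = [a for a in inspect.signature(target).parameters if a in ARG_TEMPLATES]
        doc = custom_intro.strip() + "\n\nArgs:\n"
        doc += "\n".join(f"    {a}: {ARG_TEMPLATES[a]}" for a in args)
        target.__doc__ = doc
        return target

    return wrap if obj is None else wrap(obj)


@auto_docstring_sketch(custom_intro="The bare model outputting raw hidden-states.")
def forward(pixel_values=None, output_attentions=None):
    ...


print(forward.__doc__)  # intro followed by the templated argument docs
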
@@ -494,126 +371,3 @@ class SuperModel(SuperPreTrainedModel):
         )
         out.logits *= 2**4
         return out
-
-    def _update_causal_mask(
-        self,
-        attention_mask: Union[torch.Tensor, "BlockMask"],
-        input_tensor: torch.Tensor,
-        cache_position: torch.Tensor,
-        past_key_values: Cache,
-        output_attentions: bool = False,
-    ):
-        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and (attention_mask == 0.0).any():
-                return attention_mask
-            return None
-        if self.config._attn_implementation == "flex_attention":
-            if isinstance(attention_mask, torch.Tensor):
-                attention_mask = make_flex_block_causal_mask(attention_mask)
-            return attention_mask
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
-        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-        using_static_cache = isinstance(past_key_values, StaticCache)
-
-        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
-            if AttentionMaskConverter._ignore_causal_mask_sdpa(
-                attention_mask,
-                inputs_embeds=input_tensor,
-                past_key_values_length=past_seen_tokens,
-                is_training=self.training,
-            ):
-                return None
-
-        dtype = input_tensor.dtype
-        sequence_length = input_tensor.shape[1]
-        if using_static_cache:
-            target_length = past_key_values.get_max_cache_shape()
-        else:
-            target_length = (
-                attention_mask.shape[-1]
-                if isinstance(attention_mask, torch.Tensor)
-                else past_seen_tokens + sequence_length + 1
-            )
-
-        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
-            attention_mask,
-            sequence_length=sequence_length,
-            target_length=target_length,
-            dtype=dtype,
-            cache_position=cache_position,
-            batch_size=input_tensor.shape[0],
-        )
-
-        if (
-            self.config._attn_implementation == "sdpa"
-            and attention_mask is not None
-            and attention_mask.device.type in ["cuda", "xpu", "npu"]
-            and not output_attentions
-        ):
-            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
-            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
-            # Details: https://github.com/pytorch/pytorch/issues/110213
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
-
-        return causal_mask
-
-    @staticmethod
-    def _prepare_4d_causal_attention_mask_with_cache_position(
-        attention_mask: torch.Tensor,
-        sequence_length: int,
-        target_length: int,
-        dtype: torch.dtype,
-        cache_position: torch.Tensor,
-        batch_size: int,
-        **kwargs,
-    ):
-        """
-        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
-        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
-        Args:
-            attention_mask (`torch.Tensor`):
-                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
-                `(batch_size, 1, query_length, key_value_length)`.
-            sequence_length (`int`):
-                The sequence length being processed.
-            target_length (`int`):
-                The target length: when generating with static cache, the mask should be as long as the static cache,
-                to account for the 0 padding, the part of the cache that is not filled yet.
-            dtype (`torch.dtype`):
-                The dtype to use for the 4D attention mask.
-            cache_position (`torch.Tensor`):
-                Indices depicting the position of the input sequence tokens in the sequence.
-            batch_size (`torch.Tensor`):
-                Batch size.
-        """
-        if attention_mask is not None and attention_mask.dim() == 4:
-            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
-            causal_mask = attention_mask
-        else:
-            min_dtype = torch.finfo(dtype).min
-            causal_mask = torch.full(
-                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
-            )
-            if sequence_length != 1:
-                causal_mask = torch.triu(causal_mask, diagonal=1)
-            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
-            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
-            if attention_mask is not None:
-                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
-                mask_length = attention_mask.shape[-1]
-                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
-                    causal_mask.device
-                )
-                padding_mask = padding_mask == 0
-                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
-                    padding_mask, min_dtype
-                )
-
-        return causal_mask
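
The two helpers deleted above are no longer emitted into each generated model; the equivalent causal-mask construction is expected to come from the library's shared masking utilities instead. For orientation only, here is a toy-sized, self-contained sketch of the 4D additive mask those removed lines built from a 2D padding mask (the shapes and padding pattern are made up for the example):

# Illustrative sketch only, not part of the diff: the kind of 4D additive mask the removed
# helper produced. Toy shapes: batch of 1, 3 query positions, 3 key positions.
import torch

batch_size, seq_len, target_len = 1, 3, 3
dtype = torch.float32
min_dtype = torch.finfo(dtype).min
cache_position = torch.arange(seq_len)

# Start fully blocked, keep the causal (lower-triangular) part at 0.
causal = torch.full((seq_len, target_len), fill_value=min_dtype, dtype=dtype)
causal = torch.triu(causal, diagonal=1)
causal *= torch.arange(target_len) > cache_position.reshape(-1, 1)
causal = causal[None, None, :, :].expand(batch_size, 1, -1, -1)

# Merge in a 2D padding mask (1 = keep, 0 = pad): padded keys become min_dtype everywhere.
padding_2d = torch.tensor([[1, 1, 0]])
causal = causal.clone()  # expand() gives a view, clone before editing in place
causal[:, :, :, :target_len] = causal[:, :, :, :target_len].masked_fill(
    (causal[:, :, :, :target_len] + padding_2d[:, None, None, :]) == 0, min_dtype
)
print(causal)  # 0 where attention is allowed, min_dtype where it is blocked
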
@@ -14,13 +14,9 @@ from ...cache_utils import Cache
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
-from ...utils import logging
 from .configuration_switch_function import SwitchFunctionConfig


-logger = logging.get_logger(__name__)
-
-
 def rotate_half(x):
     # Split and rotate. Note that this function is different from e.g. Llama.
     x1 = x[..., ::2]
@@ -145,15 +141,8 @@ class SwitchFunctionAttention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward
-
         if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
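
With the SDPA warning branch gone, the attention backend is resolved purely by looking up `config._attn_implementation` in the `ALL_ATTENTION_FUNCTIONS` registry. A minimal standalone sketch of that dispatch pattern follows; the registry entries below are placeholders, not the library's real kernels.

# Placeholder sketch of the registry-dispatch pattern used above; the real registry is
# transformers' ALL_ATTENTION_FUNCTIONS and maps implementation names to attention kernels.
from typing import Callable


def eager_attention_forward(module, query, key, value, **kwargs):
    ...  # default implementation, used when attn_implementation == "eager"


ATTENTION_FUNCTIONS: dict[str, Callable] = {
    "sdpa": lambda module, query, key, value, **kwargs: ...,
    "flash_attention_2": lambda module, query, key, value, **kwargs: ...,
}


def pick_attention(attn_implementation: str) -> Callable:
    # Same shape as the new code: start from eager, otherwise index the registry directly.
    attention_interface: Callable = eager_attention_forward
    if attn_implementation != "eager":
        attention_interface = ATTENTION_FUNCTIONS[attn_implementation]
    return attention_interface
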
@@ -16,17 +16,11 @@ from torch import Tensor, nn
 from ...activations import ACT2FN
 from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import meshgrid
-from ...utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_timm_available,
-    replace_return_docstrings,
-    requires_backends,
-)
+from ...utils import ModelOutput, auto_docstring, is_timm_available, requires_backends
 from ...utils.backbone_utils import load_backbone
 from .configuration_test_detr import TestDetrConfig

@@ -34,8 +28,6 @@ from .configuration_test_detr import TestDetrConfig
 if is_timm_available():
     from timm import create_model

-_CONFIG_FOR_DOC = "TestDetrConfig"
-

 @use_kernel_forward_from_hub("MultiScaleDeformableAttention")
 class MultiScaleDeformableAttention(nn.Module):
@@ -93,32 +85,24 @@ class MultiScaleDeformableAttention(nn.Module):


 @dataclass
-class TestDetrDecoderOutput(ModelOutput):
-    """
+@auto_docstring(
+    custom_intro="""
     Base class for outputs of the TestDetrDecoder. This class adds two attributes to
     BaseModelOutputWithCrossAttentions, namely:
     - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
     - a stacked tensor of intermediate reference points.
-
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
+    """
+)
+class TestDetrDecoderOutput(ModelOutput):
+    r"""
     intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
         Stacked intermediate hidden states (output of each layer of the decoder).
     intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
         Stacked intermediate reference points (reference points of each layer of the decoder).
-    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
-        plus the initial embedding outputs.
-    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
-        the self-attention heads.
     cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
         Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
         sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
         used to compute the weighted average in the cross-attention heads.
     """

     last_hidden_state: Optional[torch.FloatTensor] = None
@@ -130,47 +114,27 @@ class TestDetrDecoderOutput(ModelOutput):


 @dataclass
-class TestDetrModelOutput(ModelOutput):
-    """
+@auto_docstring(
+    custom_intro="""
     Base class for outputs of the Deformable DETR encoder-decoder model.
-
-    Args:
-        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
-            Initial reference points sent through the Transformer decoder.
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the decoder of the model.
-        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
-            Stacked intermediate hidden states (output of each layer of the decoder).
-        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
-            Stacked intermediate reference points (reference points of each layer of the decoder).
-        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
-            plus the initial embedding outputs.
-        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
-            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
-            average in the self-attention heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
-            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
-            weighted average in the cross-attention heads.
-        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder of the model.
-        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
-            layer plus the initial embedding outputs.
-        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
-            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
-            self-attention heads.
-        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
-            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
-            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
-            foreground and background).
-        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
-            Logits of predicted bounding boxes coordinates in the first stage.
+    """
+)
+class TestDetrModelOutput(ModelOutput):
+    r"""
+    init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Initial reference points sent through the Transformer decoder.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+        picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+        foreground and background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
     """

     init_reference_points: Optional[torch.FloatTensor] = None
@@ -635,7 +599,7 @@ class TestDetrMultiheadAttention(nn.Module):
         return attn_output, attn_weights_reshaped


-class TestDetrEncoderLayer(nn.Module):
+class TestDetrEncoderLayer(GradientCheckpointingLayer):
     def __init__(self, config: TestDetrConfig):
         super().__init__()
         self.embed_dim = config.d_model
@@ -724,7 +688,7 @@ class TestDetrEncoderLayer(nn.Module):
         return outputs


-class TestDetrDecoderLayer(nn.Module):
+class TestDetrDecoderLayer(GradientCheckpointingLayer):
     def __init__(self, config: TestDetrConfig):
         super().__init__()
         self.embed_dim = config.d_model
@@ -837,6 +801,7 @@ class TestDetrDecoderLayer(nn.Module):
         return outputs


+@auto_docstring
 class TestDetrPreTrainedModel(PreTrainedModel):
     config_class = TestDetrConfig
     base_model_prefix = "model"
@@ -1001,29 +966,16 @@ class TestDetrEncoder(TestDetrPreTrainedModel):
         for i, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    encoder_layer.__call__,
-                    hidden_states,
-                    attention_mask,
-                    position_embeddings,
-                    reference_points,
-                    spatial_shapes,
-                    spatial_shapes_list,
-                    level_start_index,
-                    output_attentions,
-                )
-            else:
-                layer_outputs = encoder_layer(
-                    hidden_states,
-                    attention_mask,
-                    position_embeddings=position_embeddings,
-                    reference_points=reference_points,
-                    spatial_shapes=spatial_shapes,
-                    spatial_shapes_list=spatial_shapes_list,
-                    level_start_index=level_start_index,
-                    output_attentions=output_attentions,
-                )
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                position_embeddings=position_embeddings,
+                reference_points=reference_points,
+                spatial_shapes=spatial_shapes,
+                spatial_shapes_list=spatial_shapes_list,
+                level_start_index=level_start_index,
+                output_attentions=output_attentions,
+            )

             hidden_states = layer_outputs[0]

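
The encoder and decoder loops can drop the explicit `self._gradient_checkpointing_func` branch because `TestDetrEncoderLayer` / `TestDetrDecoderLayer` now inherit from `GradientCheckpointingLayer` (see the import added earlier in this diff). The sketch below shows the general idea behind such a base class; it is a simplified assumption, not the library's actual `transformers.modeling_layers` code, and the class names are placeholders.

# Simplified sketch (assumption, not the real implementation): the base class reroutes
# __call__ through torch.utils.checkpoint when checkpointing is enabled during training,
# so calling code no longer needs an if/else around each layer call.
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint


class CheckpointingLayerSketch(nn.Module):
    gradient_checkpointing = False

    def __call__(self, *args, **kwargs):
        if self.gradient_checkpointing and self.training:
            # use_reentrant=False tolerates keyword arguments and unused inputs
            return checkpoint(super().__call__, *args, use_reentrant=False, **kwargs)
        return super().__call__(*args, **kwargs)


class ToyLayer(CheckpointingLayerSketch):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(8, 8)

    def forward(self, hidden_states, attention_mask=None):
        return (self.linear(hidden_states),)


layer = ToyLayer()
layer.gradient_checkpointing = True
layer.train()
out = layer(torch.randn(2, 4, 8, requires_grad=True))[0]  # caller code stays a plain call
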
@@ -1155,31 +1107,17 @@ class TestDetrDecoder(TestDetrPreTrainedModel):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)

-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    decoder_layer.__call__,
-                    hidden_states,
-                    position_embeddings,
-                    reference_points_input,
-                    spatial_shapes,
-                    spatial_shapes_list,
-                    level_start_index,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    output_attentions,
-                )
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    position_embeddings=position_embeddings,
-                    encoder_hidden_states=encoder_hidden_states,
-                    reference_points=reference_points_input,
-                    spatial_shapes=spatial_shapes,
-                    spatial_shapes_list=spatial_shapes_list,
-                    level_start_index=level_start_index,
-                    encoder_attention_mask=encoder_attention_mask,
-                    output_attentions=output_attentions,
-                )
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings,
+                reference_points_input,
+                spatial_shapes,
+                spatial_shapes_list,
+                level_start_index,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask,
+                output_attentions,
+            )

             hidden_states = layer_outputs[0]

@@ -1253,67 +1191,11 @@ def build_position_encoding(config):
     return position_embedding


-TEST_DETR_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`TestDetrConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-TEST_DETR_INPUTS_DOCSTRING = r"""
-    Args:
-        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Padding will be ignored by default should you provide it.
-
-            Pixel values can be obtained using [`AutoImageProcessor`]. See [`TestDetrImageProcessor.__call__`]
-            for details.
-
-        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
-            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
-
-            - 1 for pixels that are real (i.e. **not masked**),
-            - 0 for pixels that are padding (i.e. **masked**).
-
-            [What are attention masks?](../glossary#attention-mask)
-
-        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
-            Not used by default. Can be used to mask object queries.
-        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
-            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
-            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
-            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
-            can choose to directly pass a flattened representation of an image.
-        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
-            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
-            embedded representation.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    """
+@auto_docstring(
+    custom_intro="""
     The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
     hidden-states without any specific head on top.
-    """,
-    TEST_DETR_START_DOCSTRING,
+    """
 )
 class TestDetrModel(TestDetrPreTrainedModel):
     def __init__(self, config: TestDetrConfig):
@@ -1486,8 +1368,7 @@ class TestDetrModel(TestDetrPreTrainedModel):
         object_query = self.enc_output_norm(self.enc_output(object_query))
         return object_query, output_proposals

-    @add_start_docstrings_to_model_forward(TEST_DETR_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=TestDetrModelOutput, config_class=_CONFIG_FOR_DOC)
+    @auto_docstring
     def forward(
         self,
         pixel_values: torch.FloatTensor,
@@ -1501,7 +1382,14 @@ class TestDetrModel(TestDetrPreTrainedModel):
         return_dict: Optional[bool] = None,
     ) -> Union[tuple[torch.FloatTensor], TestDetrModelOutput]:
         r"""
-        Returns:
+        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+            Not used by default. Can be used to mask object queries.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.

         Examples:

@@ -469,10 +469,10 @@ class MiniMaxSparseMoeBlock(nn.Module):
         # this will be used to easily index which expert is going to be sollicitated
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

-        expert_hitted = (expert_mask.sum(dim=(-1, -2)) > 0).nonzero(as_tuple=True)[0].tolist()
+        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
         for expert_idx in expert_hitted:
             expert_layer = self.experts[expert_idx]
-            idx, top_x = torch.where(expert_mask[expert_idx])
+            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
             # Index the correct hidden states and compute the expert hidden state for
             # the current expert. We need to make sure to multiply the output hidden
             # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
@@ -1439,7 +1439,7 @@ class ModularFileMapper(ModuleMapper):

            original_dependencies = []
            other_files_dependencies = defaultdict(list)
-           for dep in tuple(missing_dependencies):
+           for dep in sorted(missing_dependencies):
               if dep in self.added_objects_file_mapping:
                   file = self.added_objects_file_mapping[dep]
                   other_files_dependencies[file].append(dep)
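
This one-line change is what makes the regenerated files stable across runs: `missing_dependencies` is a set, and set iteration order over strings depends on hash randomization, so `tuple(...)` captures an arbitrary order while `sorted(...)` fixes it. A small self-contained illustration, using names taken from this diff as placeholders:

# Illustration (placeholder contents): why sorting the set matters for reproducible output.
# With PYTHONHASHSEED unset, str hashing is randomized per process, so the tuple() order
# below can differ between two runs of the converter, while the sorted() order cannot.
missing_dependencies = {"SuperModel", "eager_attention_forward", "SuperPreTrainedModel"}

arbitrary_order = tuple(missing_dependencies)  # depends on hash seed and insertion history
stable_order = sorted(missing_dependencies)    # always alphabetical

print(arbitrary_order)
print(stable_order)  # ['SuperModel', 'SuperPreTrainedModel', 'eager_attention_forward']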