Cyril Vallez 2025-07-02 22:59:10 +02:00
parent fb8b32ef35
commit 98882f1353
21 changed files with 116 additions and 209 deletions

@@ -1674,21 +1674,7 @@ class DFineForObjectDetection(DFinePreTrainedModel):
return_dict: Optional[bool] = None,
**loss_kwargs,
) -> Union[tuple[torch.FloatTensor], DFineObjectDetectionOutput]:
r"""
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
can choose to directly pass a flattened representation of an image.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
embedded representation.
labels (`list[Dict]` of len `(batch_size,)`, *optional*):
Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
Examples:
"""
```python
>>> import torch
>>> from transformers.image_utils import load_image
@@ -1729,7 +1715,8 @@ class DFineForObjectDetection(DFinePreTrainedModel):
Detected cat with confidence 0.956 at location [11.71, 53.52, 316.64, 472.33]
Detected remote with confidence 0.947 at location [40.46, 73.7, 175.62, 117.57]
Detected sofa with confidence 0.918 at location [0.59, 1.88, 640.25, 474.74]
```"""
```
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
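
A minimal sketch of the `labels` format described in the docstring above; the tensor values and the box convention are illustrative assumptions.

```python
import torch

# One dict per image in the batch, each with "class_labels" (a LongTensor with one
# entry per box) and "boxes" (a FloatTensor of shape (num_boxes, 4)).
labels = [
    {
        "class_labels": torch.tensor([17, 75], dtype=torch.long),
        "boxes": torch.tensor(
            [[0.50, 0.50, 0.40, 0.60], [0.20, 0.30, 0.10, 0.10]], dtype=torch.float
        ),  # assumed normalized (center_x, center_y, width, height) boxes
    }
]
# When `labels` is passed together with `pixel_values`, the forward pass returns the
# bipartite matching loss alongside the usual detection outputs.
# outputs = model(pixel_values=pixel_values, labels=labels)
```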

@@ -797,11 +797,6 @@ class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python
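
A minimal sketch of the `-100` label convention described above (the same convention appears in the other `*ForCausalLM` docstrings touched below); the token ids and mask are illustrative assumptions.

```python
import torch

# Labels are usually the input ids themselves, with every position that should not
# contribute to the loss set to -100 (here: right-padding positions).
input_ids = torch.tensor([[2, 9544, 603, 671, 2872, 0, 0]])
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 0, 0]])

labels = input_ids.clone()
labels[attention_mask == 0] = -100  # ignored (masked) by the loss

# outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
# outputs.loss is then computed only over positions whose label lies in [0, vocab_size).
```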

@@ -746,7 +746,7 @@ class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMi
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
r"""
"""
Examples:
```python

@@ -1365,11 +1365,6 @@ class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -1530,11 +1530,6 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
**kwargs,
) -> Union[tuple, CausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -523,11 +523,6 @@ class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -548,11 +548,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -646,11 +646,6 @@ class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -1825,11 +1825,6 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -39,13 +39,13 @@ class Glm4vImagesKwargs(ImagesKwargs):
class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Glm4vImagesKwargs
videos_kwargs: Glm4vVideosProcessorKwargs
_defaults = {
"text_kwargs": {
"padding": False,
},
}
images_kwargs: Glm4vImagesKwargs
videos_kwargs: Glm4vVideosProcessorKwargs
class Glm4vProcessor(ProcessorMixin):
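
A minimal sketch of how the `_defaults` above interact with call-time arguments; the checkpoint id is an assumption and may differ from the one you use.

```python
from transformers import AutoProcessor

# Assumed checkpoint id -- substitute the GLM-4.1V checkpoint you actually use.
processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")

texts = ["a short prompt", "a somewhat longer prompt than the first one"]

# With nothing passed, `_defaults["text_kwargs"]["padding"] = False` applies.
unpadded = processor(text=texts)

# A keyword passed at call time overrides the class-level default.
padded = processor(text=texts, padding=True, return_tensors="pt")
```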

@@ -522,11 +522,6 @@ class HeliumForCausalLM(HeliumPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -1504,41 +1504,6 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
r"""
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
to serve as text prompt, which the Q-Former model will encode.
Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
details.
[What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
provided to serve as text prompt, which the language model can continue.
Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
details.
[What are input IDs?](../glossary#input-ids)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
Only relevant in case an encoder-decoder language model (like T5) is used.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size]`
Examples:
```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
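
A minimal sketch of how the two sets of prompt ids described above are produced and consumed; the checkpoint id, the dummy clip, and the exact keyword for passing the video are assumptions that may vary by version.

```python
import numpy as np
from transformers import InstructBlipVideoProcessor

# Assumed checkpoint id -- substitute the InstructBlipVideo checkpoint you use.
processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

clip = np.random.randint(0, 255, size=(4, 224, 224, 3), dtype=np.uint8)  # dummy 4-frame video
prompt = "What is happening in this video?"

# Depending on the transformers version the clip is passed as `images=` or `videos=`.
inputs = processor(images=clip, text=prompt, return_tensors="pt")

# The prompt is tokenized once per tower: `input_ids`/`attention_mask` feed the language
# model, while `qformer_input_ids`/`qformer_attention_mask` feed the Q-Former, and
# `labels` can reuse `input_ids` with -100 on positions to ignore.
# outputs = model(**inputs, labels=inputs["input_ids"])
```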

@@ -904,11 +904,6 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, InternVLCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python

@@ -223,12 +223,11 @@ def apply_rotary_pos_emb_vision(
class MLCDAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper
Multi-headed attention with RoPE. Refer to papers:
- Attention is all you need:
https://huggingface.co/papers/1706.03762
- RoFormer: Enhanced Transformer with Rotary Position Embedding:
https://huggingface.co/papers/2104.09864
"""Multi-headed attention with RoPE. Refer to papers:
- Attention is all you need:
https://huggingface.co/papers/1706.03762
- RoFormer: Enhanced Transformer with Rotary Position Embedding:
https://huggingface.co/papers/2104.09864
"""
def __init__(self, config: MLCDVisionConfig):
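
A minimal sketch of the rotary position embedding the docstring refers to, using the usual `rotate_half` formulation; this is an illustration, not the module's exact implementation.

```python
import torch

def rotate_half(x):
    # Split the last dimension into two halves and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin):
    # RoPE mixes each vector with its rotated half using position-dependent angles.
    q_rot = (q * cos) + (rotate_half(q) * sin)
    k_rot = (k * cos) + (rotate_half(k) * sin)
    return q_rot, k_rot
```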

@@ -221,8 +221,6 @@ class SamHQMaskDecoderConfig(PretrainedConfig):
The dimensionality of the hidden states in the IoU head module.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
vit_dim (`int`, *optional*, defaults to 768):
Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
"""

@@ -71,8 +71,37 @@ class SamHQVisionConfig(SamVisionConfig):
class SamHQMaskDecoderConfig(SamMaskDecoderConfig):
r"""
vit_dim (`int`, *optional*, defaults to 768):
Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
This is the configuration class to store the configuration of a [`SamHQMaskDecoder`]. It is used to instantiate a SAM_HQ
mask decoder to the specified arguments, defining the model architecture. Instantiating a configuration defaults
will yield a similar configuration to that of the SAM_HQ-vit-h
[facebook/sam_hq-vit-huge](https://huggingface.co/facebook/sam_hq-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden states.
hidden_act (`str`, *optional*, defaults to `"relu"`):
The non-linear activation function used inside the `SamHQMaskDecoder` module.
mlp_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 2):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
attention_downsample_rate (`int`, *optional*, defaults to 2):
The downsampling rate of the attention layer.
num_multimask_outputs (`int`, *optional*, defaults to 3):
The number of outputs from the `SamHQMaskDecoder` module. In the Segment Anything paper, this is set to 3.
iou_head_depth (`int`, *optional*, defaults to 3):
The number of layers in the IoU head module.
iou_head_hidden_dim (`int`, *optional*, defaults to 256):
The dimensionality of the hidden states in the IoU head module.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
vit_dim (`int`, *optional*, defaults to 768):
Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
"""
def __init__(
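
A minimal sketch of instantiating this config with a non-default `vit_dim`; the top-level import is an assumption and may need adjusting to your transformers version.

```python
from transformers import SamHQMaskDecoderConfig  # assumed export location

# All other arguments keep the defaults listed in the docstring above.
decoder_config = SamHQMaskDecoderConfig(vit_dim=1024)
print(decoder_config.hidden_size, decoder_config.num_multimask_outputs, decoder_config.vit_dim)
# expected: 256 3 1024
```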

@@ -877,16 +877,6 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, SmolVLMCausalLMOutputWithPast]:
r"""
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
Mask to avoid performing attention on padding pixel indices.
image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
The hidden states of the image encoder after modality projection.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `SmolVLMForConditionalGeneration`).
Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
```python
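
A minimal sketch of the label convention described above, assuming `inputs` is a batch returned by the SmolVLM processor and `model` is a `SmolVLMForConditionalGeneration` instance.

```python
# Labels are usually the input ids themselves; positions holding `model.image_token_id`
# are ignored by the loss, as the docstring above describes, so they need no manual masking.
labels = inputs["input_ids"].clone()

# Optionally hide the prompt from the loss as well (prompt_length is a hypothetical value).
# labels[:, :prompt_length] = -100

outputs = model(**inputs, labels=labels)
loss = outputs.loss
```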

@@ -26,81 +26,80 @@ from ...configuration_utils import PretrainedConfig, layer_type_validation
class T5GemmaModuleConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the T5GemmaModule-7B.
e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`T5GemmaModuleModel`]
hidden_size (`int`, *optional*, defaults to 2304):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 9216):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 26):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*, defaults to 4):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
head_dim (`int`, *optional*, defaults to 256):
The attention head dimension.
hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
max_position_embeddings (`int`, *optional*, defaults to 8192):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
eos_token_id (`int`, *optional*, defaults to 1):
End of stream token id.
bos_token_id (`int`, *optional*, defaults to 2):
Beginning of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
scaling factor used on the attention scores
sliding_window (`int`, *optional*, defaults to 4096):
in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
layer_types (`list`, *optional*):
Attention pattern for each layer.
final_logit_softcapping (`float`, *optional*, defaults to 30.0):
scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
scaling factor when applying tanh softcapping on the attention scores.
This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the T5GemmaModule-7B.
e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`T5GemmaModuleModel`]
hidden_size (`int`, *optional*, defaults to 2304):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 9216):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 26):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*, defaults to 4):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
head_dim (`int`, *optional*, defaults to 256):
The attention head dimension.
hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
max_position_embeddings (`int`, *optional*, defaults to 8192):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
eos_token_id (`int`, *optional*, defaults to 1):
End of stream token id.
bos_token_id (`int`, *optional*, defaults to 2):
Beginning of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
scaling factor used on the attention scores
sliding_window (`int`, *optional*, defaults to 4096):
in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
layer_types (`list`, *optional*):
Attention pattern for each layer.
final_logit_softcapping (`float`, *optional*, defaults to 30.0):
scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
scaling factor when applying tanh softcapping on the attention scores.
```python
>>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
>>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
>>> configuration = T5GemmaModuleConfig()
>>> # Initializing a model from the t5_gemma_module-7b style configuration
>>> model = T5GemmaModuleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
Module config (encoder or decoder): the same as Gemma2Config."""
```python
>>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
>>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
>>> configuration = T5GemmaModuleConfig()
>>> # Initializing a model from the t5_gemma_module-7b style configuration
>>> model = T5GemmaModuleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "t5_gemma_module"
keys_to_ignore_at_inference = ["past_key_values"]
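
A small worked example of the Grouped Query Attention arithmetic that the `num_key_value_heads` description spells out, using the defaults listed above.

```python
# Defaults taken from the docstring above.
num_attention_heads = 8
num_key_value_heads = 4

# Each key/value head serves a group of query heads.
queries_per_kv_head = num_attention_heads // num_key_value_heads  # -> 2 (GQA)

# num_key_value_heads == num_attention_heads would give 1 query per KV head (MHA);
# num_key_value_heads == 1 would give 8 queries per KV head (MQA).
```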

@@ -65,10 +65,7 @@ logger = logging.get_logger(__name__)
class T5GemmaModuleConfig(Gemma2Config):
"""Module config (encoder or decoder): the same as Gemma2Config."""
def __init__(self, **super_kwargs):
super().__init__(**super_kwargs)
pass
class T5GemmaConfig(PretrainedConfig):
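
A minimal sketch of what the simplified class amounts to: it keeps its own name and docstring but adds nothing on top of `Gemma2Config` (the standalone definition below is illustrative).

```python
from transformers import Gemma2Config

class T5GemmaModuleConfig(Gemma2Config):
    """Module config (encoder or decoder): the same as Gemma2Config."""
    pass

# Every Gemma2 default carries over unchanged.
assert T5GemmaModuleConfig().hidden_size == Gemma2Config().hidden_size
```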

@@ -319,17 +319,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
class Zamba2Attention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
and "Generating Long Sequences with Sparse Transformers".
Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
(see fig. 2 in https://huggingface.co/papers/2405.16712).
Additionally, replaced
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
Multi-headed attention from 'Attention Is All You Need' paper.
Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
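
A minimal sketch of the head sizing and the adjusted scaling that the removed docstring spells out; the tensor shapes and names are illustrative.

```python
import math
import torch

batch, num_heads, seq_len, hidden_size = 1, 8, 16, 512

# The attention input concatenates the hidden states with the previous mamba layer's
# output, hence attention_hidden_size = 2 * hidden_size.
attention_hidden_size = 2 * hidden_size
head_dim = attention_hidden_size // num_heads

query_states = torch.randn(batch, num_heads, seq_len, head_dim)
key_states = torch.randn(batch, num_heads, seq_len, head_dim)

# Scaling uses sqrt(head_dim / 2) instead of the usual sqrt(head_dim).
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(head_dim / 2)
```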

@@ -23,7 +23,6 @@ FILES_TO_PARSE = [
os.path.join(MODEL_ROOT, "rt_detr", "modular_rt_detr.py"),
os.path.join(MODEL_ROOT, "qwen2", "modular_qwen2.py"),
os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3.py"),
os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3_moe.py"),
os.path.join(MODEL_ROOT, "llava_next_video", "modular_llava_next_video.py"),
os.path.join(MODEL_ROOT, "cohere2", "modular_cohere2.py"),
os.path.join(MODEL_ROOT, "modernbert", "modular_modernbert.py"),