Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-03 12:50:06 +06:00)

commit 98882f1353 ("apply it")
parent fb8b32ef35
@@ -1674,21 +1674,7 @@ class DFineForObjectDetection(DFinePreTrainedModel):
return_dict: Optional[bool] = None,
**loss_kwargs,
) -> Union[tuple[torch.FloatTensor], DFineObjectDetectionOutput]:
r"""
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
can choose to directly pass a flattened representation of an image.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
embedded representation.
labels (`list[Dict]` of len `(batch_size,)`, *optional*):
Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.

Examples:

"""
```python
>>> import torch
>>> from transformers.image_utils import load_image
@@ -1729,7 +1715,8 @@ class DFineForObjectDetection(DFinePreTrainedModel):
Detected cat with confidence 0.956 at location [11.71, 53.52, 316.64, 472.33]
Detected remote with confidence 0.947 at location [40.46, 73.7, 175.62, 117.57]
Detected sofa with confidence 0.918 at location [0.59, 1.88, 640.25, 474.74]
```"""
```
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

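As an aside on the `labels` format documented above, here is a minimal, hypothetical sketch of how such targets could be assembled for a batch of one image. The tensor values and the normalized box convention are illustrative assumptions; check the D-FINE image processor for the exact expected format.

```python
import torch

# One dict per image; keys follow the docstring above.
labels = [
    {
        # class index of each ground-truth box in the image
        "class_labels": torch.tensor([17, 65], dtype=torch.long),
        # one row per box; DETR-style models typically expect normalized
        # (center_x, center_y, width, height) coordinates in [0, 1]
        "boxes": torch.tensor(
            [[0.55, 0.60, 0.30, 0.40],
             [0.20, 0.25, 0.10, 0.15]],
            dtype=torch.float,
        ),
    }
]

# outputs = model(pixel_values=pixel_values, labels=labels)
# outputs.loss  # the bipartite matching loss described above
```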
@@ -797,11 +797,6 @@ class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

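The `labels` convention in this docstring (and in the identical Emu3, FalconH1, Gemma, Gemma2, Gemma3, Gemma3n, Helium and InternVL hunks below) boils down to: copy `input_ids` and set any position that should not contribute to the loss to `-100`. A minimal illustrative sketch, with made-up token ids:

```python
import torch

input_ids = torch.tensor([[2, 1066, 3059, 2023, 0, 0]])  # illustrative ids; 0 = padding
labels = input_ids.clone()
labels[input_ids == 0] = -100  # -100 positions are ignored by the cross-entropy loss

# loss = model(input_ids=input_ids,
#              attention_mask=(input_ids != 0).long(),
#              labels=labels).loss
```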
@@ -746,7 +746,7 @@ class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMi
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
r"""
"""
Examples:

```python

@@ -1365,11 +1365,6 @@ class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

@@ -1530,11 +1530,6 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
**kwargs,
) -> Union[tuple, CausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

@@ -523,11 +523,6 @@ class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

@@ -548,11 +548,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

@@ -646,11 +646,6 @@ class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

@@ -1825,11 +1825,6 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

@@ -39,13 +39,13 @@ class Glm4vImagesKwargs(ImagesKwargs):


class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Glm4vImagesKwargs
videos_kwargs: Glm4vVideosProcessorKwargs
_defaults = {
"text_kwargs": {
"padding": False,
},
}
images_kwargs: Glm4vImagesKwargs
videos_kwargs: Glm4vVideosProcessorKwargs


class Glm4vProcessor(ProcessorMixin):

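For context on the `_defaults` attribute kept in this hunk: processor kwargs classes declare per-modality defaults that get merged with whatever the caller passes, with caller values winning. A rough, framework-agnostic sketch of that merge idea (not the library's actual implementation):

```python
def merge_kwargs(defaults: dict, user_kwargs: dict) -> dict:
    """Two-level merge: start from the declared defaults, caller overrides win."""
    merged = {group: dict(values) for group, values in defaults.items()}
    for group, values in user_kwargs.items():
        merged.setdefault(group, {}).update(values)
    return merged

defaults = {"text_kwargs": {"padding": False}}
print(merge_kwargs(defaults, {"text_kwargs": {"padding": True, "truncation": True}}))
# {'text_kwargs': {'padding': True, 'truncation': True}}
```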
@@ -522,11 +522,6 @@ class HeliumForCausalLM(HeliumPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

@@ -1504,41 +1504,6 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
r"""
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
to serve as text prompt, which the Q-Former model will encode.

Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
details.

[What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.

[What are attention masks?](../glossary#attention-mask)
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
provided to serve as text prompt, which the language model can continue.

Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
details.

[What are input IDs?](../glossary#input-ids)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.

Only relevant in case an encoder-decoder language model (like T5) is used.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size]`

Examples:

```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch

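The `qformer_attention_mask` convention above (1 = attend, 0 = padding) can be derived directly from the padded token ids. A tiny sketch, assuming a pad id of 0 and made-up token ids:

```python
import torch

qformer_input_ids = torch.tensor([[101, 2054, 2003, 102, 0, 0]])  # illustrative ids; 0 = pad
qformer_attention_mask = (qformer_input_ids != 0).long()
# tensor([[1, 1, 1, 1, 0, 0]])  -> 1 for real tokens, 0 for padding
```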
@@ -904,11 +904,6 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, InternVLCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

@@ -223,12 +223,11 @@ def apply_rotary_pos_emb_vision(


class MLCDAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper
Multi-headed attention with RoPE. Refer to papers:
- Attention is all you need:
https://huggingface.co/papers/1706.03762
- RoFormer: Enhanced Transformer with Rotary Position Embedding:
https://huggingface.co/papers/2104.09864
"""Multi-headed attention with RoPE. Refer to papers:
- Attention is all you need:
https://huggingface.co/papers/1706.03762
- RoFormer: Enhanced Transformer with Rotary Position Embedding:
https://huggingface.co/papers/2104.09864
"""

def __init__(self, config: MLCDVisionConfig):

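Since the rewritten docstring points at the RoFormer paper, here is a compact sketch of the rotary-embedding application it refers to, using the half-rotation convention common in this codebase; treat it as an illustration rather than MLCD's exact code:

```python
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # split the head dimension in two and rotate: (x1, x2) -> (-x2, x1)
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
    # q, k: (..., seq_len, head_dim); cos, sin broadcast over the head dimension
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin
```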
@@ -221,8 +221,6 @@ class SamHQMaskDecoderConfig(PretrainedConfig):
The dimensionality of the hidden states in the IoU head module.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.


vit_dim (`int`, *optional*, defaults to 768):
Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
"""

@@ -71,8 +71,37 @@ class SamHQVisionConfig(SamVisionConfig):

class SamHQMaskDecoderConfig(SamMaskDecoderConfig):
r"""
vit_dim (`int`, *optional*, defaults to 768):
Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
This is the configuration class to store the configuration of a [`SamHQMaskDecoder`]. It is used to instantiate a SAM_HQ
mask decoder to the specified arguments, defining the model architecture. Instantiating a configuration defaults
will yield a similar configuration to that of the SAM_HQ-vit-h
[facebook/sam_hq-vit-huge](https://huggingface.co/facebook/sam_hq-vit-huge) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden states.
hidden_act (`str`, *optional*, defaults to `"relu"`):
The non-linear activation function used inside the `SamHQMaskDecoder` module.
mlp_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 2):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
attention_downsample_rate (`int`, *optional*, defaults to 2):
The downsampling rate of the attention layer.
num_multimask_outputs (`int`, *optional*, defaults to 3):
The number of outputs from the `SamHQMaskDecoder` module. In the Segment Anything paper, this is set to 3.
iou_head_depth (`int`, *optional*, defaults to 3):
The number of layers in the IoU head module.
iou_head_hidden_dim (`int`, *optional*, defaults to 256):
The dimensionality of the hidden states in the IoU head module.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
vit_dim (`int`, *optional*, defaults to 768):
Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module.
"""

def __init__(

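A short usage sketch of the config class documented above, assuming it is exported from `transformers` like other configuration classes; the overridden values are arbitrary and only for illustration:

```python
from transformers import SamHQMaskDecoderConfig  # assumed top-level export

config = SamHQMaskDecoderConfig(
    hidden_size=256,
    num_multimask_outputs=3,
    vit_dim=768,  # dimensionality of the ViT feeding the HQ mask decoder
)
print(config.vit_dim)
```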
@@ -877,16 +877,6 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[tuple, SmolVLMCausalLMOutputWithPast]:
r"""
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
Mask to avoid performing attention on padding pixel indices.
image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
The hidden states of the image encoder after modality projection.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `SmolVLMForConditionalGeneration`).
Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python

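Per the docstring above, label positions holding `model.image_token_id` are excluded from the loss, so an illustrative way to build `labels` is to reuse the token ids as-is. The ids below are made up, and masking padding with `-100` is an additional assumption (the usual ignore index) rather than something this docstring states:

```python
import torch

input_ids = torch.tensor([[151646, 151646, 9906, 1917, 0]])  # illustrative ids; first two = image token, 0 = pad
attention_mask = (input_ids != 0).long()

labels = input_ids.clone()
labels[attention_mask == 0] = -100  # assumed padding mask; image-token positions stay as-is per the docstring
```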
@@ -26,81 +26,80 @@ from ...configuration_utils import PretrainedConfig, layer_type_validation

class T5GemmaModuleConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the T5GemmaModule-7B.
e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`T5GemmaModuleModel`]
hidden_size (`int`, *optional*, defaults to 2304):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 9216):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 26):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*, defaults to 4):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
head_dim (`int`, *optional*, defaults to 256):
The attention head dimension.
hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
max_position_embeddings (`int`, *optional*, defaults to 8192):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
eos_token_id (`int`, *optional*, defaults to 1):
End of stream token id.
bos_token_id (`int`, *optional*, defaults to 2):
Beginning of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
scaling factor used on the attention scores
sliding_window (`int`, *optional*, defaults to 4096):
in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
layer_types (`list`, *optional*):
Attention pattern for each layer.
final_logit_softcapping (`float`, *optional*, defaults to 30.0):
scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
scaling factor when applying tanh softcapping on the attention scores.
This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the T5GemmaModule-7B.
e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`T5GemmaModuleModel`]
hidden_size (`int`, *optional*, defaults to 2304):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 9216):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 26):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*, defaults to 4):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
head_dim (`int`, *optional*, defaults to 256):
The attention head dimension.
hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
max_position_embeddings (`int`, *optional*, defaults to 8192):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
eos_token_id (`int`, *optional*, defaults to 1):
End of stream token id.
bos_token_id (`int`, *optional*, defaults to 2):
Beginning of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
scaling factor used on the attention scores
sliding_window (`int`, *optional*, defaults to 4096):
in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
layer_types (`list`, *optional*):
Attention pattern for each layer.
final_logit_softcapping (`float`, *optional*, defaults to 30.0):
scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
scaling factor when applying tanh softcapping on the attention scores.

```python
>>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
>>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
>>> configuration = T5GemmaModuleConfig()
>>> # Initializing a model from the t5_gemma_module-7b style configuration
>>> model = T5GemmaModuleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
Module config (encoder or decoder): the same as Gemma2Config."""
```python
>>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
>>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
>>> configuration = T5GemmaModuleConfig()
>>> # Initializing a model from the t5_gemma_module-7b style configuration
>>> model = T5GemmaModuleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""

model_type = "t5_gemma_module"
keys_to_ignore_at_inference = ["past_key_values"]

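The `num_key_value_heads` paragraph above describes converting an MHA checkpoint to GQA by mean-pooling the key/value heads inside each group. A small self-contained sketch of that pooling with the shapes from this config (8 attention heads pooled into 4 KV heads), purely for illustration:

```python
import torch

num_heads, num_kv_heads, head_dim, hidden_size = 8, 4, 256, 2304
group = num_heads // num_kv_heads  # 2 original heads per KV group

# stand-in for the original MHA key projection weight
k_proj = torch.randn(num_heads * head_dim, hidden_size)

# mean-pool each group of heads into one GQA key head
k_gqa = (
    k_proj.view(num_kv_heads, group, head_dim, hidden_size)
    .mean(dim=1)
    .reshape(num_kv_heads * head_dim, hidden_size)
)
print(k_gqa.shape)  # torch.Size([1024, 2304])
```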
@@ -65,10 +65,7 @@ logger = logging.get_logger(__name__)


class T5GemmaModuleConfig(Gemma2Config):
"""Module config (encoder or decoder): the same as Gemma2Config."""

def __init__(self, **super_kwargs):
super().__init__(**super_kwargs)
pass


class T5GemmaConfig(PretrainedConfig):

@@ -319,17 +319,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):

class Zamba2Attention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
and "Generating Long Sequences with Sparse Transformers".

Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
(see fig. 2 in https://huggingface.co/papers/2405.16712).
Additionally, replaced
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)

Multi-headed attention from 'Attention Is All You Need' paper.

Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:

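The scaling change spelled out in the longer docstring is easiest to see numerically: because the attention input is the concatenation of the hidden states with the previous mamba layer's output, `head_dim` is effectively doubled, so the scores are divided by `sqrt(head_dim / 2)` instead of `sqrt(head_dim)`. A toy sketch with made-up shapes:

```python
import math
import torch

batch, num_heads, seq_len, head_dim = 1, 4, 16, 128  # head_dim already reflects the 2x input width
query_states = torch.randn(batch, num_heads, seq_len, head_dim)
key_states = torch.randn(batch, num_heads, seq_len, head_dim)

# standard scaling would use math.sqrt(head_dim); Zamba2 compensates for the doubled input
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(head_dim / 2)
print(attn_weights.shape)  # torch.Size([1, 4, 16, 16])
```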
@@ -23,7 +23,6 @@ FILES_TO_PARSE = [
os.path.join(MODEL_ROOT, "rt_detr", "modular_rt_detr.py"),
os.path.join(MODEL_ROOT, "qwen2", "modular_qwen2.py"),
os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3.py"),
os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3_moe.py"),
os.path.join(MODEL_ROOT, "llava_next_video", "modular_llava_next_video.py"),
os.path.join(MODEL_ROOT, "cohere2", "modular_cohere2.py"),
os.path.join(MODEL_ROOT, "modernbert", "modular_modernbert.py"),