From 35297d3b75515efa5901f08513581b60cf4e098e Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 2 Jul 2025 23:05:52 +0200 Subject: [PATCH] and examples --- .../configuration_my_new_model.py | 2 - .../modular-transformers/modeling_dummy.py | 3 +- .../modeling_multimodal1.py | 3 +- .../modeling_my_new_model2.py | 3 +- .../modeling_new_task_model.py | 34 +---- .../modular-transformers/modeling_super.py | 3 +- .../modeling_switch_function.py | 2 +- .../modular_my_new_model.py | 117 +++++++++++++++++- 8 files changed, 129 insertions(+), 38 deletions(-) diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py index 49d27f7789c..ff359fa416b 100644 --- a/examples/modular-transformers/configuration_my_new_model.py +++ b/examples/modular-transformers/configuration_my_new_model.py @@ -125,8 +125,6 @@ class MyNewModelConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ``` - new_param (`int`, *optional*, defaults to `False`): - A fun new parameter """ model_type = "my_new_model" diff --git a/examples/modular-transformers/modeling_dummy.py b/examples/modular-transformers/modeling_dummy.py index 5fc7d2f7c35..6aaf5b64eb8 100644 --- a/examples/modular-transformers/modeling_dummy.py +++ b/examples/modular-transformers/modeling_dummy.py @@ -203,7 +203,7 @@ class DummyAttention(nn.Module): past_key_value: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] hidden_shape = (*input_shape, -1, self.head_dim) @@ -299,6 +299,7 @@ class DummyPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["DummyDecoderLayer"] _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_3 = True _supports_flash_attn_2 = True _supports_sdpa = True _supports_flex_attn = True diff --git a/examples/modular-transformers/modeling_multimodal1.py b/examples/modular-transformers/modeling_multimodal1.py index 3ddb9f80948..50083bded5f 100644 --- a/examples/modular-transformers/modeling_multimodal1.py +++ b/examples/modular-transformers/modeling_multimodal1.py @@ -203,7 +203,7 @@ class Multimodal1TextAttention(nn.Module): past_key_value: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] hidden_shape = (*input_shape, -1, self.head_dim) @@ -299,6 +299,7 @@ class Multimodal1TextPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["Multimodal1TextDecoderLayer"] _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_3 = True _supports_flash_attn_2 = True _supports_sdpa = True _supports_flex_attn = True diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py index ad27fc25448..36fc4504101 100644 --- a/examples/modular-transformers/modeling_my_new_model2.py +++ b/examples/modular-transformers/modeling_my_new_model2.py @@ -201,7 +201,7 @@ class MyNewModel2Attention(nn.Module): past_key_value: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, 
**kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] hidden_shape = (*input_shape, -1, self.head_dim) @@ -297,6 +297,7 @@ class MyNewModel2PreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["MyNewModel2DecoderLayer"] _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_3 = True _supports_flash_attn_2 = True _supports_sdpa = True _supports_flex_attn = True diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index 429adbe6888..c116b55d4d5 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -118,6 +118,8 @@ class NewTaskModelPreTrainedModel(PreTrainedModel): ) class NewTaskModelModel(NewTaskModelPreTrainedModel): _checkpoint_conversion_mapping = {"language_model.model": "language_model"} + # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch + accepts_loss_kwargs = False def __init__(self, config: NewTaskModelConfig): super().__init__(config) @@ -313,9 +315,11 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel): special_image_mask = inputs_embeds == self.get_input_embeddings()( torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) ) + special_image_mask = special_image_mask.all(-1) else: - special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = input_ids == self.config.image_token_id + + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel(): image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0] @@ -433,32 +437,6 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin): num_logits_to_keep: int = 0, ) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, NewTaskModelForNewTask - - >>> model = NewTaskModelForNewTask.from_pretrained("google/new_task_model2-3b-mix-224") - >>> processor = AutoProcessor.from_pretrained("google/new_task_model2-3b-mix-224") - - >>> prompt = "Where is the cat standing?" 
- >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(**inputs,) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Where is the cat standing?\nsnow" - ``` Returns: """ vlm_outputs = super().forward( diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py index a99174908d9..623145c5d4b 100644 --- a/examples/modular-transformers/modeling_super.py +++ b/examples/modular-transformers/modeling_super.py @@ -200,7 +200,7 @@ class SuperAttention(nn.Module): past_key_value: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] hidden_shape = (*input_shape, -1, self.head_dim) @@ -296,6 +296,7 @@ class SuperPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["SuperDecoderLayer"] _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_3 = True _supports_flash_attn_2 = True _supports_sdpa = True _supports_flex_attn = True diff --git a/examples/modular-transformers/modeling_switch_function.py b/examples/modular-transformers/modeling_switch_function.py index ec49c0fbebc..9cf190e7572 100644 --- a/examples/modular-transformers/modeling_switch_function.py +++ b/examples/modular-transformers/modeling_switch_function.py @@ -124,7 +124,7 @@ class SwitchFunctionAttention(nn.Module): past_key_value: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] hidden_shape = (*input_shape, -1, self.head_dim) diff --git a/examples/modular-transformers/modular_my_new_model.py b/examples/modular-transformers/modular_my_new_model.py index c1ea8b0a724..58b74cd7eb1 100644 --- a/examples/modular-transformers/modular_my_new_model.py +++ b/examples/modular-transformers/modular_my_new_model.py @@ -2,11 +2,122 @@ from transformers.models.llama.configuration_llama import LlamaConfig # Example where we only want to only add a new config argument and new arg doc -# here there is no `ARG` so we are gonna take parent doc class MyNewModelConfig(LlamaConfig): r""" - new_param (`int`, *optional*, defaults to `False`): - A fun new parameter + This is the configuration class to store the configuration of a [`MyNewModelModel`]. It is used to instantiate an MyNewModel + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the MyNewModel-7B. + e.g. [meta-my_new_model/MyNewModel-2-7b-hf](https://huggingface.co/meta-my_new_model/MyNewModel-2-7b-hf) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the MyNewModel model. 
Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MyNewModelModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. MyNewModel 1 supports up to 2048 tokens, + MyNewModel 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'my_new_model3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. 
In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'my_new_model3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'my_new_model3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'my_new_model3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_attention_heads + + ```python + >>> from transformers import MyNewModelModel, MyNewModelConfig + + >>> # Initializing a MyNewModel my_new_model-7b style configuration + >>> configuration = MyNewModelConfig() + + >>> # Initializing a model from the my_new_model-7b style configuration + >>> model = MyNewModelModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` """ def __init__(self, mlp_bias=True, new_param=0, **super_kwargs):
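
The docstring added above documents the full set of `LlamaConfig`-inherited arguments (grouped-query attention via `num_key_value_heads`, the `rope_scaling` dict, the bias flags), on top of which the modular class adds `mlp_bias=True` and `new_param=0`. As a quick, non-authoritative sketch of how those documented arguments fit together, the snippet below exercises them on `LlamaConfig` itself (the parent class), since the example package is not importable from `transformers`; all concrete values are made up for illustration.

```python
# Minimal sketch, not part of the patch: exercises the arguments documented in
# the new MyNewModelConfig docstring via its parent class, LlamaConfig.
from transformers import LlamaConfig

config = LlamaConfig(
    vocab_size=32000,
    hidden_size=4096,
    num_attention_heads=32,
    num_key_value_heads=8,   # fewer KV heads than attention heads -> grouped-query attention
    rope_scaling={           # dict layout described under `rope_scaling` above
        "rope_type": "linear",
        "factor": 2.0,       # ~2x the pre-trained max_position_embeddings
    },
    attention_bias=False,
    mlp_bias=False,
)
print(config.num_key_value_heads, config.rope_scaling)

# The modular subclass layers two extra constructor arguments on top of these,
# per the __init__ signature shown at the end of the diff:
#     MyNewModelConfig(mlp_bias=True, new_param=0, **llama_kwargs)
```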
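The other behavioral change worth calling out is the `special_image_mask` rewrite in `modeling_new_task_model.py`: both branches now produce a `(batch, seq)` boolean mask of image placeholder positions, which is then expanded once to the hidden dimension. The toy sketch below re-creates the `input_ids` branch with made-up shapes; the final `masked_scatter` line is an assumption about how the mask is consumed by the surrounding (unshown) code, as in similar multimodal models, not something this diff adds.

```python
import torch

# Toy re-creation of the placeholder masking rewritten by the patch; shapes and
# the image_token_id value are made up.
batch, seq, hidden = 1, 6, 4
image_token_id = 99

input_ids = torch.tensor([[99, 99, 1, 2, 3, 4]])   # two image placeholder tokens
inputs_embeds = torch.randn(batch, seq, hidden)
image_features = torch.randn(2, hidden)            # one projected feature per placeholder

# Branch taken when input_ids is available (the other branch recovers the same
# (batch, seq) mask by comparing inputs_embeds against the embedded image token).
special_image_mask = input_ids == image_token_id
special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds)

# Mirrors the numel() consistency check kept by the patch.
assert inputs_embeds[special_image_mask].numel() == image_features.numel()

# Assumed downstream use (not shown in the diff): scatter the image features
# into the placeholder slots.
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
```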