diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py
index 66ad345280f..666535cb7c4 100644
--- a/src/transformers/models/aimv2/modeling_aimv2.py
+++ b/src/transformers/models/aimv2/modeling_aimv2.py
@@ -34,15 +34,7 @@ from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from ...utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    auto_docstring,
-    can_return_tuple,
-    logging,
-    replace_return_docstrings,
-)
+from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, replace_return_docstrings
 from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig
 
 
@@ -449,6 +441,7 @@ class Aimv2AttentionPoolingHead(nn.Module):
         return output
 
 
+@auto_docstring
 class Aimv2PreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -458,7 +451,12 @@ class Aimv2PreTrainedModel(PreTrainedModel):
     config_class = Aimv2Config
     base_model_prefix = "aimv2"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["AIMv2SwiGLUFFN"]
+    _no_split_modules = [
+        "Aimv2EncoderLayer",
+        "Aimv2AttentionPoolingHead",
+        "Aimv2VisionEmbeddings",
+        "Aimv2TextEmbeddings",
+    ]
     _supports_sdpa = True
 
     def _init_weights(self, module):
@@ -482,42 +480,10 @@ class Aimv2PreTrainedModel(PreTrainedModel):
             module.cls_token.data.normal_(mean=0.0, std=std)
 
 
-AIMV2_VISION_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`Aimv2VisionConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-AIMV2_VISION_INPUTS_DOCSTRING = r"""
-    Args:
-        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
-            Whether to interpolate the pre-trained position encodings.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-""" - - -@add_start_docstrings( - """The vision model from AIMv2 without any head or projection on top.""", - AIMV2_VISION_START_DOCSTRING, +@auto_docstring( + custom_intro=""" + The Vision model from AIMv2 without any head or projection on top. + """ ) class Aimv2VisionModel(Aimv2PreTrainedModel): main_input_name = "pixel_values" @@ -540,7 +506,7 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): return self.embeddings.patch_embed @can_return_tuple - @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) + @auto_docstring @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig) def forward( self, @@ -597,6 +563,11 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): ) +@auto_docstring( + custom_intro=""" + The text model from AIMv2 without any head or projection on top. + """ +) class Aimv2TextModel(Aimv2PreTrainedModel): main_input_name = "input_ids" @@ -618,6 +589,7 @@ class Aimv2TextModel(Aimv2PreTrainedModel): self.embeddings.token_embedding = value @can_return_tuple + @auto_docstring def forward( self, input_ids, diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 9572dd672be..a45f03ebf43 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -27,8 +27,7 @@ from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, + auto_docstring, can_return_tuple, logging, replace_return_docstrings, @@ -289,8 +288,6 @@ class Aimv2Config(SiglipConfig): del self.initializer_factor - pass - class Aimv2Output(SiglipOutput): pass @@ -434,39 +431,7 @@ class Aimv2AttentionPoolingHead(nn.Module): return output -AIMV2_VISION_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Aimv2VisionConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -AIMV2_VISION_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults `False`): - Whether to interpolate the pre-trained position encodings. 
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
+@auto_docstring
 class Aimv2PreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -476,7 +441,12 @@ class Aimv2PreTrainedModel(PreTrainedModel):
     config_class = Aimv2Config
     base_model_prefix = "aimv2"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["AIMv2SwiGLUFFN"]
+    _no_split_modules = [
+        "Aimv2EncoderLayer",
+        "Aimv2AttentionPoolingHead",
+        "Aimv2VisionEmbeddings",
+        "Aimv2TextEmbeddings",
+    ]
     _supports_sdpa = True
 
     def _init_weights(self, module):
@@ -500,9 +470,10 @@ class Aimv2PreTrainedModel(PreTrainedModel):
             module.cls_token.data.normal_(mean=0.0, std=std)
 
 
-@add_start_docstrings(
-    """The vision model from AIMv2 without any head or projection on top.""",
-    AIMV2_VISION_START_DOCSTRING,
+@auto_docstring(
+    custom_intro="""
+    The Vision model from AIMv2 without any head or projection on top.
+    """
 )
 class Aimv2VisionModel(Aimv2PreTrainedModel):
     main_input_name = "pixel_values"
@@ -525,7 +496,7 @@ class Aimv2VisionModel(Aimv2PreTrainedModel):
         return self.embeddings.patch_embed
 
     @can_return_tuple
-    @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING)
+    @auto_docstring
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig)
     def forward(
         self,
@@ -582,6 +553,11 @@
         )
 
 
+@auto_docstring(
+    custom_intro="""
+    The text model from AIMv2 without any head or projection on top.
+    """
+)
 class Aimv2TextModel(Aimv2PreTrainedModel):
     main_input_name = "input_ids"
 
@@ -603,6 +579,7 @@ class Aimv2TextModel(Aimv2PreTrainedModel):
         self.embeddings.token_embedding = value
 
     @can_return_tuple
+    @auto_docstring
     def forward(
         self,
         input_ids,
@@ -648,6 +625,7 @@
         )
 
 
+@auto_docstring
 class Aimv2Model(CLIPModel, nn.Module):
     def __init__(self, config: Aimv2Config):
         nn.Module().__init__(config)
diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py
index fa5e3f6dc84..0756180e590 100644
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -510,7 +510,6 @@ class SiglipPreTrainedModel(PreTrainedModel):
 
     _no_split_modules = [
         "SiglipTextEmbeddings",
-        "SiglipEncoderLayer",
         "SiglipVisionEmbeddings",
         "SiglipEncoderLayer",
         "SiglipMultiheadAttentionPoolingHead",
diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py
index eb3bf5d4a34..f3e79f143bb 100644
--- a/src/transformers/models/siglip2/modeling_siglip2.py
+++ b/src/transformers/models/siglip2/modeling_siglip2.py
@@ -742,7 +742,6 @@ class Siglip2PreTrainedModel(PreTrainedModel):
 
     _no_split_modules = [
         "Siglip2TextEmbeddings",
-        "Siglip2EncoderLayer",
         "Siglip2VisionEmbeddings",
         "Siglip2EncoderLayer",
         "Siglip2MultiheadAttentionPoolingHead",
diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py
index 93f99f4269c..ee66e94090d 100644
--- a/tests/models/aimv2/test_modeling_aimv2.py
+++ b/tests/models/aimv2/test_modeling_aimv2.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.