diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py
index 66ad345280f..666535cb7c4 100644
--- a/src/transformers/models/aimv2/modeling_aimv2.py
+++ b/src/transformers/models/aimv2/modeling_aimv2.py
@@ -34,15 +34,7 @@ from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from ...utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    auto_docstring,
-    can_return_tuple,
-    logging,
-    replace_return_docstrings,
-)
+from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, replace_return_docstrings
 from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig
 
 
@@ -449,6 +441,7 @@ class Aimv2AttentionPoolingHead(nn.Module):
         return output
 
 
+@auto_docstring
 class Aimv2PreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -458,7 +451,12 @@ class Aimv2PreTrainedModel(PreTrainedModel):
     config_class = Aimv2Config
     base_model_prefix = "aimv2"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["AIMv2SwiGLUFFN"]
+    _no_split_modules = [
+        "Aimv2EncoderLayer",
+        "Aimv2AttentionPoolingHead",
+        "Aimv2VisionEmbeddings",
+        "Aimv2TextEmbeddings",
+    ]
     _supports_sdpa = True
 
     def _init_weights(self, module):
@@ -482,42 +480,10 @@ class Aimv2PreTrainedModel(PreTrainedModel):
             module.cls_token.data.normal_(mean=0.0, std=std)
 
 
-AIMV2_VISION_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`Aimv2VisionConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-AIMV2_VISION_INPUTS_DOCSTRING = r"""
-    Args:
-        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
-            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
-            Whether to interpolate the pre-trained position encodings.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-""" - - -@add_start_docstrings( - """The vision model from AIMv2 without any head or projection on top.""", - AIMV2_VISION_START_DOCSTRING, +@auto_docstring( + custom_intro=""" + The Vision model from AIMv2 without any head or projection on top. + """ ) class Aimv2VisionModel(Aimv2PreTrainedModel): main_input_name = "pixel_values" @@ -540,7 +506,7 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): return self.embeddings.patch_embed @can_return_tuple - @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) + @auto_docstring @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig) def forward( self, @@ -597,6 +563,11 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): ) +@auto_docstring( + custom_intro=""" + The text model from AIMv2 without any head or projection on top. + """ +) class Aimv2TextModel(Aimv2PreTrainedModel): main_input_name = "input_ids" @@ -618,6 +589,7 @@ class Aimv2TextModel(Aimv2PreTrainedModel): self.embeddings.token_embedding = value @can_return_tuple + @auto_docstring def forward( self, input_ids, diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 9572dd672be..a45f03ebf43 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -27,8 +27,7 @@ from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, + auto_docstring, can_return_tuple, logging, replace_return_docstrings, @@ -289,8 +288,6 @@ class Aimv2Config(SiglipConfig): del self.initializer_factor - pass - class Aimv2Output(SiglipOutput): pass @@ -434,39 +431,7 @@ class Aimv2AttentionPoolingHead(nn.Module): return output -AIMV2_VISION_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Aimv2VisionConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -AIMV2_VISION_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults `False`): - Whether to interpolate the pre-trained position encodings. 
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
+@auto_docstring
 class Aimv2PreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -476,7 +441,12 @@ class Aimv2PreTrainedModel(PreTrainedModel):
     config_class = Aimv2Config
     base_model_prefix = "aimv2"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["AIMv2SwiGLUFFN"]
+    _no_split_modules = [
+        "Aimv2EncoderLayer",
+        "Aimv2AttentionPoolingHead",
+        "Aimv2VisionEmbeddings",
+        "Aimv2TextEmbeddings",
+    ]
     _supports_sdpa = True
 
     def _init_weights(self, module):
@@ -500,9 +470,10 @@ class Aimv2PreTrainedModel(PreTrainedModel):
             module.cls_token.data.normal_(mean=0.0, std=std)
 
 
-@add_start_docstrings(
-    """The vision model from AIMv2 without any head or projection on top.""",
-    AIMV2_VISION_START_DOCSTRING,
+@auto_docstring(
+    custom_intro="""
+    The Vision model from AIMv2 without any head or projection on top.
+    """
 )
 class Aimv2VisionModel(Aimv2PreTrainedModel):
     main_input_name = "pixel_values"
@@ -525,7 +496,7 @@ class Aimv2VisionModel(Aimv2PreTrainedModel):
         return self.embeddings.patch_embed
 
     @can_return_tuple
-    @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING)
+    @auto_docstring
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig)
     def forward(
         self,
@@ -582,6 +553,11 @@
         )
 
 
+@auto_docstring(
+    custom_intro="""
+    The text model from AIMv2 without any head or projection on top.
+    """
+)
 class Aimv2TextModel(Aimv2PreTrainedModel):
     main_input_name = "input_ids"
 
@@ -603,6 +579,7 @@ class Aimv2TextModel(Aimv2PreTrainedModel):
         self.embeddings.token_embedding = value
 
     @can_return_tuple
+    @auto_docstring
     def forward(
         self,
         input_ids,
@@ -648,6 +625,7 @@
         )
 
 
+@auto_docstring
 class Aimv2Model(CLIPModel, nn.Module):
     def __init__(self, config: Aimv2Config):
         nn.Module().__init__(config)
diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py
index fa5e3f6dc84..0756180e590 100644
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -510,7 +510,6 @@ class SiglipPreTrainedModel(PreTrainedModel):
 
     _no_split_modules = [
         "SiglipTextEmbeddings",
-        "SiglipEncoderLayer",
         "SiglipVisionEmbeddings",
         "SiglipEncoderLayer",
         "SiglipMultiheadAttentionPoolingHead",
diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py
index eb3bf5d4a34..f3e79f143bb 100644
--- a/src/transformers/models/siglip2/modeling_siglip2.py
+++ b/src/transformers/models/siglip2/modeling_siglip2.py
@@ -742,7 +742,6 @@ class Siglip2PreTrainedModel(PreTrainedModel):
 
     _no_split_modules = [
         "Siglip2TextEmbeddings",
-        "Siglip2EncoderLayer",
         "Siglip2VisionEmbeddings",
         "Siglip2EncoderLayer",
         "Siglip2MultiheadAttentionPoolingHead",
diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py
index 93f99f4269c..ee66e94090d 100644
--- a/tests/models/aimv2/test_modeling_aimv2.py
+++ b/tests/models/aimv2/test_modeling_aimv2.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.