Support custom docstrings in modular (#36726)

* Override docstrings in modular if not None

* Update doc
Yoni Gozlan 2025-03-18 14:00:54 -04:00 committed by GitHub
parent 00915d3041
commit 12f2ebef63
5 changed files with 27 additions and 25 deletions


@@ -546,7 +546,7 @@ This makes it very easy to switch decorators and makes it explicit that the only
 ## Docstring variables
-If an object is defined in both the modular file and the modeling file it inherits from, the modular definition takes precedence, except for assignments containing the pattern `DOCSTRING`. These variables are typically used in `MODEL_START_DOCSTRING` and `MODEL_INPUT_DOCSTRING` in the modeling files. They are big blocks of docstrings and the linter rewrites the names everywhere. For this reason, assignments containing the `DOCSTRING` variable always use the definition found in the source file instead of the modular file.
+If an object is defined in both the modular file and the modeling file it inherits from, the modular definition takes precedence, except for assignments containing the pattern `DOCSTRING`. These variables are typically used in `MODEL_START_DOCSTRING` and `MODEL_INPUT_DOCSTRING` in the modeling files. They are big blocks of docstrings and the linter rewrites the names everywhere. For this reason, assignments containing the `DOCSTRING` variable can use the definition found in the source file without copying the whole docstring, simply by setting the variable to `None` in the modular file.
 This is very useful if you need the variable reference somewhere but you don't want to clutter the modular file with docstrings which are always the same. The example code below allows you to automatically use the same docstrings from [Mistral](./model_doc/mistral) in [Starcoder2](./model_doc/starcoder2).
@@ -561,6 +561,8 @@ class Starcoder2Model(MistralModel):
     ...
 ```
+
+Setting the variable to anything other than `None` will override the docstring, so that you can customize it if needed.
 ## Special naming
 The linter automatically renames everything when inheriting from a class. For consistency, you should always use the same class name prefix when inheriting from different classes from the same file.
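To summarize the docstring-variable convention described above, a modular file can either inherit a docstring from its source model or override it. This is a minimal sketch with made-up variable names (`MYMODEL_*` is hypothetical, not a real model):

```python
# Hypothetical modular-file excerpt illustrating the two options.

# Option 1: set the variable to None so the linter substitutes the
# docstring defined in the source modeling file (e.g. Mistral's) verbatim.
MYMODEL_INPUTS_DOCSTRING = None  # Will be picked up by modular

# Option 2: any value other than None overrides the inherited docstring.
MYMODEL_CUSTOM_DOCSTRING = r"""
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Custom description specific to this model.
"""
```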


@@ -57,7 +57,7 @@ _CONFIG_FOR_DOC = "Gemma3Config"
 logger = logging.get_logger(__name__)
-GEMMA3_INPUTS_DOCSTRING = ""
+GEMMA3_INPUTS_DOCSTRING = None  # Will be picked up by modular
 class Gemma3TextConfig(Gemma2Config):


@@ -625,10 +625,10 @@ GOT_OCR2_INPUTS_DOCSTRING = r"""
         [`PreTrainedTokenizer.__call__`] for details.
         [What are input IDs?](../glossary#input-ids)
-    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+    pixel_values (`torch.FloatTensor` of shape `(seq_length, num_channels * image_size * image_size)`):
         The tensors corresponding to the input images. Pixel values can be obtained using
-        [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([`GotOcr2Processor`] uses
-        [`CLIPImageProcessor`] for processing images).
+        [`AutoImageProcessor`]. See [`GotOcr2ImageProcessor.__call__`] for details. [`GotOcr2Processor`] uses
+        [`GotOcr2ImageProcessor`] for processing images.
     attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
         Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
@@ -667,13 +667,6 @@ GOT_OCR2_INPUTS_DOCSTRING = r"""
         Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
         is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
         model's internal embedding lookup matrix.
-    vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2):
-        The index of the layer to select the vision feature. If multiple indices are provided,
-        the vision feature of the corresponding indices will be concatenated to form the
-        vision features.
-    vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
-        The feature selection strategy used to select the vision feature from the vision backbone.
-        Can be one of `"default"` or `"full"`.
     use_cache (`bool`, *optional*):
         If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
         `past_key_values`).


@@ -290,6 +290,10 @@ GOT_OCR2_INPUTS_DOCSTRING = r"""
         [`PreTrainedTokenizer.__call__`] for details.
         [What are input IDs?](../glossary#input-ids)
+    pixel_values (`torch.FloatTensor` of shape `(seq_length, num_channels * image_size * image_size)`):
+        The tensors corresponding to the input images. Pixel values can be obtained using
+        [`AutoImageProcessor`]. See [`GotOcr2ImageProcessor.__call__`] for details. [`GotOcr2Processor`] uses
+        [`GotOcr2ImageProcessor`] for processing images.
     attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
         Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
@@ -331,10 +335,6 @@ GOT_OCR2_INPUTS_DOCSTRING = r"""
     use_cache (`bool`, *optional*):
         If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
         `past_key_values`).
-    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
-        Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
-        this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
-        the complete sequence length.
     output_attentions (`bool`, *optional*):
         Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
         tensors for more detail.
@@ -343,10 +343,10 @@ GOT_OCR2_INPUTS_DOCSTRING = r"""
         more detail.
     return_dict (`bool`, *optional*):
         Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-    pixel_values (`torch.FloatTensor` of shape `(seq_length, num_channels * image_size * image_size)`):
-        The tensors corresponding to the input images. Pixel values can be obtained using
-        [`AutoImageProcessor`]. See [`GotOcr2ImageProcessor.__call__`] for details. [`GotOcr2Processor`] uses
-        [`GotOcr2ImageProcessor`] for processing images.
+    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+        Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+        this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+        the complete sequence length.
 """


@@ -537,6 +537,9 @@ def find_all_dependencies(
 # Top-level variables that match the following patterns will always use the value in the `modular_xxx.py` file
 ASSIGNMENTS_REGEX_TO_KEEP = [r"_CHECKPOINT", r"_EXPECTED", r"_FOR_DOC"]
+# Top-level variables that match the following patterns will use the value in the `modular_xxx.py` file only if they are not None
+ASSIGNMENTS_REGEX_TO_KEEP_IF_NOT_NONE = [r"_DOCSTRING"]
+
 class ClassDependencyMapper(CSTVisitor):
     """A visitor which is designed to analyze a single class node to get all its dependencies that are shared with the set of
@@ -854,13 +857,17 @@ class ModelFileMapper(ModuleMapper):
         """Update the global nodes with the assignment from the modular file.
-        Merging rule: if any assignment with the same name was redefined in the modular, we use it and its dependencies ONLY if it matches
-        a pattern in `ASSIGNMENTS_REGEX_TO_KEEP`. Otherwise, we use the original value and dependencies. This rule was chosen to avoid having to rewrite the
-        big docstrings.
+        Merging rule: if any assignment with the same name was redefined in the modular, we use it and its dependencies ONLY if it matches
+        a pattern in `ASSIGNMENTS_REGEX_TO_KEEP_IF_NOT_NONE` and its value is not None, or if it matches a pattern in `ASSIGNMENTS_REGEX_TO_KEEP`.
+        Otherwise, we use the original value and dependencies. This rule was chosen to avoid having to rewrite the big docstrings.
         """
         for assignment, node in assignments.items():
             should_keep = any(re.search(pattern, assignment) for pattern in ASSIGNMENTS_REGEX_TO_KEEP)
-            if should_keep or assignment not in self.assignments:
+            should_keep_if_not_none = any(
+                re.search(pattern, assignment) for pattern in ASSIGNMENTS_REGEX_TO_KEEP_IF_NOT_NONE
+            ) and not (hasattr(node.body[0].value, "value") and node.body[0].value.value == "None")
+            if should_keep or should_keep_if_not_none or assignment not in self.assignments:
                 self.assignments[assignment] = node
                 if assignment in object_mapping:
                     self.object_dependency_mapping[assignment] = object_mapping[assignment]
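The merging rule added in the converter hunk above can be mirrored as a small standalone function. This is a simplified sketch (`keep_modular_assignment` and its string-based `value_repr` parameter are hypothetical; the real code inspects libcst nodes rather than strings):

```python
import re

# Patterns mirrored from the converter change above.
ASSIGNMENTS_REGEX_TO_KEEP = [r"_CHECKPOINT", r"_EXPECTED", r"_FOR_DOC"]
ASSIGNMENTS_REGEX_TO_KEEP_IF_NOT_NONE = [r"_DOCSTRING"]

def keep_modular_assignment(name: str, value_repr: str, already_in_modeling: bool) -> bool:
    """Return True if the modular file's assignment should win over the
    modeling file's. Standalone mirror of the merging rule for illustration."""
    # Always keep these patterns, whatever their value.
    should_keep = any(re.search(p, name) for p in ASSIGNMENTS_REGEX_TO_KEEP)
    # Keep *_DOCSTRING assignments only when they are not set to None;
    # a None value means "reuse the docstring from the source file".
    should_keep_if_not_none = (
        any(re.search(p, name) for p in ASSIGNMENTS_REGEX_TO_KEEP_IF_NOT_NONE)
        and value_repr != "None"
    )
    # New names that only exist in the modular file are always kept.
    return should_keep or should_keep_if_not_none or not already_in_modeling
```

With this rule, `MYMODEL_INPUTS_DOCSTRING = None` in a modular file falls back to the source definition, while any non-None value overrides it.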