# coding=utf-8
# Copyright 2024 The HuggingFace Team Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a clone of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest


LLAMA_CLM_FORWARD = """        The [`LlamaForCausalLM`] forward method, overrides the `__call__` special method.\n\n        <Tip>\n\n        Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n        instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n        the latter silently ignores them.\n\n        </Tip>\n\n        Args:\n            input_ids (`Optional[torch.LongTensor]`)of shape `(batch_size, sequence_length)`):\n                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n                [`PreTrainedTokenizer.__call__`] for details.\n\n                [What are input IDs?](../glossary#input-ids)\n            attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n                - 1 for tokens that are **not masked**,\n                - 0 for tokens that are **masked**.\n\n                [What are attention masks?](../glossary#attention-mask)\n            position_ids (`Optional[torch.LongTensor]`):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n                [What are position IDs?](../glossary#position-ids)\n            past_key_values (`Optional[~cache_utils.Cache]`):\n                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n                blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n                returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n                Two formats are allowed:\n                    - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n                    - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n                    shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n                    cache format.\n\n                The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n                legacy cache format will be returned.\n\n                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don\'t\n                have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n                of shape `(batch_size, sequence_length)`.\n            inputs_embeds (`Optional[torch.FloatTensor]`):\n                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n                is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n                model\'s internal embedding lookup matrix.\n            labels (`Optional[torch.LongTensor]`) of shape `(batch_size, sequence_length)`:\n                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,\n                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored\n                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.\n            use_cache (`Optional[bool]`):\n                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n                `past_key_values`).\n            output_attentions (`Optional[bool]`):\n                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n                tensors for more detail.\n            output_hidden_states (`Optional[bool]`):\n                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for\n                more detail.\n            cache_position (`Optional[torch.LongTensor]`):\n                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,\n                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer\n                the complete sequence length.\n            logits_to_keep (`Union[int, torch.Tensor]`, defaults to `0`):\n                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all\n                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that\n                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.\n                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.\n                This is useful when using packed tensor format (single dimension for batch and sequence length).\n\n        Returns:\n            [`transformers.modeling_outputs.CausalLMOutputWithPast`] or `tuple(torch.FloatTensor)`: A [`transformers.modeling_outputs.CausalLMOutputWithPast`] or a tuple of\n            `torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various\n            elements depending on the configuration ([`LlamaConfig`]) and inputs.\n\n            - **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) -- Language modeling loss (for next-token prediction).\n            - **logits** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) -- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).\n            - **past_key_values** (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`) -- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape\n              `(batch_size, num_heads, sequence_length, embed_size_per_head)`)\n\n              Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see\n              `past_key_values` input) to speed up sequential decoding.\n            - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`) -- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +\n              one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n              Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n            - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`) -- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n              sequence_length)`.\n\n              Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n              heads.\n\n        Example:\n\n        ```python\n        >>> from transformers import AutoTokenizer, LlamaForCausalLM\n\n        >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")\n        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")\n\n        >>> prompt = "Hey, are you conscious? Can you talk to me?"\n        >>> inputs = tokenizer(prompt, return_tensors="pt")\n\n        >>> # Generate\n        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)\n        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n        "Hey, are you conscious? Can you talk to me?\\nI\'m not conscious, but I can talk to you."\n        ```"""

LLAMA_MODEL_DOCSTRING = """        The [`LlamaModel`] forward method, overrides the `__call__` special method.\n\n        <Tip>\n\n        Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n        instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n        the latter silently ignores them.\n\n        </Tip>\n\n        Args:\n            input_ids (`Optional[torch.LongTensor]`)of shape `(batch_size, sequence_length)`):\n                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n                [`PreTrainedTokenizer.__call__`] for details.\n\n                [What are input IDs?](../glossary#input-ids)\n            attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n                - 1 for tokens that are **not masked**,\n                - 0 for tokens that are **masked**.\n\n                [What are attention masks?](../glossary#attention-mask)\n            position_ids (`Optional[torch.LongTensor]`):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n                [What are position IDs?](../glossary#position-ids)\n            past_key_values (`Optional[~cache_utils.Cache]`):\n                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n                blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n                returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n                Two formats are allowed:\n                    - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n                    - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n                    shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n                    cache format.\n\n                The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n                legacy cache format will be returned.\n\n                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't\n                have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n                of shape `(batch_size, sequence_length)`.\n            inputs_embeds (`Optional[torch.FloatTensor]`):\n                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n                is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n                model's internal embedding lookup matrix.\n            use_cache (`Optional[bool]`):\n                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n                `past_key_values`).\n            output_attentions (`Optional[bool]`):\n                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n                tensors for more detail.\n            output_hidden_states (`Optional[bool]`):\n                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for\n                more detail.\n            cache_position (`Optional[torch.LongTensor]`):\n                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,\n                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer\n                the complete sequence length.\n\n        Returns:\n            [`transformers.modeling_outputs.BaseModelOutputWithPast`] or `tuple(torch.FloatTensor)`: A [`transformers.modeling_outputs.BaseModelOutputWithPast`] or a tuple of\n            `torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various\n            elements depending on the configuration ([`LlamaConfig`]) and inputs.\n\n            - **last_hidden_state** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) -- Sequence of hidden-states at the output of the last layer of the model.\n\n              If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,\n              hidden_size)` is output.\n            - **past_key_values** (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`) -- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape\n              `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if\n              `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,\n              encoder_sequence_length, embed_size_per_head)`.\n\n              Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if\n              `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`\n              input) to speed up sequential decoding.\n            - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`) -- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +\n              one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n              Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n            - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`) -- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n              sequence_length)`.\n\n              Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n              heads.\n"""

LLAMA_DECODER = """        The [`LlamaDecoderLayer`] forward method, overrides the `__call__` special method.\n\n        <Tip>\n\n        Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n        instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n        the latter silently ignores them.\n\n        </Tip>\n\n        Args:\n            hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)            attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n                - 1 for tokens that are **not masked**,\n                - 0 for tokens that are **masked**.\n\n                [What are attention masks?](../glossary#attention-mask)\n            position_ids (`Optional[torch.LongTensor]`):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n                [What are position IDs?](../glossary#position-ids)\n            past_key_value (`Optional[~cache_utils.Cache]`):deprecated in favor of `past_key_values`            output_attentions (`Optional[bool]`, defaults to `False`):\n                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n                tensors for more detail.\n            use_cache (`Optional[bool]`, defaults to `False`):\n                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n                `past_key_values`).\n            cache_position (`Optional[torch.LongTensor]`):\n                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,\n                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer\n                the complete sequence length.\n            position_embeddings (`Optional[Tuple[torch.Tensor, torch.Tensor]]`):\n                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,\n                with `head_dim` being the embedding dimension of each attention head.\n\n        Returns:\n            `Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]`"""

LLAMA_FOR_SEQUENCE_CLASSIFICATION_DOC = """        The [`LlamaForSequenceClassification`] forward method, overrides the `__call__` special method.\n\n        <Tip>\n\n        Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n        instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n        the latter silently ignores them.\n\n        </Tip>\n\n        Args:\n            input_ids (`Optional[torch.LongTensor]`)of shape `(batch_size, sequence_length)`):\n                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n                [`PreTrainedTokenizer.__call__`] for details.\n\n                [What are input IDs?](../glossary#input-ids)\n            attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n                - 1 for tokens that are **not masked**,\n                - 0 for tokens that are **masked**.\n\n                [What are attention masks?](../glossary#attention-mask)\n            position_ids (`Optional[torch.LongTensor]`):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n                [What are position IDs?](../glossary#position-ids)\n            past_key_values (`Optional[~cache_utils.Cache]`):\n                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n                blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n                returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n                Two formats are allowed:\n                    - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n                    - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n                    shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n                    cache format.\n\n                The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n                legacy cache format will be returned.\n\n                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don\'t\n                have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n                of shape `(batch_size, sequence_length)`.\n            inputs_embeds (`Optional[torch.FloatTensor]`):\n                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n                is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n                model\'s internal embedding lookup matrix.\n            labels (`Optional[torch.LongTensor]`):\n                Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\n                config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\n                `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\n            use_cache (`Optional[bool]`):\n                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n                `past_key_values`).\n            output_attentions (`Optional[bool]`):\n                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n                tensors for more detail.\n            output_hidden_states (`Optional[bool]`):\n                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for\n                more detail.\n\n        Returns:\n            [`transformers.modeling_outputs.SequenceClassifierOutputWithPast`] or `tuple(torch.FloatTensor)`: A [`transformers.modeling_outputs.SequenceClassifierOutputWithPast`] or a tuple of\n            `torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various\n            elements depending on the configuration ([`LlamaConfig`]) and inputs.\n\n            - **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) -- Classification (or regression if config.num_labels==1) loss.\n            - **logits** (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`) -- Classification (or regression if config.num_labels==1) scores (before SoftMax).\n            - **past_key_values** (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`) -- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape\n              `(batch_size, num_heads, sequence_length, embed_size_per_head)`)\n\n              Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see\n              `past_key_values` input) to speed up sequential decoding.\n            - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`) -- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +\n              one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n              Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n            - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`) -- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n              sequence_length)`.\n\n              Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n              heads.\n\n        Example of single-label classification:\n\n        ```python\n        >>> import torch\n        >>> from transformers import AutoTokenizer, LlamaForSequenceClassification\n\n        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")\n        >>> model = LlamaForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf")\n\n        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")\n\n        >>> with torch.no_grad():\n        ...     logits = model(**inputs).logits\n\n        >>> predicted_class_id = logits.argmax().item()\n        >>> model.config.id2label[predicted_class_id]\n        ...\n\n        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`\n        >>> num_labels = len(model.config.id2label)\n        >>> model = LlamaForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf", num_labels=num_labels)\n\n        >>> labels = torch.tensor([1])\n        >>> loss = model(**inputs, labels=labels).loss\n        >>> round(loss.item(), 2)\n        ...\n        ```\n\n        Example of multi-label classification:\n\n        ```python\n        >>> import torch\n        >>> from transformers import AutoTokenizer, LlamaForSequenceClassification\n\n        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")\n        >>> model = LlamaForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf", problem_type="multi_label_classification")\n\n        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")\n\n        >>> with torch.no_grad():\n        ...     logits = model(**inputs).logits\n\n        >>> predicted_class_ids = torch.arange(0, logits.shape[-1])[torch.sigmoid(logits).squeeze(dim=0) > 0.5]\n\n        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`\n        >>> num_labels = len(model.config.id2label)\n        >>> model = LlamaForSequenceClassification.from_pretrained(\n        ...     "meta-llama/Llama-2-7b-hf", num_labels=num_labels, problem_type="multi_label_classification"\n        ... )\n\n        >>> labels = torch.sum(\n        ...     torch.nn.functional.one_hot(predicted_class_ids[None, :].clone(), num_classes=num_labels), dim=1\n        ... ).to(torch.float)\n        >>> loss = model(**inputs, labels=labels).loss\n        ```\n"""

GEMMA3_IMAGE_PROCESSOR_FAST_DOCSTRING = """\nConstructs a fast Gemma3 image processor.\n\nParameters:\n    do_resize (`Optional[bool]`, defaults to `True`):\n        Whether to resize the image.\n    size (`Optional[dict[str, int]]`, defaults to `{\'height\': 224, \'width\': 224}`):\n        Describes the maximum input dimensions to the model.\n    default_to_square (`Optional[bool]`, defaults to `True`):\n        Whether to default to a square image when resizing, if size is an int.\n    resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`, defaults to `2`):\n        Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n        has an effect if `do_resize` is set to `True`.\n    do_center_crop (`Optional[bool]`, defaults to `None`):\n        Whether to center crop the image.\n    crop_size (`Optional[dict[str, int]]`, defaults to `None`):\n        Size of the output image after applying `center_crop`.\n    do_rescale (`Optional[bool]`, defaults to `True`):\n        Whether to rescale the image.\n    rescale_factor (`Union[int, float, NoneType]`, defaults to `0.00392156862745098`):\n        Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n    do_normalize (`Optional[bool]`, defaults to `True`):\n        Whether to normalize the image.\n    image_mean (`Union[float, list[float], NoneType]`, defaults to `[0.5, 0.5, 0.5]`):\n        Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n    image_std (`Union[float, list[float], NoneType]`, defaults to `[0.5, 0.5, 0.5]`):\n        Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n        `True`.\n    do_convert_rgb (`Optional[bool]`, defaults to `None`):\n        Whether to convert the image to RGB.\n    return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`, defaults to `None`):\n        Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n    data_format (`Optional[~image_utils.ChannelDimension]`, defaults to `ChannelDimension.FIRST`):\n        Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n    input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`, defaults to `None`):\n        The channel dimension format for the input image. If unset, the channel dimension format is inferred\n        from the input image. Can be one of:\n        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.\n    device (`Optional[torch.device]`, defaults to `None`):\n        The device to process the images on. If unset, the device is inferred from the input images.\n    do_pan_and_scan (`Optional[bool]`, defaults to `None`):\n        Whether to apply `pan_and_scan` to images.\n    pan_and_scan_min_crop_size (`Optional[int]`, defaults to `None`):\n        Minimum size of each crop in pan and scan.\n    pan_and_scan_max_num_crops (`Optional[int]`, defaults to `None`):\n        Maximum number of crops per image in pan and scan.\n    pan_and_scan_min_ratio_to_activate (`Optional[float]`, defaults to `None`):\n        Minimum aspect ratio to activate pan and scan.\n"""

GEMMA3_IMAGE_PROCESSOR_FAST_PREPROCESS_DOCSTRING = """        Args:\n            images (`Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, list[\'PIL.Image.Image\'], list[numpy.ndarray], list[\'torch.Tensor\']]`):\n                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If\n                passing in images with pixel values between 0 and 1, set `do_rescale=False`.\n            do_resize (`Optional[bool]`):\n                Whether to resize the image.\n            size (`Optional[dict[str, int]]`):\n                Describes the maximum input dimensions to the model.\n            default_to_square (`Optional[bool]`):\n                Whether to default to a square image when resizing, if size is an int.\n            resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`):\n                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n                has an effect if `do_resize` is set to `True`.\n            do_center_crop (`Optional[bool]`):\n                Whether to center crop the image.\n            crop_size (`Optional[dict[str, int]]`):\n                Size of the output image after applying `center_crop`.\n            do_rescale (`Optional[bool]`):\n                Whether to rescale the image.\n            rescale_factor (`Union[int, float, NoneType]`):\n                Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n            do_normalize (`Optional[bool]`):\n                Whether to normalize the image.\n            image_mean (`Union[float, list[float], NoneType]`):\n                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n            image_std (`Union[float, list[float], NoneType]`):\n                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n                `True`.\n            do_convert_rgb (`Optional[bool]`):\n                Whether to convert the image to RGB.\n            return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`):\n                Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n            data_format (`Optional[~image_utils.ChannelDimension]`):\n                Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n            input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`):\n                The channel dimension format for the input image. If unset, the channel dimension format is inferred\n                from the input image. Can be one of:\n                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.\n            device (`Optional[torch.device]`):\n                The device to process the images on. If unset, the device is inferred from the input images.\n            do_pan_and_scan (`Optional[bool]`):\n                Whether to apply `pan_and_scan` to images.\n            pan_and_scan_min_crop_size (`Optional[int]`):\n                Minimum size of each crop in pan and scan.\n            pan_and_scan_max_num_crops (`Optional[int]`):\n                Maximum number of crops per image in pan and scan.\n            pan_and_scan_min_ratio_to_activate (`Optional[float]`):\n                Minimum aspect ratio to activate pan and scan.\n\n        Returns:\n            `<class \'transformers.image_processing_base.BatchFeature\'>`:\n                - **data** (`dict`) -- Dictionary of lists/arrays/tensors returned by the __call__ method (\'pixel_values\', etc.).\n                - **tensor_type** (`Union[None, str, TensorType]`, *optional*) -- You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at\n                  initialization.\n"""


class AutoDocstringTest(unittest.TestCase):
    pass
    # def test_modeling_docstring(self):
    #     llama_docstring = "        Args:\n            images (`Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, list['PIL.Image.Image'], list[numpy.ndarray], list['torch.Tensor']]`):\n                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If\n                passing in images with pixel values between 0 and 1, set `do_rescale=False`.\n            do_resize (`Optional[bool]`):\n                Whether to resize the image.\n            size (`Optional[dict[str, int]]`):\n                Describes the maximum input dimensions to the model.\n            default_to_square (`Optional[bool]`):\n                Whether to default to a square image when resizing, if size is an int.\n            resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`):\n                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n                has an effect if `do_resize` is set to `True`.\n            do_center_crop (`Optional[bool]`):\n                Whether to center crop the image.\n            crop_size (`Optional[dict[str, int]]`):\n                Size of the output image after applying `center_crop`.\n            do_rescale (`Optional[bool]`):\n                Whether to rescale the image.\n            rescale_factor (`Union[int, float, NoneType]`):\n                Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n            do_normalize (`Optional[bool]`):\n                Whether to normalize the image.\n            image_mean (`Union[float, list[float], NoneType]`):\n                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n            image_std (`Union[float, list[float], NoneType]`):\n                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n                `True`.\n            do_convert_rgb (`Optional[bool]`):\n                Whether to convert the image to RGB.\n            return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`):\n                Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n            data_format (`Optional[~image_utils.ChannelDimension]`):\n                Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n            input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`):\n                The channel dimension format for the input image. If unset, the channel dimension format is inferred\n                from the input image. Can be one of:\n                - `\"channels_first\"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n                - `\"channels_last\"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n                - `\"none\"` or `ChannelDimension.NONE`: image in (height, width) format.\n            device (`Optional[torch.device]`):\n                The device to process the images on. If unset, the device is inferred from the input images.\n            do_pan_and_scan (`Optional[bool]`):\n                Whether to apply `pan_and_scan` to images.\n            pan_and_scan_min_crop_size (`Optional[int]`):\n                Minimum size of each crop in pan and scan.\n            pan_and_scan_max_num_crops (`Optional[int]`):\n                Maximum number of crops per image in pan and scan.\n            pan_and_scan_min_ratio_to_activate (`Optional[float]`):\n                Minimum aspect ratio to activate pan and scan.\n\n        Returns:\n            `<class 'transformers.image_processing_base.BatchFeature'>`:\n                - **data** (`dict`) -- Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).\n                - **tensor_type** (`Union[None, str, TensorType]`, *optional*) -- You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at\n                  initialization.\n"
    #     self.assertEqual(llama_docstring, LlamaModel.__doc__)

    #     self.assertEqual(LLAMA_MODEL_DOCSTRING, LlamaModel.forward.__doc__)
    #     self.assertEqual(LLAMA_CLM_FORWARD, LlamaForCausalLM.forward.__doc__)
    #     self.assertEqual(LLAMA_DECODER, LlamaDecoderLayer.forward.__doc__)
    #     self.assertEqual(LLAMA_FOR_SEQUENCE_CLASSIFICATION_DOC, LlamaForSequenceClassification.forward.__doc__)

    # def test_fast_image_processor_docstring(self):
    #     self.assertEqual(GEMMA3_IMAGE_PROCESSOR_FAST_DOCSTRING, Gemma3ImageProcessorFast.__doc__)
    #     self.assertEqual(GEMMA3_IMAGE_PROCESSOR_FAST_PREPROCESS_DOCSTRING, Gemma3ImageProcessorFast.preprocess.__doc__)

    # def test_auto_doc(self):
    #     COOL_CLASS_DOC = """
    #     Args:
    #         input_ids (some):
    #         flash_attn_kwargs (FlashAttentionKwrargs):
    #             parameters that are completely optional and that should be passed.
    #         another_warg (something): should pass
    #         and_another_on (this time):
    #             I want
    #             this to be
    #             quite long

    #     Example

    #     ```python
    #     >>> import
    #     ```
    #     """

    #     @auto_docstring
    #     class MyModel:
    #         @auto_docstring
    #         def __init__(input_ids, flash_attn_kwargs=None, another_warg=True, and_another_on=1):
    #             r"""
    #             Args:
    #                 flash_attn_kwargs (FlashAttentionKwrargs):
    #                     parameters that are completely optional and that should be passed.
    #                 another_warg (something): should pass
    #                 and_another_on (this time):
    #                     I want
    #                     this to be
    #                     quite long

    #             Example

    #             ```python
    #             >>> import
    #             ```
    #             """
    #             pass

    #     self.assertEqual(MyModel.__init__.__doc__, COOL_CLASS_DOC)