transformers/tests/utils/test_auto_docstring.py

# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
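
# Expected docstring fixtures for the `@auto_docstring` decorator. Each constant below holds the
# full docstring the decorator is expected to build from the method signature plus the hand-written
# argument descriptions; the (temporarily disabled) assertions at the bottom of this file compare
# them against the corresponding `__doc__` attributes.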
LLAMA_CLM_FORWARD = """ The [`LlamaForCausalLM`] forward method, overrides the `__call__` special method.\n\n <Tip>\n\n Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n the latter silently ignores them.\n\n </Tip>\n\n Args:\n input_ids (`Optional[torch.LongTensor]`)of shape `(batch_size, sequence_length)`):\n Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n [`PreTrainedTokenizer.__call__`] for details.\n\n [What are input IDs?](../glossary#input-ids)\n attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n - 1 for tokens that are **not masked**,\n - 0 for tokens that are **masked**.\n\n [What are attention masks?](../glossary#attention-mask)\n position_ids (`Optional[torch.LongTensor]`):\n Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n [What are position IDs?](../glossary#position-ids)\n past_key_values (`Optional[~cache_utils.Cache]`):\n Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n Two formats are allowed:\n - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n cache format.\n\n The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n legacy cache format will be returned.\n\n If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don\'t\n have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n of shape `(batch_size, sequence_length)`.\n inputs_embeds (`Optional[torch.FloatTensor]`):\n Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n model\'s internal embedding lookup matrix.\n labels (`Optional[torch.LongTensor]`) of shape `(batch_size, sequence_length)`:\n Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,\n config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored\n (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.\n use_cache (`Optional[bool]`):\n If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n `past_key_values`).\n output_attentions (`Optional[bool]`):\n Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under returned\n tensors for more detail.\n output_hidden_states (`Optional[bool]`):\n Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for\n more detail.\n cache_position (`Optional[torch.LongTensor]`):\n Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,\n this tensor is not affected by padding. It is used to update the cache in the correct position and to infer\n the complete sequence length.\n logits_to_keep (`Union[int, torch.Tensor]`, defaults to `0`):\n If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all\n `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that\n token can save memory, which becomes pretty significant for long sequences or large vocabulary size.\n If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.\n This is useful when using packed tensor format (single dimension for batch and sequence length).\n\n Returns:\n [`transformers.modeling_outputs.CausalLMOutputWithPast`] or `tuple(torch.FloatTensor)`: A [`transformers.modeling_outputs.CausalLMOutputWithPast`] or a tuple of\n `torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various\n elements depending on the configuration ([`LlamaConfig`]) and inputs.\n\n - **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) -- Language modeling loss (for next-token prediction).\n - **logits** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) -- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).\n - **past_key_values** (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`) -- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape\n `(batch_size, num_heads, sequence_length, embed_size_per_head)`)\n\n Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see\n `past_key_values` input) to speed up sequential decoding.\n - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`) -- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +\n one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`) -- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n sequence_length)`.\n\n Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n heads.\n\n Example:\n\n ```python\n >>> from transformers import AutoTokenizer, LlamaForCausalLM\n\n >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")\n >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")\n\n >>> prompt = "Hey, are you conscious? 
Can you talk to me?"\n >>> inputs = tokenizer(prompt, return_tensors="pt")\n\n >>> # Generate\n >>> generate_ids = model.generate(inputs.input_ids, max_length=30)\n >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n "Hey, are you conscious? Can you talk to me?\\nI\'m not conscious, but I can talk to you."\n ```"""
LLAMA_MODEL_DOCSTRING = """ The [`LlamaModel`] forward method, overrides the `__call__` special method.\n\n <Tip>\n\n Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n the latter silently ignores them.\n\n </Tip>\n\n Args:\n input_ids (`Optional[torch.LongTensor]`)of shape `(batch_size, sequence_length)`):\n Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n [`PreTrainedTokenizer.__call__`] for details.\n\n [What are input IDs?](../glossary#input-ids)\n attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n - 1 for tokens that are **not masked**,\n - 0 for tokens that are **masked**.\n\n [What are attention masks?](../glossary#attention-mask)\n position_ids (`Optional[torch.LongTensor]`):\n Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n [What are position IDs?](../glossary#position-ids)\n past_key_values (`Optional[~cache_utils.Cache]`):\n Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n Two formats are allowed:\n - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n cache format.\n\n The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n legacy cache format will be returned.\n\n If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't\n have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n of shape `(batch_size, sequence_length)`.\n inputs_embeds (`Optional[torch.FloatTensor]`):\n Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n model's internal embedding lookup matrix.\n use_cache (`Optional[bool]`):\n If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n `past_key_values`).\n output_attentions (`Optional[bool]`):\n Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n tensors for more detail.\n output_hidden_states (`Optional[bool]`):\n Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for\n more detail.\n cache_position (`Optional[torch.LongTensor]`):\n Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,\n this tensor is not affected by padding. 
It is used to update the cache in the correct position and to infer\n the complete sequence length.\n\n Returns:\n [`transformers.modeling_outputs.BaseModelOutputWithPast`] or `tuple(torch.FloatTensor)`: A [`transformers.modeling_outputs.BaseModelOutputWithPast`] or a tuple of\n `torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various\n elements depending on the configuration ([`LlamaConfig`]) and inputs.\n\n - **last_hidden_state** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) -- Sequence of hidden-states at the output of the last layer of the model.\n\n If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,\n hidden_size)` is output.\n - **past_key_values** (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`) -- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape\n `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if\n `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,\n encoder_sequence_length, embed_size_per_head)`.\n\n Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if\n `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`\n input) to speed up sequential decoding.\n - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`) -- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +\n one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`) -- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n sequence_length)`.\n\n Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n heads.\n"""
LLAMA_DECODER = """ The [`LlamaDecoderLayer`] forward method, overrides the `__call__` special method.\n\n <Tip>\n\n Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n the latter silently ignores them.\n\n </Tip>\n\n Args:\n hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim) attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n - 1 for tokens that are **not masked**,\n - 0 for tokens that are **masked**.\n\n [What are attention masks?](../glossary#attention-mask)\n position_ids (`Optional[torch.LongTensor]`):\n Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n [What are position IDs?](../glossary#position-ids)\n past_key_value (`Optional[~cache_utils.Cache]`):deprecated in favor of `past_key_values` output_attentions (`Optional[bool]`, defaults to `False`):\n Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n tensors for more detail.\n use_cache (`Optional[bool]`, defaults to `False`):\n If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n `past_key_values`).\n cache_position (`Optional[torch.LongTensor]`):\n Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,\n this tensor is not affected by padding. It is used to update the cache in the correct position and to infer\n the complete sequence length.\n position_embeddings (`Optional[Tuple[torch.Tensor, torch.Tensor]]`):\n Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,\n with `head_dim` being the embedding dimension of each attention head.\n\n Returns:\n `Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]`"""
LLAMA_FOR_SEQUENCE_CLASSIFICATION_DOC = """ The [`LlamaForSequenceClassification`] forward method, overrides the `__call__` special method.\n\n <Tip>\n\n Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n the latter silently ignores them.\n\n </Tip>\n\n Args:\n input_ids (`Optional[torch.LongTensor]`)of shape `(batch_size, sequence_length)`):\n Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n [`PreTrainedTokenizer.__call__`] for details.\n\n [What are input IDs?](../glossary#input-ids)\n attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n - 1 for tokens that are **not masked**,\n - 0 for tokens that are **masked**.\n\n [What are attention masks?](../glossary#attention-mask)\n position_ids (`Optional[torch.LongTensor]`):\n Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n [What are position IDs?](../glossary#position-ids)\n past_key_values (`Optional[~cache_utils.Cache]`):\n Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n Two formats are allowed:\n - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n cache format.\n\n The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n legacy cache format will be returned.\n\n If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don\'t\n have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n of shape `(batch_size, sequence_length)`.\n inputs_embeds (`Optional[torch.FloatTensor]`):\n Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n model\'s internal embedding lookup matrix.\n labels (`Optional[torch.LongTensor]`):\n Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\n config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\n `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\n use_cache (`Optional[bool]`):\n If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n `past_key_values`).\n output_attentions (`Optional[bool]`):\n Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under returned\n tensors for more detail.\n output_hidden_states (`Optional[bool]`):\n Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for\n more detail.\n\n Returns:\n [`transformers.modeling_outputs.SequenceClassifierOutputWithPast`] or `tuple(torch.FloatTensor)`: A [`transformers.modeling_outputs.SequenceClassifierOutputWithPast`] or a tuple of\n `torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various\n elements depending on the configuration ([`LlamaConfig`]) and inputs.\n\n - **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) -- Classification (or regression if config.num_labels==1) loss.\n - **logits** (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`) -- Classification (or regression if config.num_labels==1) scores (before SoftMax).\n - **past_key_values** (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`) -- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape\n `(batch_size, num_heads, sequence_length, embed_size_per_head)`)\n\n Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see\n `past_key_values` input) to speed up sequential decoding.\n - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`) -- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +\n one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.\n\n Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.\n - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`) -- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,\n sequence_length)`.\n\n Attentions weights after the attention softmax, used to compute the weighted average in the self-attention\n heads.\n\n Example of single-label classification:\n\n ```python\n >>> import torch\n >>> from transformers import AutoTokenizer, LlamaForSequenceClassification\n\n >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")\n >>> model = LlamaForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf")\n\n >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")\n\n >>> with torch.no_grad():\n ... 
logits = model(**inputs).logits\n\n >>> predicted_class_id = logits.argmax().item()\n >>> model.config.id2label[predicted_class_id]\n ...\n\n >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`\n >>> num_labels = len(model.config.id2label)\n >>> model = LlamaForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf", num_labels=num_labels)\n\n >>> labels = torch.tensor([1])\n >>> loss = model(**inputs, labels=labels).loss\n >>> round(loss.item(), 2)\n ...\n ```\n\n Example of multi-label classification:\n\n ```python\n >>> import torch\n >>> from transformers import AutoTokenizer, LlamaForSequenceClassification\n\n >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")\n >>> model = LlamaForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf", problem_type="multi_label_classification")\n\n >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")\n\n >>> with torch.no_grad():\n ... logits = model(**inputs).logits\n\n >>> predicted_class_ids = torch.arange(0, logits.shape[-1])[torch.sigmoid(logits).squeeze(dim=0) > 0.5]\n\n >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`\n >>> num_labels = len(model.config.id2label)\n >>> model = LlamaForSequenceClassification.from_pretrained(\n ... "meta-llama/Llama-2-7b-hf", num_labels=num_labels, problem_type="multi_label_classification"\n ... )\n\n >>> labels = torch.sum(\n ... torch.nn.functional.one_hot(predicted_class_ids[None, :].clone(), num_classes=num_labels), dim=1\n ... ).to(torch.float)\n >>> loss = model(**inputs, labels=labels).loss\n ```\n"""
GEMMA3_IMAGE_PROCESSOR_FAST_DOCSTRING = """\nConstructs a fast Gemma3 image processor.\n\nParameters:\n do_resize (`Optional[bool]`, defaults to `True`):\n Whether to resize the image.\n size (`Optional[dict[str, int]]`, defaults to `{\'height\': 224, \'width\': 224}`):\n Describes the maximum input dimensions to the model.\n default_to_square (`Optional[bool]`, defaults to `True`):\n Whether to default to a square image when resizing, if size is an int.\n resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`, defaults to `2`):\n Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n has an effect if `do_resize` is set to `True`.\n do_center_crop (`Optional[bool]`, defaults to `None`):\n Whether to center crop the image.\n crop_size (`Optional[dict[str, int]]`, defaults to `None`):\n Size of the output image after applying `center_crop`.\n do_rescale (`Optional[bool]`, defaults to `True`):\n Whether to rescale the image.\n rescale_factor (`Union[int, float, NoneType]`, defaults to `0.00392156862745098`):\n Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n do_normalize (`Optional[bool]`, defaults to `True`):\n Whether to normalize the image.\n image_mean (`Union[float, list[float], NoneType]`, defaults to `[0.5, 0.5, 0.5]`):\n Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n image_std (`Union[float, list[float], NoneType]`, defaults to `[0.5, 0.5, 0.5]`):\n Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n `True`.\n do_convert_rgb (`Optional[bool]`, defaults to `None`):\n Whether to convert the image to RGB.\n return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`, defaults to `None`):\n Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n data_format (`Optional[~image_utils.ChannelDimension]`, defaults to `ChannelDimension.FIRST`):\n Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`, defaults to `None`):\n The channel dimension format for the input image. If unset, the channel dimension format is inferred\n from the input image. Can be one of:\n - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.\n device (`Optional[torch.device]`, defaults to `None`):\n The device to process the images on. If unset, the device is inferred from the input images.\n do_pan_and_scan (`Optional[bool]`, defaults to `None`):\n Whether to apply `pan_and_scan` to images.\n pan_and_scan_min_crop_size (`Optional[int]`, defaults to `None`):\n Minimum size of each crop in pan and scan.\n pan_and_scan_max_num_crops (`Optional[int]`, defaults to `None`):\n Maximum number of crops per image in pan and scan.\n pan_and_scan_min_ratio_to_activate (`Optional[float]`, defaults to `None`):\n Minimum aspect ratio to activate pan and scan.\n"""
GEMMA3_IMAGE_PROCESSOR_FAST_PREPROCESS_DOCSTRING = """ Args:\n images (`Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, list[\'PIL.Image.Image\'], list[numpy.ndarray], list[\'torch.Tensor\']]`):\n Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If\n passing in images with pixel values between 0 and 1, set `do_rescale=False`.\n do_resize (`Optional[bool]`):\n Whether to resize the image.\n size (`Optional[dict[str, int]]`):\n Describes the maximum input dimensions to the model.\n default_to_square (`Optional[bool]`):\n Whether to default to a square image when resizing, if size is an int.\n resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`):\n Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n has an effect if `do_resize` is set to `True`.\n do_center_crop (`Optional[bool]`):\n Whether to center crop the image.\n crop_size (`Optional[dict[str, int]]`):\n Size of the output image after applying `center_crop`.\n do_rescale (`Optional[bool]`):\n Whether to rescale the image.\n rescale_factor (`Union[int, float, NoneType]`):\n Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n do_normalize (`Optional[bool]`):\n Whether to normalize the image.\n image_mean (`Union[float, list[float], NoneType]`):\n Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n image_std (`Union[float, list[float], NoneType]`):\n Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n `True`.\n do_convert_rgb (`Optional[bool]`):\n Whether to convert the image to RGB.\n return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`):\n Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n data_format (`Optional[~image_utils.ChannelDimension]`):\n Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`):\n The channel dimension format for the input image. If unset, the channel dimension format is inferred\n from the input image. Can be one of:\n - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.\n device (`Optional[torch.device]`):\n The device to process the images on. If unset, the device is inferred from the input images.\n do_pan_and_scan (`Optional[bool]`):\n Whether to apply `pan_and_scan` to images.\n pan_and_scan_min_crop_size (`Optional[int]`):\n Minimum size of each crop in pan and scan.\n pan_and_scan_max_num_crops (`Optional[int]`):\n Maximum number of crops per image in pan and scan.\n pan_and_scan_min_ratio_to_activate (`Optional[float]`):\n Minimum aspect ratio to activate pan and scan.\n\n Returns:\n `<class \'transformers.image_processing_base.BatchFeature\'>`:\n - **data** (`dict`) -- Dictionary of lists/arrays/tensors returned by the __call__ method (\'pixel_values\', etc.).\n - **tensor_type** (`Union[None, str, TensorType]`, *optional*) -- You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at\n initialization.\n"""
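

# NOTE: the assertions below were removed temporarily when auto_docstring was rolled out; they are
# kept as comments so they can be re-enabled against the fixtures above.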
class AutoDocstringTest(unittest.TestCase):
pass
# def test_modeling_docstring(self):
# llama_docstring = " Args:\n images (`Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, list['PIL.Image.Image'], list[numpy.ndarray], list['torch.Tensor']]`):\n Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If\n passing in images with pixel values between 0 and 1, set `do_rescale=False`.\n do_resize (`Optional[bool]`):\n Whether to resize the image.\n size (`Optional[dict[str, int]]`):\n Describes the maximum input dimensions to the model.\n default_to_square (`Optional[bool]`):\n Whether to default to a square image when resizing, if size is an int.\n resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`):\n Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n has an effect if `do_resize` is set to `True`.\n do_center_crop (`Optional[bool]`):\n Whether to center crop the image.\n crop_size (`Optional[dict[str, int]]`):\n Size of the output image after applying `center_crop`.\n do_rescale (`Optional[bool]`):\n Whether to rescale the image.\n rescale_factor (`Union[int, float, NoneType]`):\n Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n do_normalize (`Optional[bool]`):\n Whether to normalize the image.\n image_mean (`Union[float, list[float], NoneType]`):\n Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n image_std (`Union[float, list[float], NoneType]`):\n Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n `True`.\n do_convert_rgb (`Optional[bool]`):\n Whether to convert the image to RGB.\n return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`):\n Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n data_format (`Optional[~image_utils.ChannelDimension]`):\n Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`):\n The channel dimension format for the input image. If unset, the channel dimension format is inferred\n from the input image. Can be one of:\n - `\"channels_first\"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n - `\"channels_last\"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n - `\"none\"` or `ChannelDimension.NONE`: image in (height, width) format.\n device (`Optional[torch.device]`):\n The device to process the images on. If unset, the device is inferred from the input images.\n do_pan_and_scan (`Optional[bool]`):\n Whether to apply `pan_and_scan` to images.\n pan_and_scan_min_crop_size (`Optional[int]`):\n Minimum size of each crop in pan and scan.\n pan_and_scan_max_num_crops (`Optional[int]`):\n Maximum number of crops per image in pan and scan.\n pan_and_scan_min_ratio_to_activate (`Optional[float]`):\n Minimum aspect ratio to activate pan and scan.\n\n Returns:\n `<class 'transformers.image_processing_base.BatchFeature'>`:\n - **data** (`dict`) -- Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).\n - **tensor_type** (`Union[None, str, TensorType]`, *optional*) -- You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at\n initialization.\n"
# self.assertEqual(llama_docstring, LlamaModel.__doc__)
# self.assertEqual(LLAMA_MODEL_DOCSTRING, LlamaModel.forward.__doc__)
# self.assertEqual(LLAMA_CLM_FORWARD, LlamaForCausalLM.forward.__doc__)
# self.assertEqual(LLAMA_DECODER, LlamaDecoderLayer.forward.__doc__)
# self.assertEqual(LLAMA_FOR_SEQUENCE_CLASSIFICATION_DOC, LlamaForSequenceClassification.forward.__doc__)
# def test_fast_image_processor_docstring(self):
# self.assertEqual(GEMMA3_IMAGE_PROCESSOR_FAST_DOCSTRING, Gemma3ImageProcessorFast.__doc__)
# self.assertEqual(GEMMA3_IMAGE_PROCESSOR_FAST_PREPROCESS_DOCSTRING, Gemma3ImageProcessorFast.preprocess.__doc__)
# def test_auto_doc(self):
# COOL_CLASS_DOC = """
# Args:
# input_ids (some):
# flash_attn_kwargs (FlashAttentionKwrargs):
# parameters that are completely optional and that should be passed.
# another_warg (something): should pass
# and_another_on (this time):
# I want
# this to be
# quite long
# Example
# ```python
# >>> import
# ```
# """
# @auto_docstring
# class MyModel:
# @auto_docstring
# def __init__(input_ids, flash_attn_kwargs=None, another_warg=True, and_another_on=1):
# r"""
# Args:
# flash_attn_kwargs (FlashAttentionKwrargs):
# parameters that are completely optional and that should be passed.
# another_warg (something): should pass
# and_another_on (this time):
# I want
# this to be
# quite long
# Example
# ```python
# >>> import
# ```
# """
# pass
# self.assertEqual(MyModel.__init__.__doc__, COOL_CLASS_DOC)
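

# A minimal, self-contained sanity check added as a sketch (it is not one of the original,
# temporarily disabled assertions above): it only verifies that the expected-docstring fixtures
# defined at module level are well formed, without importing any model class.
class AutoDocstringFixtureSanityTest(unittest.TestCase):
    def test_forward_fixtures_document_their_args(self):
        # Every forward-method fixture should carry an Args section that documents `input_ids`.
        for fixture in (LLAMA_CLM_FORWARD, LLAMA_MODEL_DOCSTRING, LLAMA_FOR_SEQUENCE_CLASSIFICATION_DOC):
            self.assertIn("Args:", fixture)
            self.assertIn("input_ids", fixture)

    def test_image_processor_fixtures_are_well_formed(self):
        # The fast image processor class docstring documents its parameters, and the `preprocess`
        # fixture additionally documents the returned `BatchFeature`.
        self.assertIn("Parameters:", GEMMA3_IMAGE_PROCESSOR_FAST_DOCSTRING)
        self.assertIn("Returns:", GEMMA3_IMAGE_PROCESSOR_FAST_PREPROCESS_DOCSTRING)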