From da4ab2a1b66e2367f94ea34438d344dd53e2d66e Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 25 Feb 2025 11:09:01 +0100 Subject: [PATCH] Fix doc formatting in forward passes & modular (#36243) * fix indentation issues + modular without magic keyword * style * Update doc.py * style * Fix all decorators indentation * all models * style * style * Update doc.py * fix * general fix * style --- .../modeling_new_task_model.py | 1 - src/transformers/models/aria/modeling_aria.py | 2 -- src/transformers/models/aria/modular_aria.py | 1 - .../models/bamba/modeling_bamba.py | 1 - .../models/bamba/modular_bamba.py | 1 - .../models/chameleon/modeling_chameleon.py | 1 - .../models/cohere/modeling_cohere.py | 1 - .../models/cohere/modular_cohere.py | 1 - .../models/cohere2/modeling_cohere2.py | 1 - src/transformers/models/dbrx/modeling_dbrx.py | 4 +-- .../open_llama/modeling_open_llama.py | 1 - .../models/diffllama/modeling_diffllama.py | 1 - src/transformers/models/emu3/modeling_emu3.py | 2 -- src/transformers/models/emu3/modular_emu3.py | 2 -- .../models/gemma/modeling_gemma.py | 1 - .../models/gemma/modular_gemma.py | 1 - .../models/gemma2/modeling_gemma2.py | 5 ++- .../models/gemma2/modular_gemma2.py | 22 ++++++++++-- src/transformers/models/glm/modeling_glm.py | 1 - .../models/got_ocr2/modeling_got_ocr2.py | 1 - .../models/got_ocr2/modular_got_ocr2.py | 1 - .../models/granite/modeling_granite.py | 1 - .../models/granitemoe/modeling_granitemoe.py | 1 - .../modeling_granitemoeshared.py | 1 - .../models/helium/modeling_helium.py | 1 - .../models/idefics/modeling_idefics.py | 1 - .../models/idefics/modeling_tf_idefics.py | 1 - .../models/idefics2/modeling_idefics2.py | 1 - .../models/idefics3/modeling_idefics3.py | 1 - .../models/jamba/modeling_jamba.py | 1 - .../models/jetmoe/modeling_jetmoe.py | 1 - .../models/llama/modeling_llama.py | 1 - .../models/llava/modeling_llava.py | 1 - .../models/llava_next/modeling_llava_next.py | 1 - .../modeling_llava_next_video.py | 1 - .../modular_llava_next_video.py | 1 - .../modeling_llava_onevision.py | 1 - .../models/mistral/modeling_mistral.py | 1 - .../models/mistral/modeling_tf_mistral.py | 18 +++++----- .../models/mixtral/modeling_mixtral.py | 1 - .../models/mixtral/modular_mixtral.py | 1 - .../models/mllama/modeling_mllama.py | 2 -- .../models/moshi/modeling_moshi.py | 1 - .../models/nemotron/modeling_nemotron.py | 1 - src/transformers/models/olmo/modeling_olmo.py | 1 - .../models/olmo2/modeling_olmo2.py | 1 - .../models/olmoe/modeling_olmoe.py | 1 - .../models/paligemma/modeling_paligemma.py | 1 - .../models/persimmon/modeling_persimmon.py | 1 - src/transformers/models/phi/modeling_phi.py | 1 - src/transformers/models/phi3/modeling_phi3.py | 1 - .../models/phimoe/modeling_phimoe.py | 1 - .../models/qwen2/modeling_qwen2.py | 1 - .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 1 - .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 1 - .../qwen2_audio/modeling_qwen2_audio.py | 1 - .../models/qwen2_moe/modeling_qwen2_moe.py | 1 - .../models/qwen2_vl/modeling_qwen2_vl.py | 1 - .../modeling_recurrent_gemma.py | 1 - .../models/stablelm/modeling_stablelm.py | 1 - .../models/starcoder2/modeling_starcoder2.py | 1 - .../video_llava/modeling_video_llava.py | 1 - .../models/vipllava/modeling_vipllava.py | 1 - .../models/zamba/modeling_zamba.py | 1 - .../models/zamba2/modeling_zamba2.py | 1 - src/transformers/utils/doc.py | 36 ++++++++++++++++--- utils/modular_model_converter.py | 23 ++++++++++-- 67 files changed, 83 insertions(+), 90 deletions(-) diff --git 
a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index a01094737cd..ea2e1a2b9a1 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -349,7 +349,6 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin): num_logits_to_keep: int = 0, ) -> Union[Tuple, NewTaskModelCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 90755045b63..d08ecfab7e6 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -1193,7 +1193,6 @@ class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored @@ -1458,7 +1457,6 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): **loss_kwargs, ) -> Union[Tuple, AriaCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`). diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 7d579d6e37f..c62c074218d 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -1437,7 +1437,6 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): **loss_kwargs, ) -> Union[Tuple, AriaCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`). diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py index 6fdce41e5a6..95f9b1a0a7a 100644 --- a/src/transformers/models/bamba/modeling_bamba.py +++ b/src/transformers/models/bamba/modeling_bamba.py @@ -1495,7 +1495,6 @@ class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin): **kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/bamba/modular_bamba.py b/src/transformers/models/bamba/modular_bamba.py index 3972d25b51b..755552036fc 100644 --- a/src/transformers/models/bamba/modular_bamba.py +++ b/src/transformers/models/bamba/modular_bamba.py @@ -1205,7 +1205,6 @@ class BambaForCausalLM(LlamaForCausalLM): **kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 538a9aaf7fc..ecc954f5e6e 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1554,7 +1554,6 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 69e7c579f9c..12b1d740bd7 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -833,7 +833,6 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/cohere/modular_cohere.py b/src/transformers/models/cohere/modular_cohere.py index 17eb3f6a343..a39489a346d 100644 --- a/src/transformers/models/cohere/modular_cohere.py +++ b/src/transformers/models/cohere/modular_cohere.py @@ -321,7 +321,6 @@ class CohereForCausalLM(LlamaForCausalLM): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/cohere2/modeling_cohere2.py b/src/transformers/models/cohere2/modeling_cohere2.py index 75144c65ecf..93763b2cab3 100644 --- a/src/transformers/models/cohere2/modeling_cohere2.py +++ b/src/transformers/models/cohere2/modeling_cohere2.py @@ -834,7 +834,6 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index fceefbe2c75..71484691c25 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1283,9 +1283,7 @@ class DbrxForCausalLM(DbrxPreTrainedModel, GenerationMixin): logits_to_keep: Union[int, torch.Tensor] = 0, **kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: - r"""Forward function for causal language modeling. - - Args: + r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py index b6043fde047..2a74517d9f1 100644 --- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -716,7 +716,6 @@ class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel): return_dict: Optional[bool] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py index 16aeefcb1c8..4182e1a2032 100644 --- a/src/transformers/models/diffllama/modeling_diffllama.py +++ b/src/transformers/models/diffllama/modeling_diffllama.py @@ -1070,7 +1070,6 @@ class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index b3331cf1293..a1e74030672 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1650,7 +1650,6 @@ class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored @@ -1878,7 +1877,6 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): logits_to_keep: Union[int, torch.Tensor] = 0, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py index cdb4ee5d6fa..792d4826812 100644 --- a/src/transformers/models/emu3/modular_emu3.py +++ b/src/transformers/models/emu3/modular_emu3.py @@ -1077,7 +1077,6 @@ class Emu3ForCausalLM(LlamaForCausalLM, Emu3PreTrainedModel, GenerationMixin): @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class="Emu3TextConfig") def forward(**super_kwargs): r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored @@ -1186,7 +1185,6 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): logits_to_keep: Union[int, torch.Tensor] = 0, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index aeb742e16dd..afef380494d 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -803,7 +803,6 @@ class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index dc8ced15f96..564576be760 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -483,7 +483,6 @@ class GemmaModel(LlamaModel): class GemmaForCausalLM(LlamaForCausalLM): def forward(**super_kwargs): r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index c977f873dc8..6ac249bfce0 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -841,7 +841,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin): **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored
@@ -859,9 +858,9 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, GemmaForCausalLM
+        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM
 
-        >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b")
+        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
         >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
 
         >>> prompt = "What is your favorite condiment?"
diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py
index 805e6ba0d2a..cd3ae3ed0ef 100644
--- a/src/transformers/models/gemma2/modular_gemma2.py
+++ b/src/transformers/models/gemma2/modular_gemma2.py
@@ -591,10 +591,26 @@ class Gemma2ForCausalLM(GemmaForCausalLM):
         **loss_kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
-        ```python
-        >>> from transformers import AutoTokenizer, GemmaForCausalLM
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
 
-        >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b")
+        logits_to_keep (`int` or `torch.Tensor`, *optional*):
+            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+            This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM
+
+        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
         >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
 
         >>> prompt = "What is your favorite condiment?"
diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py
index f1fddcda107..858c03ec214 100644
--- a/src/transformers/models/glm/modeling_glm.py
+++ b/src/transformers/models/glm/modeling_glm.py
@@ -812,7 +812,6 @@ class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin):
         **kwargs: Unpack[KwargsForCausalLM],
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
-        Args:
             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                 config.vocab_size]` or -100 (see `input_ids` docstring).
Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index 86598ac0896..7fbb0d39ef4 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -769,7 +769,6 @@ class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin): logits_to_keep: Union[int, torch.Tensor] = 0, ) -> Union[Tuple, GotOcr2CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py index fff434ead2e..e8b0d770d3b 100644 --- a/src/transformers/models/got_ocr2/modular_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py @@ -848,7 +848,6 @@ class GotOcr2ForConditionalGeneration(LlavaForConditionalGeneration): logits_to_keep: Union[int, torch.Tensor] = 0, ) -> Union[Tuple, LlavaCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index d0579bb8a7a..2a553e04cc0 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -815,7 +815,6 @@ class GraniteForCausalLM(GranitePreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index 546e78eac14..4a0ab379ffb 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -1287,7 +1287,6 @@ class GraniteMoeForCausalLM(GraniteMoePreTrainedModel, GenerationMixin): **kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 9f6488b9c20..7d4336a9dc9 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -1313,7 +1313,6 @@ class GraniteMoeSharedForCausalLM(GraniteMoeSharedPreTrainedModel, GenerationMix **kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py index fc6f862be25..d2a081efe77 100644 --- a/src/transformers/models/helium/modeling_helium.py +++ b/src/transformers/models/helium/modeling_helium.py @@ -799,7 +799,6 @@ class HeliumForCausalLM(HeliumPreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 7a476a13103..4cd4ced761b 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1559,7 +1559,6 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin): cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, IdeficsCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/idefics/modeling_tf_idefics.py b/src/transformers/models/idefics/modeling_tf_idefics.py index f412d28aa80..8ca859f8cf3 100644 --- a/src/transformers/models/idefics/modeling_tf_idefics.py +++ b/src/transformers/models/idefics/modeling_tf_idefics.py @@ -1687,7 +1687,6 @@ class TFIdeficsForVisionText2Text(TFPreTrainedModel, TFCausalLanguageModelingLos training=False, ) -> Union[TFIdeficsCausalLMOutputWithPast, Tuple[tf.Tensor]]: r""" - Args: labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 2c44f998574..872ba10a417 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1537,7 +1537,6 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin) logits_to_keep: Union[int, torch.Tensor] = 0, ) -> Union[Tuple, Idefics2CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics2ForConditionalGeneration`). diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index 251e11067ff..7e9e33c218a 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -1121,7 +1121,6 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin) logits_to_keep: Union[int, torch.Tensor] = 0, ) -> Union[Tuple, Idefics3CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`). diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index fa95b126883..2e11d2f7be4 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -1456,7 +1456,6 @@ class JambaForCausalLM(JambaPreTrainedModel, GenerationMixin): **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 283174ba3cf..9cb66828d55 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -1299,7 +1299,6 @@ class JetMoeForCausalLM(JetMoePreTrainedModel, GenerationMixin): **kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 0d65e1417f5..2a911702577 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -801,7 +801,6 @@ class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 610ab417d92..b20ecf2ca93 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -348,7 +348,6 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin): **lm_kwargs, ) -> Union[Tuple, LlavaCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 3cdf1b34840..1b0b4b93c83 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -561,7 +561,6 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi **lm_kwargs, ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 9ce88c54123..6d86d9c4d42 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -601,7 +601,6 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene **lm_kwargs, ) -> Union[Tuple, LlavaNextVideoCausalLMOutputWithPast]: r""" - Args: pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, image_size, image_size)): The tensors corresponding to the input videos. Pixel values can be obtained using [`AutoImageProcessor`]. See [`LlavaNextVideoVideoProcessor.__call__`] for details. 
[`LlavaProcessor`] uses diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 8769f8db413..804f6f58355 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -360,7 +360,6 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): **lm_kwargs, ) -> Union[Tuple, LlavaNextVideoCausalLMOutputWithPast]: r""" - Args: pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, image_size, image_size)): The tensors corresponding to the input videos. Pixel values can be obtained using [`AutoImageProcessor`]. See [`LlavaNextVideoVideoProcessor.__call__`] for details. [`LlavaProcessor`] uses diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index f213b8cb85d..5d41f8489e9 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -623,7 +623,6 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene **lm_kwargs, ) -> Union[Tuple, LlavaOnevisionCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index b300c7c646f..50500fa5857 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -802,7 +802,6 @@ class MistralForCausalLM(MistralPreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/mistral/modeling_tf_mistral.py b/src/transformers/models/mistral/modeling_tf_mistral.py index 35e42fc0b24..53ae7cec7e4 100644 --- a/src/transformers/models/mistral/modeling_tf_mistral.py +++ b/src/transformers/models/mistral/modeling_tf_mistral.py @@ -849,11 +849,10 @@ class TFMistralForCausalLM(TFMistralPreTrainedModel, TFCausalLanguageModelingLos return_dict: Optional[bool] = None, ) -> Union[Tuple, TFCausalLMOutputWithPast]: r""" - Args: - labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` - or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` + or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) @@ -975,11 +974,10 @@ class TFMistralForSequenceClassification(TFMistralPreTrainedModel, TFSequenceCla return_dict: Optional[bool] = None, ) -> Union[Tuple, TFSequenceClassifierOutputWithPast]: r""" - Args: - labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ transformer_outputs = self.model( diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 170d54eca1b..367ba34b80f 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1022,7 +1022,6 @@ class MixtralForCausalLM(MixtralPreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/mixtral/modular_mixtral.py b/src/transformers/models/mixtral/modular_mixtral.py index 7890400934c..b32a8d7987b 100644 --- a/src/transformers/models/mixtral/modular_mixtral.py +++ b/src/transformers/models/mixtral/modular_mixtral.py @@ -480,7 +480,6 @@ class MixtralForCausalLM(MistralForCausalLM): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 4a705083f3b..923e6243487 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1901,7 +1901,6 @@ class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin): **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored @@ -2048,7 +2047,6 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin): logits_to_keep: Union[int, torch.Tensor] = 0, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index d1f4f8a9cab..f7d712cd8b9 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -1813,7 +1813,6 @@ class MoshiForCausalLM(MoshiPreTrainedModel, GenerationMixin): **kwargs, ) -> Union[Tuple, MoshiCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 829a3283d0a..7c39af637c7 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -1047,7 +1047,6 @@ class NemotronForCausalLM(NemotronPreTrainedModel, GenerationMixin): **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 6b7abaa96af..f48cf3d89db 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -777,7 +777,6 @@ class OlmoForCausalLM(OlmoPreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/olmo2/modeling_olmo2.py b/src/transformers/models/olmo2/modeling_olmo2.py index 89ef5e1050b..b26f55626e2 100644 --- a/src/transformers/models/olmo2/modeling_olmo2.py +++ b/src/transformers/models/olmo2/modeling_olmo2.py @@ -778,7 +778,6 @@ class Olmo2ForCausalLM(Olmo2PreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index 9b0336a32b1..ae830dc5a55 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -1206,7 +1206,6 @@ class OlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin): **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index 35ad047a00d..2bf456047d9 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -438,7 +438,6 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi **lm_kwargs, ) -> Union[Tuple, PaliGemmaCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 9c589036815..7ccd4c2ba06 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -852,7 +852,6 @@ class PersimmonForCausalLM(PersimmonPreTrainedModel, GenerationMixin): **kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 33d86999fdf..448f2126050 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -775,7 +775,6 @@ class PhiForCausalLM(PhiPreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index a26e3a97937..c140af1f3e4 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -877,7 +877,6 @@ class Phi3ForCausalLM(Phi3PreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index 1cea9a2ea28..3f17690d6a4 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -1388,7 +1388,6 @@ class PhimoeForCausalLM(PhimoePreTrainedModel, GenerationMixin): **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index bf135a46c8d..4ae09fbb70d 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -815,7 +815,6 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index e392634e597..ef610b22512 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -1742,7 +1742,6 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi second_per_grid_ts: Optional[torch.Tensor] = None, ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 2d8695b5a40..d12a59926db 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -608,7 +608,6 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration): second_per_grid_ts: Optional[torch.Tensor] = None, ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index 320d2093133..a6c87e9950e 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -1112,7 +1112,6 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi return_dict: Optional[bool] = None, ) -> Union[Tuple, Qwen2AudioCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 3e4aa05a22b..9b3308914db 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1272,7 +1272,6 @@ class Qwen2MoeForCausalLM(Qwen2MoePreTrainedModel, GenerationMixin): **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index fd494f04782..9648de7298a 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1619,7 +1619,6 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin): cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index e2014079f93..4ae3de5a831 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -821,7 +821,6 @@ class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel, GenerationMixin): **kwargs, ) -> Union[Tuple, CausalLMOutput]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index c401b772db7..042c950b094 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -1109,7 +1109,6 @@ class StableLmForCausalLM(StableLmPreTrainedModel, GenerationMixin): **kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index d64953d72b6..e55e855b698 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -798,7 +798,6 @@ class Starcoder2ForCausalLM(Starcoder2PreTrainedModel, GenerationMixin): **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index ba4de653744..19170049b63 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -383,7 +383,6 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi **lm_kwargs, ) -> Union[Tuple, VideoLlavaCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index ef4b3bff395..6216ef88dac 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -323,7 +323,6 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin) **lm_kwargs, ) -> Union[Tuple, VipLlavaCausalLMOutputWithPast]: r""" - Args: labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored
diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py
index 54f57971a82..7a728f4f9b1 100644
--- a/src/transformers/models/zamba/modeling_zamba.py
+++ b/src/transformers/models/zamba/modeling_zamba.py
@@ -1228,7 +1228,6 @@ class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
         **loss_kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
-        Args:
             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py
index 8a5642e0f5d..ab03da6a1a8 100644
--- a/src/transformers/models/zamba2/modeling_zamba2.py
+++ b/src/transformers/models/zamba2/modeling_zamba2.py
@@ -1665,7 +1665,6 @@ class Zamba2ForCausalLM(Zamba2PreTrainedModel, GenerationMixin):
         **loss_kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
-        Args:
             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py
index 7ca1c134714..f01ffc28442 100644
--- a/src/transformers/utils/doc.py
+++ b/src/transformers/utils/doc.py
@@ -16,10 +16,23 @@ Doc utilities: Utilities related to documentation
 """
 
 import functools
+import inspect
 import re
+import textwrap
 import types
 
 
+def get_docstring_indentation_level(func):
+    """Return the indentation level of the start of the docstring of a class or function (or method)."""
+    # We assume classes are always defined in the global scope
+    if inspect.isclass(func):
+        return 4
+    source = inspect.getsource(func)
+    first_line = source.splitlines()[0]
+    function_def_level = len(first_line) - len(first_line.lstrip())
+    return 4 + function_def_level
+
+
 def add_start_docstrings(*docstr):
     def docstring_decorator(fn):
         fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
@@ -30,10 +43,8 @@ def add_start_docstrings(*docstr):
 
 def add_start_docstrings_to_model_forward(*docstr):
     def docstring_decorator(fn):
-        docstring = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
         class_name = f"[`{fn.__qualname__.split('.')[0]}`]"
-        intro = f"    The {class_name} forward method, overrides the `__call__` special method."
-        note = r"""
+        intro = rf"""
+    The {class_name} forward method, overrides the `__call__` special method.
 
     <Tip>
@@ -44,7 +55,23 @@
 
     """
 
-        fn.__doc__ = intro + note + docstring
+        correct_indentation = get_docstring_indentation_level(fn)
+        current_doc = fn.__doc__ if fn.__doc__ is not None else ""
+        try:
+            first_non_empty = next(line for line in current_doc.splitlines() if line.strip() != "")
+            doc_indentation = len(first_non_empty) - len(first_non_empty.lstrip())
+        except StopIteration:
+            doc_indentation = correct_indentation
+
+        docs = docstr
+        # In this case, the correct indentation level (class method, 2 Python levels) was respected, and we should
+        # correctly reindent everything. Otherwise, the doc uses a single indentation level
+        if doc_indentation == 4 + correct_indentation:
+            docs = [textwrap.indent(textwrap.dedent(doc), " " * correct_indentation) for doc in docstr]
+            intro = textwrap.indent(textwrap.dedent(intro), " " * correct_indentation)
+
+        docstring = "".join(docs) + current_doc
+        fn.__doc__ = intro + docstring
         return fn
 
     return docstring_decorator
@@ -1153,6 +1180,7 @@ def add_code_sample_docstrings(
         built_doc = built_doc.replace(
            f'from_pretrained("{checkpoint}")', f'from_pretrained("{checkpoint}", revision="{revision}")'
        )
+
    fn.__doc__ = func_doc + output_doc + built_doc
    return fn
diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py
index 6c85a1e7a1c..728c5628b1f 100644
--- a/utils/modular_model_converter.py
+++ b/utils/modular_model_converter.py
@@ -253,10 +253,29 @@ def get_docstring_indent(docstring):
     return 0
 
 
+def is_full_docstring(new_docstring: str) -> bool:
+    """Check if `new_docstring` is a full docstring, or if it is only part of a docstring that should then
+    be merged with the existing old one.
+    """
+    # libcst returns the docstrings with literal `r"""` quotes in front
+    new_docstring = new_docstring.split('"""', 1)[1]
+    # The docstring contains an Args definition, so it is self-contained
+    if re.search(r"\n\s*Args:\n", new_docstring):
+        return True
+    # If it contains Returns, but starts with text indented with an additional 4 spaces before, it is self-contained
+    # (this is the scenario when using `@add_start_docstrings_to_model_forward`, but adding more args to the docstring)
+    match_object = re.search(r"\n([^\S\n]*)Returns:\n", new_docstring)
+    if match_object is not None:
+        full_indent = match_object.group(1)
+        striped_doc = new_docstring.strip("\n")
+        if striped_doc.startswith(full_indent + " " * 4) or striped_doc.startswith(full_indent + "\t"):
+            return True
+    return False
+
+
 def merge_docstrings(original_docstring, updated_docstring):
-    # indent_level = get_docstring_indent(updated_docstring)
     original_level = get_docstring_indent(original_docstring)
-    if not re.findall(r"\n\s*Args:\n", updated_docstring):
+    if not is_full_docstring(updated_docstring):
         # Split the docstring at the example section, assuming `"""` is used to define the docstring
         parts = original_docstring.split("```")
         if "```" in updated_docstring and len(parts) > 1:
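
For illustration only (this snippet is not part of the patch): a minimal, standalone sketch of what the new `get_docstring_indentation_level` helper in `src/transformers/utils/doc.py` computes, and of the indentation check performed by the reworked `add_start_docstrings_to_model_forward` decorator. The helper body follows the diff above; `ToyModel` and the printed values are hypothetical and only illustrate the two cases distinguished by the comment in the diff.

```python
import inspect


# Helper added to src/transformers/utils/doc.py in this patch.
def get_docstring_indentation_level(func):
    """Return the indentation level of the start of the docstring of a class or function (or method)."""
    # We assume classes are always defined in the global scope
    if inspect.isclass(func):
        return 4
    source = inspect.getsource(func)
    first_line = source.splitlines()[0]
    function_def_level = len(first_line) - len(first_line.lstrip())
    return 4 + function_def_level


class ToyModel:
    # Stand-in for a model class; only the docstring layout matters here.
    def forward(self, input_ids, labels=None):
        r"""
        labels (`int`, *optional*):
            A dummy argument, documented with a single indentation level.
        """
        return input_ids


# For a method defined one level deep, the expected docstring indentation is
# 4 (the `def` inside the class body) + 4 (the docstring body) = 8.
correct_indentation = get_docstring_indentation_level(ToyModel.forward)
print(correct_indentation)  # 8

# The decorator then measures the indentation the docstring actually uses ...
doc = ToyModel.forward.__doc__
first_non_empty = next(line for line in doc.splitlines() if line.strip() != "")
doc_indentation = len(first_non_empty) - len(first_non_empty.lstrip())
print(doc_indentation)  # 8

# ... and only re-indents the injected blocks when the docstring was written with
# the full two-level indentation (doc_indentation == 4 + correct_indentation, i.e. 12 here).
print(doc_indentation == 4 + correct_indentation)  # False -> docstring left as-is
```

In the model files touched above, the argument descriptions keep the indentation they had under the removed `Args:` line (e.g. `labels` at 12 spaces inside a forward method), so the check is true there and the decorator re-indents the injected intro and argument blocks with `textwrap`.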
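
Also for illustration only (not part of the patch): a standalone sketch of the `is_full_docstring` heuristic added to `utils/modular_model_converter.py`, applied to two made-up docstrings. The function body follows the diff above; the `full` and `partial` strings are hypothetical examples.

```python
import re


# Heuristic added to utils/modular_model_converter.py in this patch.
def is_full_docstring(new_docstring: str) -> bool:
    """Check if `new_docstring` is a full docstring, or only part of one that should be merged."""
    # libcst returns the docstrings with a literal `r"""` prefix in front
    new_docstring = new_docstring.split('"""', 1)[1]
    # The docstring contains an Args definition, so it is self-contained
    if re.search(r"\n\s*Args:\n", new_docstring):
        return True
    # If it contains Returns, but starts with text indented with an additional 4 spaces, it is self-contained
    match_object = re.search(r"\n([^\S\n]*)Returns:\n", new_docstring)
    if match_object is not None:
        full_indent = match_object.group(1)
        striped_doc = new_docstring.strip("\n")
        if striped_doc.startswith(full_indent + " " * 4) or striped_doc.startswith(full_indent + "\t"):
            return True
    return False


# A modular docstring that documents everything itself (it contains "Args:") is self-contained ...
full = '''r"""
        Args:
            labels (`torch.LongTensor`):
                Dummy description.
        """'''
print(is_full_docstring(full))  # True

# ... while one that only adds a couple of argument descriptions (no "Args:" / "Returns:")
# is treated as partial and merged into the docstring inherited from the parent model.
partial = '''r"""
        labels (`torch.LongTensor`):
            Dummy description.
        """'''
print(is_full_docstring(partial))  # False
```

A self-contained docstring replaces the inherited one wholesale, while a partial one is routed through `merge_docstrings` and spliced into the docstring inherited from the parent class, which is what lets modular files add extra argument descriptions without the `Args:` magic keyword mentioned in the commit message.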