Fix wrong input shapes in doc-string of models (#37729)

* Fix wrong position_ids shape in doc Supported by ClvpDecoder.forward, line 1212--1215: src/transformers/models/clvp/modeling_clvp.py: 1212 if inputs_embeds is None: 1213 inputs_embeds = self.input_embeds_layer(input_ids) 1214 position_embeds = self.position_embeds_layer(position_ids) 1215 inputs_embeds = inputs_embeds + position_embeds * Fix possibly wrong input_ids shape in doc Since 'input_ids_length' was mentioned immediately after the shape `(batch_size, sequence_length)`, it doesn't make sense to me for `input_ids` to have such shape---IMO it ought to have shape `(batch_size, input_ids_length)` instead. * Fix possibly wrong inputs_embeds shape in doc Supported by CTRLModel.forward, line 448--449: src/transformers/models/ctrl/modeling_ctrl.py: 448 if inputs_embeds is None: 449 inputs_embeds = self.w(input_ids) This commit is introduced due to commit 6f36b56497828642b65f54ea26aa4064186de57a. * Fix possibly wrong token_type_ids shape in doc Supported by CTRLModel.forward, line 441--460: src/transformers/models/ctrl/modeling_ctrl.py: 441 if token_type_ids is not None: 442 token_type_ids = token_type_ids.view(-1, input_shape[-1]) 443 token_type_embeds = self.w(token_type_ids) 444 token_type_embeds *= np.sqrt(self.d_model_size) 445 else: 446 token_type_embeds = 0 447 448 if inputs_embeds is None: 449 inputs_embeds = self.w(input_ids) 450 # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded 451 seq_len = input_shape[-1] 452 mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device) 453 454 inputs_embeds *= np.sqrt(self.d_model_size) 455 456 # `self.pos_encoding` won't be sent to the correct device along the model, so we do it manually. 457 self.pos_encoding = self.pos_encoding.to(device) 458 pos_embeds = self.pos_encoding[position_ids, :] 459 460 hidden_states = inputs_embeds + pos_embeds + token_type_embeds This commit is introduced due to commit 6f36b56497828642b65f54ea26aa4064186de57a. * Fix possibly wrong position_ids shape in doc Supported by CTRLModel.forward, line 448--460: src/transformers/models/ctrl/modeling_ctrl.py: 448 if inputs_embeds is None: 449 inputs_embeds = self.w(input_ids) 450 # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded 451 seq_len = input_shape[-1] 452 mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device) 453 454 inputs_embeds *= np.sqrt(self.d_model_size) 455 456 # `self.pos_encoding` won't be sent to the correct device along the model, so we do it manually. 457 self.pos_encoding = self.pos_encoding.to(device) 458 pos_embeds = self.pos_encoding[position_ids, :] 459 460 hidden_states = inputs_embeds + pos_embeds + token_type_embeds This commit is introduced due to commit 6f36b56497828642b65f54ea26aa4064186de57a. * Fix wrong token_type_ids shape in doc Supported by TFCTRLMainLayer.call, line 376--394: src/transformers/models/ctrl/modeling_tf_ctrl.py: 376 if token_type_ids is not None: 377 token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) 378 token_type_embeds = self.w(token_type_ids) 379 token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, dtype=token_type_embeds.dtype)) 380 else: 381 token_type_embeds = tf.constant(0.0) 382 position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) 383 384 if inputs_embeds is None: 385 check_embeddings_within_bounds(input_ids, self.w.input_dim) 386 inputs_embeds = self.w(input_ids) 387 seq_len = input_shape[-1] 388 mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) 389 390 inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype)) 391 392 pos_embeds = tf.gather(self.pos_encoding, position_ids) 393 pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype) 394 hidden_states = inputs_embeds + pos_embeds + token_type_embeds * Fix wrong position_ids shape in doc Supported by TFCTRLMainLayer.call, line 384--394: src/transformers/models/ctrl/modeling_tf_ctrl.py: 384 if inputs_embeds is None: 385 check_embeddings_within_bounds(input_ids, self.w.input_dim) 386 inputs_embeds = self.w(input_ids) 387 seq_len = input_shape[-1] 388 mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) 389 390 inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype)) 391 392 pos_embeds = tf.gather(self.pos_encoding, position_ids) 393 pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype) 394 hidden_states = inputs_embeds + pos_embeds + token_type_embeds * Fix wrong inputs_embeds shape in doc Supported by TFCTRLMainLayer.call, line 384--394: src/transformers/models/ctrl/modeling_tf_ctrl.py: 384 if inputs_embeds is None: 385 check_embeddings_within_bounds(input_ids, self.w.input_dim) 386 inputs_embeds = self.w(input_ids) 387 seq_len = input_shape[-1] 388 mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) 389 390 inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype)) 391 392 pos_embeds = tf.gather(self.pos_encoding, position_ids) 393 pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype) 394 hidden_states = inputs_embeds + pos_embeds + token_type_embeds * Fix wrong inputs_embeds shape in doc Supported by ClvpDecoder.forward, line 1212--1213: src/transformers/models/clvp/modeling_clvp.py: 1212 if inputs_embeds is None: 1213 inputs_embeds = self.input_embeds_layer(input_ids) * Fix wrong position_ids shape in doc Supported by FlaxGemmaPreTrainedModel.__call__, line 502--508: src/transformers/models/gemma/modeling_flax_gemma.py: 502 batch_size, sequence_length = input_ids.shape 503 504 if position_ids is None: 505 if past_key_values is not None: 506 raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") 507 508 position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) * Fix wrong position_ids shape in doc Supported by FlaxGPT2PreTrainedModel.__call__, line 482--488: src/transformers/models/gpt2/modeling_flax_gpt2.py: 482 batch_size, sequence_length = input_ids.shape 483 484 if position_ids is None: 485 if past_key_values is not None: 486 raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") 487 488 position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) * Fix wrong position_ids shape in doc Supported by GPT2Model.forward, line 918--921: src/transformers/models/gpt2/modeling_gpt2.py: 918 if inputs_embeds is None: 919 inputs_embeds = self.wte(input_ids) 920 position_embeds = self.wpe(position_ids) 921 hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device) * Fix wrong inputs_embeds shape in doc Supported by GPT2Model.forward, line 918--919: src/transformers/models/gpt2/modeling_gpt2.py: 918 if inputs_embeds is None: 919 inputs_embeds = self.wte(input_ids) * Fix wrong labels shape in doc Supported by GPT2LMHeadModel.forward, line 1156--1157: src/transformers/models/gpt2/modeling_gpt2.py: 1156 Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set 1157 `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` * Fix wrong labels shape in doc Supported by GPT2DoubleHeadsModel.forward, line 1314--1315: src/transformers/models/gpt2/modeling_gpt2.py: 1314 Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set 1315 `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to * Fix wrong token_type_ids shape in doc Supported by TFGPT2MainLayer.call, line 486--500: src/transformers/models/gpt2/modeling_tf_gpt2.py: 486 if inputs_embeds is None: 487 check_embeddings_within_bounds(input_ids, self.config.vocab_size) 488 inputs_embeds = self.wte(input_ids) 489 490 position_embeds = self.wpe(position_ids) 491 492 if token_type_ids is not None: 493 token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) 494 token_type_embeds = self.wte(token_type_ids) 495 else: 496 token_type_embeds = tf.constant(0.0) 497 498 position_embeds = tf.cast(position_embeds, dtype=inputs_embeds.dtype) 499 token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype) 500 hidden_states = inputs_embeds + position_embeds + token_type_embeds * Fix wrong position_ids shape in doc Supported by TFGPT2MainLayer.call, line 486--500: src/transformers/models/gpt2/modeling_tf_gpt2.py: 486 if inputs_embeds is None: 487 check_embeddings_within_bounds(input_ids, self.config.vocab_size) 488 inputs_embeds = self.wte(input_ids) 489 490 position_embeds = self.wpe(position_ids) 491 492 if token_type_ids is not None: 493 token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) 494 token_type_embeds = self.wte(token_type_ids) 495 else: 496 token_type_embeds = tf.constant(0.0) 497 498 position_embeds = tf.cast(position_embeds, dtype=inputs_embeds.dtype) 499 token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype) 500 hidden_states = inputs_embeds + position_embeds + token_type_embeds * Fix wrong inputs_embeds shape in doc Supported by TFGPT2MainLayer.call, line 486--488: src/transformers/models/gpt2/modeling_tf_gpt2.py: 486 if inputs_embeds is None: 487 check_embeddings_within_bounds(input_ids, self.config.vocab_size) 488 inputs_embeds = self.wte(input_ids) * Fix wrong position_ids shape in doc Supported by GPTBigCodeModel.forward, line 962--965: src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py: 962 if inputs_embeds is None: 963 inputs_embeds = self.wte(input_ids) 964 position_embeds = self.wpe(position_ids) 965 hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device) * Fix wrong inputs_embeds shape in doc Supported by GPTBigCodeModel.forward, line 962--963: src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py: 962 if inputs_embeds is None: 963 inputs_embeds = self.wte(input_ids) * Fix wrong labels shape in doc Supported by GPTBigCodeForCausalLM.forward, line 1158--1159: src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py: 1158 Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set 1159 `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` * Fix wrong position_ids shape in doc Supported by FlaxGPTNeoModule.__call__, line 549--552: src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py: 549 input_embeds = self.wte(input_ids.astype("i4")) 550 position_embeds = self.wpe(position_ids.astype("i4")) 551 552 hidden_states = input_embeds + position_embeds * Fix wrong position_ids shape in doc Supported by GPTNeoModel.forward, line 685--720: src/transformers/models/gpt_neo/modeling_gpt_neo.py: 685 if inputs_embeds is None: 686 inputs_embeds = self.wte(input_ids) 687 688 # kept for BC (non `Cache` `past_key_values` inputs) 689 return_legacy_cache = False 690 if use_cache and not isinstance(past_key_values, Cache): 691 return_legacy_cache = True 692 if past_key_values is None: 693 past_key_values = DynamicCache() 694 else: 695 past_key_values = DynamicCache.from_legacy_cache(past_key_values) 696 logger.warning_once( 697 "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " 698 "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " 699 "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" 700 ) 701 702 seq_length = inputs_embeds.shape[1] 703 if cache_position is None: 704 past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 705 cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device) 706 707 if position_ids is None: 708 position_ids = cache_position.unsqueeze(0) 709 710 causal_mask = self._update_causal_mask( 711 attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions 712 ) 713 714 # Prepare head mask if needed 715 # 1.0 in head_mask indicate we keep the head 716 # attention_probs has shape bsz x num_heads x N x N 717 # head_mask has shape n_layer x batch x num_heads x N x N 718 head_mask = self.get_head_mask(head_mask, self.config.num_layers) 719 position_embeds = self.wpe(position_ids) 720 hidden_states = inputs_embeds + position_embeds * Fix wrong inputs_embeds shape in doc Supported by GPTNeoModel.forward, line 685--686: src/transformers/models/gpt_neo/modeling_gpt_neo.py: 685 if inputs_embeds is None: 686 inputs_embeds = self.wte(input_ids) * Fix wrong labels shape in doc Supported by GPTNeoForCausalLM.forward, line 968--969: src/transformers/models/gpt_neo/modeling_gpt_neo.py: 968 Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set 969 `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` * Fix wrong position_ids shape in doc Supported by FlaxGPTJPreTrainedModel.__call__, line 455--461: src/transformers/models/gptj/modeling_flax_gptj.py: 455 batch_size, sequence_length = input_ids.shape 456 457 if position_ids is None: 458 if past_key_values is not None: 459 raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") 460 461 position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) * Fix wrong token_type_ids shape in doc Supported by TFGPTJMainLayer.call, line 482--493: src/transformers/models/gptj/modeling_tf_gptj.py: 482 if inputs_embeds is None: 483 check_embeddings_within_bounds(input_ids, self.wte.vocab_size) 484 inputs_embeds = self.wte(input_ids, mode="embedding") 485 486 if token_type_ids is not None: 487 token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) 488 token_type_embeds = self.wte(token_type_ids, mode="embedding") 489 else: 490 token_type_embeds = tf.constant(0.0) 491 492 token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype) 493 hidden_states = inputs_embeds + token_type_embeds * Fix wrong position_ids shape in doc Supported by TFGPTJMainLayer.call, line 434--449: src/transformers/models/gptj/modeling_tf_gptj.py: 434 elif input_ids is not None: 435 input_shape = shape_list(input_ids) 436 input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) 437 elif inputs_embeds is not None: 438 input_shape = shape_list(inputs_embeds)[:-1] 439 else: 440 raise ValueError("You have to specify either input_ids or inputs_embeds") 441 442 if past_key_values is None: 443 past_length = 0 444 past_key_values = [None] * len(self.h) 445 else: 446 past_length = shape_list(past_key_values[0][0])[-2] 447 448 if position_ids is None: 449 position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length), axis=0) * Fix wrong inputs_embeds shape in doc Supported by TFGPTJMainLayer.call, line 482--484: src/transformers/models/gptj/modeling_tf_gptj.py: 482 if inputs_embeds is None: 483 check_embeddings_within_bounds(input_ids, self.wte.vocab_size) 484 inputs_embeds = self.wte(input_ids, mode="embedding") * Fix wrong labels shape in doc Supported by TFGPTJForCausalLM.call, line 812--813: src/transformers/models/gptj/modeling_tf_gptj.py: 812 Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set 813 `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` * Fix possibly wrong input_ids shape in doc Since 'input_ids_length' was mentioned immediately after the shape `(batch_size, sequence_length)`, it doesn't make sense to me for `input_ids` to have such shape---IMO it ought to have shape `(batch_size, input_ids_length)` instead. * Fix possibly wrong token_type_ids shape in doc Supported by ImageGPTModel.forward, line 773--780: src/transformers/models/imagegpt/modeling_imagegpt.py: 773 if inputs_embeds is None: 774 inputs_embeds = self.wte(input_ids) 775 position_embeds = self.wpe(position_ids) 776 hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device) 777 778 if token_type_ids is not None: 779 token_type_embeds = self.wte(token_type_ids) 780 hidden_states = hidden_states + token_type_embeds This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3. * Fix possibly wrong position_ids shape in doc Supported by ImageGPTModel.forward, line 773--776: src/transformers/models/imagegpt/modeling_imagegpt.py: 773 if inputs_embeds is None: 774 inputs_embeds = self.wte(input_ids) 775 position_embeds = self.wpe(position_ids) 776 hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device) This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3. * Fix possibly wrong inputs_embeds shape in doc Supported by ImageGPTModel.forward, line 773--774: src/transformers/models/imagegpt/modeling_imagegpt.py: 773 if inputs_embeds is None: 774 inputs_embeds = self.wte(input_ids) This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3. * Fix possibly wrong labels shape in doc Supported by ImageGPTForCausalImageModeling.forward, line 923--924: src/transformers/models/imagegpt/modeling_imagegpt.py: 923 Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set 924 `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3. * Fix possibly wrong labels shape in doc Supported by ImageGPTModel.forward, line 665--666: src/transformers/models/imagegpt/modeling_imagegpt.py: 665 Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set 666 `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3. * Fix wrong position_ids shape in doc Supported by FlaxLlamaPreTrainedModel.__call__, line 484--490: src/transformers/models/llama/modeling_flax_llama.py: 484 batch_size, sequence_length = input_ids.shape 485 486 if position_ids is None: 487 if past_key_values is not None: 488 raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") 489 490 position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) * Fix wrong position_ids shape in doc Supported by FlaxMistralPreTrainedModel.__call__, line 478--484: src/transformers/models/mistral/modeling_flax_mistral.py: 478 batch_size, sequence_length = input_ids.shape 479 480 if position_ids is None: 481 if past_key_values is not None: 482 raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") 483 484 position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
2025-08-02 11:11:05 +06:00 · 2025-04-24 22:36:03 +08:00 · 2025-04-24 22:36:03 +08:00 · 22e3da92b7
commit 22e3da92b7
parent 4d64c38593
15 changed files with 38 additions and 38 deletions
--- a/src/transformers/models/clvp/modeling_clvp.py
+++ b/src/transformers/models/clvp/modeling_clvp.py
@ -935,7 +935,7 @@ CLVP_DECODER_INPUTS_DOCSTRING = r"""
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

@ -946,7 +946,7 @@ CLVP_DECODER_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
--- a/src/transformers/models/ctrl/modeling_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_ctrl.py
@ -249,7 +249,7 @@ CTRL_START_DOCSTRING = r"""

 CTRL_INPUTS_DOCSTRING = r"""
    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

@ -271,7 +271,7 @@ CTRL_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

@ -279,7 +279,7 @@ CTRL_INPUTS_DOCSTRING = r"""
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

@ -290,7 +290,7 @@ CTRL_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
--- a/src/transformers/models/ctrl/modeling_tf_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py
@ -533,7 +533,7 @@ CTRL_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

@ -541,7 +541,7 @@ CTRL_INPUTS_DOCSTRING = r"""
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

@ -552,7 +552,7 @@ CTRL_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
--- a/src/transformers/models/gemma/modeling_flax_gemma.py
+++ b/src/transformers/models/gemma/modeling_flax_gemma.py
@ -103,7 +103,7 @@ GEMMA_INPUTS_DOCSTRING = r"""

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
-        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

--- a/src/transformers/models/gpt2/modeling_flax_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_flax_gpt2.py
@ -90,7 +90,7 @@ GPT2_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
-        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@ -710,7 +710,7 @@ GPT2_INPUTS_DOCSTRING = r"""
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

@ -721,7 +721,7 @@ GPT2_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
@ -1297,7 +1297,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel, GenerationMixin):
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
@ -1443,7 +1443,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel, GenerationMixin):
        mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
            Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -
            1]`.
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
            `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@ -705,7 +705,7 @@ GPT2_INPUTS_DOCSTRING = r"""
            `len(past_key_values) + len(input_ids)`

            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

@ -713,7 +713,7 @@ GPT2_INPUTS_DOCSTRING = r"""
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

@ -724,7 +724,7 @@ GPT2_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
--- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
+++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
@ -751,7 +751,7 @@ GPT_BIGCODE_INPUTS_DOCSTRING = r"""
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

@ -762,7 +762,7 @@ GPT_BIGCODE_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`torch.Tensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
@ -1154,7 +1154,7 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel, GenerationMixin):
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
-        labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+        labels (`torch.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
--- a/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
@ -88,7 +88,7 @@ GPT_NEO_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
-        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@ -583,7 +583,7 @@ GPT_NEO_INPUTS_DOCSTRING = r"""
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

@ -594,7 +594,7 @@ GPT_NEO_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
@ -964,7 +964,7 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel, GenerationMixin):
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
--- a/src/transformers/models/gptj/modeling_flax_gptj.py
+++ b/src/transformers/models/gptj/modeling_flax_gptj.py
@ -89,7 +89,7 @@ GPTJ_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
-        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
--- a/src/transformers/models/gptj/modeling_tf_gptj.py
+++ b/src/transformers/models/gptj/modeling_tf_gptj.py
@ -635,7 +635,7 @@ GPTJ_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

@ -643,7 +643,7 @@ GPTJ_INPUTS_DOCSTRING = r"""
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

@ -654,7 +654,7 @@ GPTJ_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
@ -808,7 +808,7 @@ class TFGPTJForCausalLM(TFGPTJPreTrainedModel, TFCausalLanguageModelingLoss):
        training: Optional[bool] = False,
    ) -> Union[TFCausalLMOutputWithPast, Tuple[tf.Tensor]]:
        r"""
-        labels (`np.ndarray` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+        labels (`np.ndarray` or `tf.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
--- a/src/transformers/models/imagegpt/modeling_imagegpt.py
+++ b/src/transformers/models/imagegpt/modeling_imagegpt.py
@ -543,7 +543,7 @@ IMAGEGPT_START_DOCSTRING = r"""

 IMAGEGPT_INPUTS_DOCSTRING = r"""
    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.
@ -564,7 +564,7 @@ IMAGEGPT_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

@ -572,7 +572,7 @@ IMAGEGPT_INPUTS_DOCSTRING = r"""
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

@ -583,7 +583,7 @@ IMAGEGPT_INPUTS_DOCSTRING = r"""
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
@ -661,7 +661,7 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
        **kwargs: Any,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
@ -919,7 +919,7 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin):
        **kwargs: Any,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@ -109,7 +109,7 @@ LLAMA_INPUTS_DOCSTRING = r"""

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
-        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

--- a/src/transformers/models/mistral/modeling_flax_mistral.py
+++ b/src/transformers/models/mistral/modeling_flax_mistral.py
@ -108,7 +108,7 @@ MISTRAL_INPUTS_DOCSTRING = r"""

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
-        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.