Docs: formatting nits (#32247)

* doc formatting nits

* ignore non-autodocs

* Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* make fixup

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Joao Gante, 2024-07-30 15:49:14 +01:00 (committed by GitHub)
parent 2fbbcf5007
commit e68ec18ce2
88 changed files with 172 additions and 170 deletions
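
For orientation, the hunks below all enforce the same docstring argument convention used by the library's autodoc-rendered signatures: optional arguments are marked with *optional* (asterisks, not backticks), type names sit inside matched backticks, and defaults are spelled out. A minimal sketch of that convention, using a hypothetical function and made-up argument names (nothing here is taken from this commit), looks like this:

def resize_image(image, size, antialias=True):
    """
    Resize an image to a target size.

    Args:
        image (`np.ndarray`):
            The image to resize.
        size (`Tuple[int, int]`):
            Target size as (height, width).
        antialias (`bool`, *optional*, defaults to `True`):
            Whether to smooth the image when downscaling.
    """
    raise NotImplementedError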


@ -147,7 +147,7 @@ def get_original_command(max_width=80, full_python_path=False):
Return the original command line string that can be replayed nicely and wrapped for 80 char width.
Args:
max_width (`int`, `optional`, defaults to 80):
max_width (`int`, *optional*, defaults to 80):
The width to wrap for.
full_python_path (`bool`, `optional`, defaults to `False`):
Whether to replicate the full path or just the last segment (i.e. `python`).


@ -113,7 +113,7 @@ class Problem:
The inputs that will be fed to the tools. For this testing environment, only strings are accepted as
values. Pass along a dictionary when you want to specify the values of each inputs, or just the list of
inputs expected (the value used will be `<<input_name>>` in this case).
answer (`str` or `list[str`]):
answer (`str` or `list[str]`):
The theoretical answer (or list of possible valid answers) to the problem, as code.
"""


@ -663,7 +663,7 @@ def spectrogram_batch(
Specifies log scaling strategy; options are None, "log", "log10", "dB".
reference (`float`, *optional*, defaults to 1.0):
Reference value for dB conversion in log_mel.
min_value (`float`, °optional*, defaults to 1e-10):
min_value (`float`, *optional*, defaults to 1e-10):
Minimum floor value for log scale conversions.
db_range (`float`, *optional*):
Dynamic range for dB scale spectrograms.


@ -542,7 +542,7 @@ class QuantoQuantizedCache(QuantizedCache):
Quantized Cache class that uses `quanto` as a backend to perform quantization. Current implementation supports `int2` and `int4` dtypes only.
Parameters:
cache_config (`QuantizedCacheConfig`,):
cache_config (`QuantizedCacheConfig`):
A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
"""
@ -583,7 +583,7 @@ class HQQQuantizedCache(QuantizedCache):
Quantized Cache class that uses `HQQ` as a backend to perform quantization. Current implementation supports `int2`, `int4`, `int8` dtypes.
Parameters:
cache_config (`QuantizedCacheConfig`,):
cache_config (`QuantizedCacheConfig`):
A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
"""
@ -794,7 +794,7 @@ class StaticCache(Cache):
Static Cache class to be used with `torch.compile(model)` and `torch.export()`.
Parameters:
config (`PretrainedConfig):
config (`PretrainedConfig`):
The configuration file defining the shape-related attributes required to initialize the static cache.
max_batch_size (`int`):
The maximum batch size with which the model will be used.
@ -924,7 +924,7 @@ class SlidingWindowCache(StaticCache):
We overwrite the cache using these, then we always write at cache_position (clamped to `sliding_window`)
Parameters:
config (`PretrainedConfig):
config (`PretrainedConfig`):
The configuration file defining the shape-related attributes required to initialize the static cache.
max_batch_size (`int`):
The maximum batch size with which the model will be used.


@ -225,7 +225,7 @@ def get_resize_output_image_size(
Args:
input_image (`np.ndarray`):
The image to resize.
size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]):
size (`int` or `Tuple[int, int]` or List[int] or `Tuple[int]`):
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to
this.


@ -1389,7 +1389,7 @@ class NeptuneCallback(TrainerCallback):
You can find and copy the name in Neptune from the project settings -> Properties. If None (default), the
value of the `NEPTUNE_PROJECT` environment variable is used.
name (`str`, *optional*): Custom name for the run.
base_namespace (`str`, optional, defaults to "finetuning"): In the Neptune run, the root namespace
base_namespace (`str`, *optional*, defaults to "finetuning"): In the Neptune run, the root namespace
that will contain all of the metadata logged by the callback.
log_parameters (`bool`, *optional*, defaults to `True`):
If True, logs all Trainer arguments and model parameters provided by the Trainer.


@ -266,7 +266,7 @@ class AttentionMaskConverter:
# or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True` which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108).
# Thus, we only set `ignore_causal_mask = True` if the model is set to training.
#
# Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` (`TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor`).
# Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` ("TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor").
if (
(is_training or not is_tracing)
and (query_length == 1 or key_value_length == query_length)


@ -39,7 +39,7 @@ def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.T
Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
Return:
indices (`torch.Tensor):
indices (`torch.Tensor`):
The indices of non-masked tokens from the flattened input sequence.
cu_seqlens (`torch.Tensor`):
The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
@ -83,7 +83,7 @@ def _upad_input(
Target length.
Return:
query_layer (`torch.Tensor):
query_layer (`torch.Tensor`):
Query state without padding. Shape: (total_target_length, num_heads, head_dim).
key_layer (`torch.Tensor`):
Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
@ -149,7 +149,7 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids):
Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
Return:
query (`torch.Tensor):
query (`torch.Tensor`):
Query state without padding. Shape: (total_target_length, num_heads, head_dim).
key (`torch.Tensor`):
Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).


@ -1444,7 +1444,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
Args:
dataset (`Any`):
A [~`datasets.Dataset`] to be wrapped as a `tf.data.Dataset`.
batch_size (`int`, defaults to 8):
batch_size (`int`, *optional*, defaults to 8):
The size of batches to return.
shuffle (`bool`, defaults to `True`):
Whether to return samples from the dataset in random order. Usually `True` for training datasets and
@ -3442,7 +3442,7 @@ class TFSequenceSummary(keras.layers.Layer):
- **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
- **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation to use to initialize the weights.
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
"""


@ -105,10 +105,10 @@ class AutoformerConfig(PretrainedConfig):
label_length (`int`, *optional*, defaults to 10):
Start token length of the Autoformer decoder, which is used for direct multi-step prediction (i.e.
non-autoregressive generation).
moving_average (`int`, defaults to 25):
moving_average (`int`, *optional*, defaults to 25):
The window size of the moving average. In practice, it's the kernel size in AvgPool1d of the Decomposition
Layer.
autocorrelation_factor (`int`, defaults to 3):
autocorrelation_factor (`int`, *optional*, defaults to 3):
"Attention" (i.e. AutoCorrelation mechanism) factor which is used to find top k autocorrelations delays.
It's recommended in the paper to set it to a number between 1 and 5.


@ -1219,7 +1219,7 @@ class BertForPreTraining(BertPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@ -1291,7 +1291,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Return:

View File

@ -2290,7 +2290,7 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@ -57,7 +57,7 @@ def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torc
Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
attention_mask (`torch.Tensor`):
Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
num_heads (`int`, *required*):
num_heads (`int`):
number of heads
dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
dtype of the output tensor
@ -94,13 +94,13 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
Dropout add function
Args:
x (`torch.tensor`, *required*):
x (`torch.tensor`):
input tensor
residual (`torch.tensor`, *required*):
residual (`torch.tensor`):
residual tensor
prob (`float`, *required*):
prob (`float`):
dropout probability
training (`bool`, *required*):
training (`bool`):
training mode
"""
out = F.dropout(x, p=prob, training=training)
@ -114,7 +114,7 @@ def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor:
make the model jitable.
Args:
x (`torch.tensor`, *required*):
x (`torch.tensor`):
input hidden states
"""
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
@ -126,9 +126,9 @@ def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
0.3989423 * x * torch.exp(-0.5 * x * x)
Args:
g (`torch.tensor`, *required*):
g (`torch.tensor`):
gradient output tensor
x (`torch.tensor`, *required*):
x (`torch.tensor`):
input tensor
"""
x = x[0] # x is a tuple of 1 element, needs to unpack it first
@ -210,7 +210,7 @@ class BloomAttention(nn.Module):
without making any copies, results share same memory storage as `fused_qkv`
Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns:
query: [batch_size, num_heads, seq_length, head_dim]
@ -229,7 +229,7 @@ class BloomAttention(nn.Module):
Merge heads together over the last dimension
Args:
x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]
Returns:
torch.tensor: [batch_size, seq_length, num_heads * head_dim]


@ -247,7 +247,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
Image to resize.
size (`Dict[str, int]`):
Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
size_divisor (`int`, defaults to 32):
size_divisor (`int`, *optional*, defaults to 32):
The image is resized to a size that is a multiple of this value.
resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resiizing the image.


@ -972,7 +972,7 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -173,7 +173,7 @@ class ClvpFeatureExtractor(SequenceFeatureExtractor):
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
padding_value (`float`, defaults to 0.0):
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values / vectors.
max_length (`int`, *optional*):
The maximum input length of the inputs.


@ -41,9 +41,9 @@ class ConvNextConfig(BackboneConfigMixin, PretrainedConfig):
Args:
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
patch_size (`int`, optional, defaults to 4):
patch_size (`int`, *optional*, defaults to 4):
Patch size to use in the patch embedding layer.
num_stages (`int`, optional, defaults to 4):
num_stages (`int`, *optional*, defaults to 4):
The number of stages in the model.
hidden_sizes (`List[int]`, *optional*, defaults to [96, 192, 384, 768]):
Dimensionality (hidden size) at each stage.


@ -35,9 +35,9 @@ class ConvNextV2Config(BackboneConfigMixin, PretrainedConfig):
Args:
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
patch_size (`int`, optional, defaults to 4):
patch_size (`int`, *optional*, defaults to 4):
Patch size to use in the patch embedding layer.
num_stages (`int`, optional, defaults to 4):
num_stages (`int`, *optional*, defaults to 4):
The number of stages in the model.
hidden_sizes (`List[int]`, *optional*, defaults to `[96, 192, 384, 768]`):
Dimensionality (hidden size) at each stage.


@ -175,7 +175,7 @@ class TFConvNextV2Layer(keras.layers.Layer):
Model configuration class.
dim (`int`):
Number of input channels.
drop_path (`float`, defaults to 0.0):
drop_path (`float`, *optional*, defaults to 0.0):
Stochastic depth rate.
"""


@ -1077,7 +1077,7 @@ class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -37,8 +37,8 @@ class DbrxAttentionConfig(PretrainedConfig):
The dropout probability for the attention layers.
clip_qkv (`float`, *optional*):
If set, clip the queries, keys, and values in the attention layer to this value.
kv_n_heads (`Optional[int]`, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
rope_theta (`float`, defaults to 10000.0): The base frequency for rope.
kv_n_heads (`int`, *optional*, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
"""
def __init__(
@ -92,11 +92,11 @@ class DbrxFFNConfig(PretrainedConfig):
ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN.
The dict should have a key 'name' with the value being the name of the activation function along with
any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
ffn_hidden_size (`int`, defaults to 3584): The hidden size of the feedforward network.
moe_num_experts (`int`, defaults to 4): The number of experts in the mixture of experts layer.
moe_top_k (`int`, defaults to 1): The number of experts to use in the mixture of experts layer.
ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture of experts layer.
moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture of experts layer.
moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer.
moe_loss_weight (`float`, defaults to 0.01): The loss weight for the mixture of experts layer.
moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture of experts layer.
moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
"""


@ -144,7 +144,7 @@ def load_balancing_loss_func(
Number of experts.
top_k (`int`):
The number of experts each token is routed to.
attention_mask (`torch.Tensor`, None):
attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function
shape [batch_size X sequence_length] if not None.
@ -757,16 +757,16 @@ class DbrxBlock(nn.Module):
Args:
hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
position_ids (`torch.LongTensor`): position ids of shape `(batch, seq_len)`
attention_mask (`torch.Tensor`, optional): attention mask of size (batch_size, sequence_length)
attention_mask (`torch.Tensor`, *optional*): attention mask of size (batch_size, sequence_length)
if flash attention is used or (batch_size, 1, query_sequence_length, key_sequence_length)
if default attention is used.
past_key_value (`Tuple(torch.Tensor)`, optional): cached past key and value projection states
output_attentions (`bool`, optional): Whether or not to return the attentions tensors of all
past_key_value (`Tuple(torch.Tensor)`, *optional*): cached past key and value projection states
output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all
attention layers. See `attentions` under returned tensors for more detail.
output_router_logits (`bool`, optional): Whether or not to return the router logits.
use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are
output_router_logits (`bool`, *optional*): Whether or not to return the router logits.
use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are
returned and can be used to speed up decoding (see `past_key_values`).
cache_position (`torch.LongTensor`, optional): position ids of the cache
cache_position (`torch.LongTensor`, *optional*): position ids of the cache
"""
# Norm + Attention + Norm


@ -80,7 +80,7 @@ class DebertaConfig(PretrainedConfig):
pos_att_type (`List[str]`, *optional*):
The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
`["p2c", "c2p"]`.
layer_norm_eps (`float`, optional, defaults to 1e-12):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example:


@ -602,10 +602,10 @@ class DisentangledSelfAttention(nn.Module):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
output_attentions (`bool`, optional):
output_attentions (`bool`, *optional*):
Whether return the attention matrix.
query_states (`torch.FloatTensor`, optional):
query_states (`torch.FloatTensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*.
relative_pos (`torch.LongTensor`):


@ -669,10 +669,10 @@ class TFDebertaDisentangledSelfAttention(keras.layers.Layer):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
return_att (`bool`, optional):
return_att (`bool`, *optional*):
Whether return the attention matrix.
query_states (`tf.Tensor`, optional):
query_states (`tf.Tensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*.
relative_pos (`tf.Tensor`):


@ -80,7 +80,7 @@ class DebertaV2Config(PretrainedConfig):
pos_att_type (`List[str]`, *optional*):
The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
`["p2c", "c2p"]`, `["p2c", "c2p"]`.
layer_norm_eps (`float`, optional, defaults to 1e-12):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example:


@ -678,10 +678,10 @@ class DisentangledSelfAttention(nn.Module):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
output_attentions (`bool`, optional):
output_attentions (`bool`, *optional*):
Whether return the attention matrix.
query_states (`torch.FloatTensor`, optional):
query_states (`torch.FloatTensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*.
relative_pos (`torch.LongTensor`):


@ -738,10 +738,10 @@ class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
return_att (`bool`, optional):
return_att (`bool`, *optional*):
Whether return the attention matrix.
query_states (`tf.Tensor`, optional):
query_states (`tf.Tensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*.
relative_pos (`tf.Tensor`):


@ -1019,7 +1019,7 @@ class ErnieForPreTraining(ErniePreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@ -993,7 +993,7 @@ class EsmForMaskedLM(EsmPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -1232,7 +1232,7 @@ class TFEsmForMaskedLM(TFEsmPreTrainedModel, TFMaskedLanguageModelingLoss):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -229,13 +229,13 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
Dropout add function
Args:
x (`torch.tensor`, *required*):
x (`torch.tensor`):
input tensor
residual (`torch.tensor`, *required*):
residual (`torch.tensor`):
residual tensor
prob (`float`, *required*):
prob (`float`):
dropout probability
training (`bool`, *required*):
training (`bool`):
training mode
"""
out = F.dropout(x, p=prob, training=training)
@ -315,7 +315,7 @@ class FalconAttention(nn.Module):
Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv`
Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
@ -347,7 +347,7 @@ class FalconAttention(nn.Module):
Merge heads together over the last dimension
Args:
x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]
Returns:
torch.tensor: [batch_size, seq_length, num_heads * head_dim]


@ -389,16 +389,16 @@ class FlavaImageCodebookConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.
Args:
num_groups (`int`, defaults to 4):
num_groups (`int`, *optional*, defaults to 4):
Number of groups to be created. This parameter as of now doesn't affect the model and is used for some
internal calculation and estimations.
input_channels (`int`, defaults to 3):
input_channels (`int`, *optional*, defaults to 3):
Number of channels in the image to be passed.
num_blocks_per_group (`int`, defaults to 2):
num_blocks_per_group (`int`, *optional*, defaults to 2):
Number of conv-based blocks per group.
hidden_size (`int`, defaults to 256):
hidden_size (`int`, *optional*, defaults to 256):
Size of hidden dim for the blocks.
vocab_size (`int`, defaults to 8192):
vocab_size (`int`, *optional*, defaults to 8192):
Size of the output vocabulary for the codebook.
freeze (`bool`, defaults to `True`):
Whether to freeze the weights of the model.


@ -176,7 +176,7 @@ class FlavaForPreTrainingOutput(ModelOutput):
The output of the [`FlavaTextModel`].
multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
multimodal_masked_output (`BaseModelOutputWithPooling`, returned when `input_ids_masked` and `pixel_values` are present):
multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
The output of the [`FlavaMultimodalModel`].
mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):


@ -651,7 +651,7 @@ class FNetForPreTraining(FNetPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@ -501,9 +501,9 @@ class FSMTEncoder(nn.Module):
BaseModelOutput or Tuple comprised of:
- **x** (`torch.Tensor`): the last encoder layer's output of shape *(src_len, batch, embed_dim)*
- **encoder_states** (`Tuple(torch.FloatTensor`)): all intermediate hidden states of shape *(src_len,
- **encoder_states** (`Tuple(torch.FloatTensor)`): all intermediate hidden states of shape *(src_len,
batch, embed_dim)*. Only populated if *output_hidden_states:* is True.
- **all_attentions** (`Tuple(torch.FloatTensor`)): Attention weights for each layer.
- **all_attentions** (`Tuple(torch.FloatTensor)`): Attention weights for each layer.
During training might not be of length n_layers because of layer dropout.
"""
# check attention mask and invert


@ -839,7 +839,7 @@ PARALLELIZE_DOCSTRING = r"""
it will evenly distribute blocks across all devices.
Args:
device_map (`Dict[int, list]`, optional, defaults to None):
device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the


@ -587,7 +587,7 @@ PARALLELIZE_DOCSTRING = r"""
across all devices.
Args:
device_map (`Dict[int, list]`, optional, defaults to None):
device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the GPT-J models have the


@ -892,7 +892,7 @@ class IBertForMaskedLM(IBertPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -165,7 +165,7 @@ class IdeficsConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information.
Args:
additional_vocab_size (`int`, *optional`, defaults to 0):
additional_vocab_size (`int`, *optional*, defaults to 0):
Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
are always trainable whereas regular vocab tokens can be frozen or not.
vocab_size (`int`, *optional*, defaults to 32000):


@ -97,7 +97,7 @@ def load_balancing_loss_func(
router_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None):
attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function
shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*):


@ -69,7 +69,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None):
attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function
shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*):


@ -133,7 +133,7 @@ class Kosmos2Processor(ProcessorMixin):
Args:
bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
The bounding bboxes associated to `texts`.
num_image_tokens (`int`, defaults to 64):
num_image_tokens (`int`, *optional* defaults to 64):
The number of (consecutive) places that are used to mark the placeholders to store image information.
This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
first_image_token_id (`int`, *optional*):


@ -79,7 +79,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
Calculate the number of patches after the preprocessing for images of any resolution.
Args:
image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]):
image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
The size of the input image in the format (height, width). ?
grid_pinpoints (`List`):
A list containing possible resolutions. Each item in the list should be a tuple or list


@ -85,7 +85,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
Calculate the number of patches after the preprocessing for images of any resolution.
Args:
image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]):
image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
The size of the input image in the format (height, width). ?
grid_pinpoints (`List`):
A list containing possible resolutions. Each item in the list should be a tuple or list


@ -1790,7 +1790,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@ -1810,7 +1810,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross(masked)-attention of the decoder.
feature_size_list (`List[torch.Size]` ):
feature_size_list (`List[torch.Size]`):
This is a list containing shapes (height & width) of multi-scale features from the Pixel Decoder.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under


@ -1049,7 +1049,7 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:


@ -84,7 +84,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None):
attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function
shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*):


@ -67,7 +67,7 @@ PARALLELIZE_DOCSTRING = r"""
it will evenly distribute blocks across all devices.
Args:
device_map (`Dict[int, list]`, optional, defaults to None):
device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the


@ -1160,7 +1160,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
Args:
outputs ([`OneFormerForUniversalSegmentationOutput`]):
The outputs from [`OneFormerForUniversalSegmentationOutput`].
task_type (`str`, *optional)*, defaults to "instance"):
task_type (`str`, *optional*, defaults to "instance"):
The post processing depends on the task token input. If the `task_type` is "panoptic", we need to
ignore the stuff predictions.
is_demo (`bool`, *optional)*, defaults to `True`):


@ -117,7 +117,7 @@ def _preprocess_resize_output_shape(image, output_shape):
channels is preserved.
Returns
image (`np.ndarray):
image (`np.ndarray`):
The input image, but with additional singleton dimensions appended in the case where `len(output_shape) >
input.ndim`.
output_shape (`Tuple`):


@ -162,7 +162,7 @@ class PatchTSMixerNormLayer(nn.Module):
"""Normalization block
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
"""
@ -234,7 +234,7 @@ class PatchTSMixerChannelFeatureMixerBlock(nn.Module):
"""This module mixes the features in the channel dimension.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
"""
@ -441,7 +441,7 @@ class PatchMixerBlock(nn.Module):
"""This module mixes the patch dimension.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
"""
@ -510,7 +510,7 @@ class FeatureMixerBlock(nn.Module):
"""This module mixes the hidden feature dimension.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
"""
@ -556,7 +556,7 @@ class PatchTSMixerLayer(nn.Module):
The `PatchTSMixer` layer that does all three kinds of mixing.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
"""
@ -593,7 +593,7 @@ class PatchTSMixerBlock(nn.Module):
"""The main computing framework of the `PatchTSMixer` model.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
"""
@ -634,7 +634,8 @@ class PatchTSMixerForPredictionHead(nn.Module):
"""Prediction Head for Forecasting
Args:
config (`PatchTSMixerConfig`, *required*): Configuration.
config (`PatchTSMixerConfig`):
Configuration.
"""
def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
@ -689,8 +690,8 @@ class PatchTSMixerLinearHead(nn.Module):
"""Linear head for Classification and Regression.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
"""
def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
@ -785,7 +786,7 @@ class PatchTSMixerPretrainHead(nn.Module):
"""Pretraining head.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
"""
@ -1189,7 +1190,7 @@ class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel):
Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
"""
@ -1411,7 +1412,7 @@ class PatchTSMixerForPretraining(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for mask pretraining.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
Returns:
@ -1593,7 +1594,7 @@ class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for forecasting application.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
Returns:
@ -1826,7 +1827,7 @@ class PatchTSMixerForTimeSeriesClassification(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for classification application.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
Returns:
@ -1997,7 +1998,7 @@ class PatchTSMixerForRegression(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for regression application.
Args:
config (`PatchTSMixerConfig`, *required*):
config (`PatchTSMixerConfig`):
Configuration.
Returns:


@ -258,7 +258,7 @@ class PersimmonAttention(nn.Module):
storage as `fused_qkv`
Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]


@ -75,7 +75,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None):
attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function
shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*):


@ -792,7 +792,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function.
Returns:
@ -1261,7 +1261,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function.
Returns:


@ -886,7 +886,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function.
Returns:
@ -1400,7 +1400,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function.
Returns:


@ -1073,7 +1073,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -1075,7 +1075,7 @@ class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -1151,7 +1151,7 @@ class RoCBertForPreTraining(RoCBertPreTrainedModel):
ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
Returns:


@ -59,7 +59,7 @@ class SegGptEncoderOutput(ModelOutput):
attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
Tuple of *torch.FloatTensor* (one for each layer) of shape
`(batch_size, num_heads, seq_len, seq_len)`.
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.intermediate_hidden_state_indices` is set):
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
Additionaly, each feature passes through a LayerNorm.
@ -77,7 +77,7 @@ class SegGptImageSegmentationOutput(ModelOutput):
Output type of [`SegGptImageSegmentationOutput`].
Args:
loss (`torch.FloatTensor`, `optional`, returned when `labels` is provided):
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
The loss value.
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
The predicted masks.


@ -745,10 +745,10 @@ class DisentangledSelfAttention(nn.Module):
sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
th token.
output_attentions (`bool`, optional):
output_attentions (`bool`, *optional*):
Whether return the attention matrix.
query_states (`torch.FloatTensor`, optional):
query_states (`torch.FloatTensor`, *optional*):
The *Q* state in *Attention(Q,K,V)*.
relative_pos (`torch.LongTensor`):


@ -220,7 +220,7 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors.
padding_value (`float`, defaults to 0.0):
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values / vectors.
"""


@ -181,7 +181,7 @@ PARALLELIZE_DOCSTRING = r"""
it will evenly distribute blocks across all devices.
Args:
device_map (`Dict[int, list]`, optional, defaults to None):
device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the t5 models have the


@ -1249,7 +1249,7 @@ class TapasTokenizer(PreTrainedTokenizer):
Total number of table columns
max_length (`int`):
Total maximum length.
truncation_strategy (`str` or [`TapasTruncationStrategy`]):
truncation_strategy (`str` or [`TapasTruncationStrategy]`):
Truncation strategy to use. Seeing as this method should only be called when truncating, the only
available strategy is the `"drop_rows_to_fit"` strategy.


@ -833,7 +833,7 @@ class UdopTokenizer(PreTrainedTokenizer):
</Tip>
Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).


@ -814,7 +814,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
</Tip>
Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).


@ -243,7 +243,7 @@ class ViltImageProcessor(BaseImageProcessor):
Image to resize.
size (`Dict[str, int]`):
Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
size_divisor (`int`, defaults to 32):
size_divisor (`int`, *optional*, defaults to 32):
The image is resized to a size that is a multiple of this value.
resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resiizing the image.


@ -182,7 +182,7 @@ def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_siz
Relative position embeddings (Lw, num_channels) for width axis.
q_size (`Tuple[int]`):
Spatial sequence size of query q with (queries_height, queries_width).
k_size (`Tuple[int]`]):
k_size (`Tuple[int]`):
Spatial sequence size of key k with (keys_height, keys_width).
Returns:


@ -36,11 +36,11 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
most of the main methods. Users should refer to this superclass for more information regarding those methods.
Args:
feature_size (`int`, defaults to 1):
feature_size (`int`, *optional*, defaults to 1):
The feature dimension of the extracted features.
sampling_rate (`int`, defaults to 16000):
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
padding_value (`float`, defaults to 0.0):
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
@ -166,7 +166,7 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors.
padding_value (`float`, defaults to 0.0):
padding_value (`float`, *optional*, defaults to 0.0):
"""
if sampling_rate is not None:


@ -184,9 +184,9 @@ class Wav2Vec2ConformerConfig(PretrainedConfig):
If `"rotary"` position embeddings are used, defines the size of the embedding base.
max_source_positions (`int`, *optional*, defaults to 5000):
if `"relative"` position embeddings are used, defines the maximum source input positions.
conv_depthwise_kernel_size (`int`, defaults to 31):
conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
Kernel size of convolutional depthwise 1D layer in Conformer blocks.
conformer_conv_dropout (`float`, defaults to 0.1):
conformer_conv_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all convolutional layers in Conformer blocks.
Example:


@ -44,16 +44,16 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
Fourier Transform` which should match pytorch's `torch.stft` equivalent.
Args:
feature_size (`int`, defaults to 80):
feature_size (`int`, *optional*, defaults to 80):
The feature dimension of the extracted features.
sampling_rate (`int`, defaults to 16000):
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
hop_length (`int`, defaults to 160):
hop_length (`int`, *optional*, defaults to 160):
Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
chunk_length (`int`, defaults to 30):
chunk_length (`int`, *optional*, defaults to 30):
The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
sequences.
n_fft (`int`, defaults to 400):
n_fft (`int`, *optional*, defaults to 400):
Size of the Fourier transform.
padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio. Should correspond to silences.
@ -231,7 +231,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
pipeline.
padding_value (`float`, defaults to 0.0):
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values / vectors.
do_normalize (`bool`, *optional*, defaults to `False`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly


@ -1368,7 +1368,7 @@ class WhisperGenerationMixin:
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
num_segment_frames (`int`, defaults to 3000):
num_segment_frames (`int`, *optional*, defaults to 3000):
The number of log-mel frames the model expects
Return:


@ -565,7 +565,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method.
time_precision (`float`, `optional`, defaults to 0.02):
time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time.
"""
offsets = []
@ -615,7 +615,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.
Args:
time_precision (`float`, `optional`, defaults to 0.02):
time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time.
"""
return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
@ -671,7 +671,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
output_offsets (`bool`, *optional*, defaults to `False`):
Whether or not to output the offsets of the tokens. This should only be set if the model predicted
timestamps.
time_precision (`float`, `optional`, defaults to 0.02):
time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time.
decode_with_timestamps (`bool`, *optional*, defaults to `False`):
Whether or not to decode with timestamps included in the raw text.


@ -207,7 +207,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
Args:
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
List of tokenized input ids. Can be obtained using the `__call__` method.
time_precision (`float`, `optional`, defaults to 0.02):
time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time.
"""
offsets = []
@ -258,7 +258,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.
Args:
time_precision (`float`, `optional`, defaults to 0.02):
time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time.
"""
return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
@ -317,7 +317,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
output_offsets (`bool`, *optional*, defaults to `False`):
Whether or not to output the offsets of the tokens. This should only be set if the model predicted
timestamps.
time_precision (`float`, `optional`, defaults to 0.02):
time_precision (`float`, *optional*, defaults to 0.02):
The time ratio to convert from token to time.
decode_with_timestamps (`bool`, *optional*, defaults to `False`):
Whether or not to decode with timestamps included in the raw text.


@ -1081,7 +1081,7 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -1039,7 +1039,7 @@ class XLMRobertaXLForMaskedLM(XLMRobertaXLPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -1173,7 +1173,7 @@ class XmodForMaskedLM(XmodPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict


@ -647,8 +647,9 @@ class YolosModel(YolosPreTrainedModel):
Prunes heads of the model.
Args:
heads_to_prune (`dict` of {layer_num: list of heads to prune in this layer}):
See base class `PreTrainedModel`.
heads_to_prune (`dict`):
See base class `PreTrainedModel`. The input dictionary must have the following format: {layer_num:
list of heads to prune in this layer}
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)


@ -218,7 +218,7 @@ def infer_framework_load_model(
If both frameworks are installed and available for `model`, PyTorch is selected.
Args:
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
The model to infer the framework from. If `str`, a checkpoint name. The model to infer the framewrok from.
config ([`AutoConfig`]):
The config associated with the model to help using the correct class
@ -322,7 +322,7 @@ def infer_framework_from_model(
If both frameworks are installed and available for `model`, PyTorch is selected.
Args:
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
The model to infer the framework from. If `str`, a checkpoint name. The model to infer the framewrok from.
model_classes (dictionary `str` to `type`, *optional*):
A mapping framework to class.
@ -349,7 +349,7 @@ def get_framework(model, revision: Optional[str] = None):
Select framework (TensorFlow or PyTorch) to use.
Args:
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
the model name). If no specific model is provided, defaults to using PyTorch.
"""
@ -385,7 +385,7 @@ def get_default_model_and_revision(
Select a default model to use for a given task. Defaults to pytorch if ambiguous.
Args:
targeted_task (`Dict` ):
targeted_task (`Dict`):
Dictionary representing the given task, that should contain default models
framework (`str`, None)


@ -22,7 +22,7 @@ logger = logging.get_logger(__name__)
@add_end_docstrings(
build_pipeline_init_args(has_tokenizer=True),
r"""
top_k (`int`, defaults to 5):
top_k (`int`, *optional*, defaults to 5):
The number of predictions to return.
targets (`str` or `List[str]`, *optional*):
When passed, the model will limit the scores to the passed targets instead of looking up in the whole


@ -31,7 +31,7 @@ class PipelineIterator(IterableDataset):
```
Arguments:
loader (`torch.utils.data.DataLoader` or any iterator):
loader (`torch.utils.data.DataLoader` or `Iterable`):
The iterator that will be used to apply `infer` on.
infer (any function):
The function to apply of each element of `loader`.
@ -163,7 +163,7 @@ class PipelineChunkIterator(PipelineIterator):
```
Arguments:
loader (`torch.utils.data.DataLoader` or any iterator):
loader (`torch.utils.data.DataLoader` or `Iterable`):
The iterator that will be used to apply `infer` on.
infer (any function):
The function to apply of each element of `loader`.
@ -224,7 +224,7 @@ class PipelinePackIterator(PipelineIterator):
```
Arguments:
loader (`torch.utils.data.DataLoader` or any iterator):
loader (`torch.utils.data.DataLoader` or `Iterable`):
The iterator that will be used to apply `infer` on.
infer (any function):
The function to apply of each element of `loader`.


@ -3200,7 +3200,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
</Tip>
Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).


@ -745,7 +745,7 @@ class Trainer:
Add a callback to the current list of [`~transformers.TrainerCallback`].
Args:
callback (`type` or [`~transformers.TrainerCallback`]):
callback (`type` or [`~transformers.TrainerCallback]`):
A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
first case, will instantiate a member of that class.
"""
@ -758,7 +758,7 @@ class Trainer:
If the callback is not found, returns `None` (and no error is raised).
Args:
callback (`type` or [`~transformers.TrainerCallback`]):
callback (`type` or [`~transformers.TrainerCallback]`):
A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
first case, will pop the first member of that class found in the list of callbacks.
@ -772,7 +772,7 @@ class Trainer:
Remove a callback from the current list of [`~transformers.TrainerCallback`].
Args:
callback (`type` or [`~transformers.TrainerCallback`]):
callback (`type` or [`~transformers.TrainerCallback]`):
A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
first case, will remove the first member of that class found in the list of callbacks.
"""


@ -80,7 +80,7 @@ class Seq2SeqTrainer(Trainer):
Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments.
Args:
gen_config_arg (`str` or [`~generation.GenerationConfig`]):
gen_config_arg (`str` or [`~generation.GenerationConfig]`):
`Seq2SeqTrainingArguments.generation_config` argument.
Returns:


@ -1605,7 +1605,7 @@ def direct_transformers_import(path: str, file="__init__.py") -> ModuleType:
Args:
path (`str`): The path to the source file
file (`str`, optional): The file to join with the path. Defaults to "__init__.py".
file (`str`, *optional*): The file to join with the path. Defaults to "__init__.py".
Returns:
`ModuleType`: The resulting imported module