Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-04 05:10:06 +06:00)
Docs: formatting nits (#32247)

* doc formatting nits
* ignore non-autodocs
* Apply suggestions from code review
* Update src/transformers/models/esm/modeling_esm.py
* Update src/transformers/models/esm/modeling_esm.py
* make fixup

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in: parent 2fbbcf5007, commit e68ec18ce2

@@ -77,7 +77,7 @@ Then use `notebook_login` to sign-in to the Hub, and follow the link [here](http

 To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly.

 Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework.

 <frameworkcontent>
 <pt>

@@ -147,7 +147,7 @@ def get_original_command(max_width=80, full_python_path=False):
 Return the original command line string that can be replayed nicely and wrapped for 80 char width.

 Args:
-max_width (`int`, `optional`, defaults to 80):
+max_width (`int`, *optional*, defaults to 80):
 The width to wrap for.
 full_python_path (`bool`, `optional`, defaults to `False`):
 Whether to replicate the full path or just the last segment (i.e. `python`).

@@ -113,7 +113,7 @@ class Problem:
 The inputs that will be fed to the tools. For this testing environment, only strings are accepted as
 values. Pass along a dictionary when you want to specify the values of each inputs, or just the list of
 inputs expected (the value used will be `<<input_name>>` in this case).
-answer (`str` or `list[str`]):
+answer (`str` or `list[str]`):
 The theoretical answer (or list of possible valid answers) to the problem, as code.
 """

@@ -663,7 +663,7 @@ def spectrogram_batch(
 Specifies log scaling strategy; options are None, "log", "log10", "dB".
 reference (`float`, *optional*, defaults to 1.0):
 Reference value for dB conversion in log_mel.
-min_value (`float`, °optional*, defaults to 1e-10):
+min_value (`float`, *optional*, defaults to 1e-10):
 Minimum floor value for log scale conversions.
 db_range (`float`, *optional*):
 Dynamic range for dB scale spectrograms.

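For context on the dB options documented above, decibel conversion is conventionally a log10 ratio against `reference`, floored at `min_value` and optionally clipped to `db_range`. A minimal NumPy sketch of that convention (not the exact `spectrogram_batch` internals):

```python
import numpy as np

def power_to_db_sketch(spec, reference=1.0, min_value=1e-10, db_range=None):
    """Hedged sketch: 10 * log10(spec / reference), floored and optionally clipped."""
    log_spec = 10.0 * np.log10(np.maximum(spec, min_value))
    log_spec -= 10.0 * np.log10(np.maximum(reference, min_value))
    if db_range is not None:
        # keep at most `db_range` dB below the peak
        log_spec = np.maximum(log_spec, log_spec.max() - db_range)
    return log_spec
```
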
@@ -542,7 +542,7 @@ class QuantoQuantizedCache(QuantizedCache):
 Quantized Cache class that uses `quanto` as a backend to perform quantization. Current implementation supports `int2` and `int4` dtypes only.

 Parameters:
-cache_config (`QuantizedCacheConfig`,):
+cache_config (`QuantizedCacheConfig`):
 A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
 """

@@ -583,7 +583,7 @@ class HQQQuantizedCache(QuantizedCache):
 Quantized Cache class that uses `HQQ` as a backend to perform quantization. Current implementation supports `int2`, `int4`, `int8` dtypes.

 Parameters:
-cache_config (`QuantizedCacheConfig`,):
+cache_config (`QuantizedCacheConfig`):
 A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
 """

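Both quantized caches are usually selected through `generate` rather than built by hand. A minimal sketch (the checkpoint name is an illustrative assumption):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # assumed checkpoint
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

inputs = tok("The KV cache can be quantized to save memory.", return_tensors="pt")
# cache_implementation="quantized" routes to QuantoQuantizedCache / HQQQuantizedCache
out = model.generate(
    **inputs,
    max_new_tokens=20,
    cache_implementation="quantized",
    cache_config={"backend": "quanto", "nbits": 4},
)
```
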
@@ -794,7 +794,7 @@ class StaticCache(Cache):
 Static Cache class to be used with `torch.compile(model)` and `torch.export()`.

 Parameters:
-config (`PretrainedConfig):
+config (`PretrainedConfig`):
 The configuration file defining the shape-related attributes required to initialize the static cache.
 max_batch_size (`int`):
 The maximum batch size with which the model will be used.

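A static cache keeps the KV tensors at a fixed shape, which is what lets `torch.compile` avoid recompiling as the cache grows. A minimal sketch of the documented usage (checkpoint name assumed):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/gemma-2b")  # assumed checkpoint
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")

# preallocate a fixed-shape KV cache, then compile the forward pass
model.generation_config.cache_implementation = "static"
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

inputs = tok("Static caches preallocate", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=16)
```
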
@@ -924,7 +924,7 @@ class SlidingWindowCache(StaticCache):
 We overwrite the cache using these, then we always write at cache_position (clamped to `sliding_window`)

 Parameters:
-config (`PretrainedConfig):
+config (`PretrainedConfig`):
 The configuration file defining the shape-related attributes required to initialize the static cache.
 max_batch_size (`int`):
 The maximum batch size with which the model will be used.

@@ -225,7 +225,7 @@ def get_resize_output_image_size(
 Args:
 input_image (`np.ndarray`):
 The image to resize.
-size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]):
+size (`int` or `Tuple[int, int]` or List[int] or `Tuple[int]`):
 The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to
 this.

@@ -1389,7 +1389,7 @@ class NeptuneCallback(TrainerCallback):
 You can find and copy the name in Neptune from the project settings -> Properties. If None (default), the
 value of the `NEPTUNE_PROJECT` environment variable is used.
 name (`str`, *optional*): Custom name for the run.
-base_namespace (`str`, optional, defaults to "finetuning"): In the Neptune run, the root namespace
+base_namespace (`str`, *optional*, defaults to "finetuning"): In the Neptune run, the root namespace
 that will contain all of the metadata logged by the callback.
 log_parameters (`bool`, *optional*, defaults to `True`):
 If True, logs all Trainer arguments and model parameters provided by the Trainer.

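For reference, the callback is passed to `Trainer` like any other `TrainerCallback`. A minimal sketch (`model` and `train_dataset` are assumed to exist, and the project name is illustrative):

```python
from transformers import Trainer, TrainingArguments
from transformers.integrations import NeptuneCallback

callback = NeptuneCallback(
    project="my-workspace/my-project",  # falls back to the NEPTUNE_PROJECT env var
    base_namespace="finetuning",
    log_parameters=True,
)
trainer = Trainer(
    model=model,                          # assumed defined earlier
    args=TrainingArguments(output_dir="out"),
    train_dataset=train_dataset,          # assumed defined earlier
    callbacks=[callback],
)
```
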
@@ -266,7 +266,7 @@ class AttentionMaskConverter:
 # or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True` which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108).
 # Thus, we only set `ignore_causal_mask = True` if the model is set to training.
 #
-# Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` (`TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor`).
+# Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` ("TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor").
 if (
 (is_training or not is_tracing)
 and (query_length == 1 or key_value_length == query_length)

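The comment above is about `is_causal` needing to be a plain Python bool at trace/export time. A hedged sketch of the guard it describes, paraphrased rather than copied from `AttentionMaskConverter`:

```python
import torch
import torch.nn.functional as F

def sdpa_causal(q, k, v, is_training: bool, is_tracing: bool):
    q_len, kv_len = q.shape[2], k.shape[2]
    # Same guard as in the hunk: only rely on `is_causal` when we are not
    # tracing (or are training), so it never gets hard-coded wrongly into
    # an exported graph.
    if (is_training or not is_tracing) and (q_len == 1 or kv_len == q_len):
        # q_len == 1 needs no causal masking; q_len == kv_len can use is_causal=True
        return F.scaled_dot_product_attention(q, k, v, is_causal=q_len > 1)
    # otherwise fall back to an explicit boolean causal mask
    mask = torch.ones(q_len, kv_len, dtype=torch.bool, device=q.device).tril(kv_len - q_len)
    return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

q = k = v = torch.randn(1, 8, 4, 64)
print(sdpa_causal(q, k, v, is_training=False, is_tracing=False).shape)
```
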
@@ -39,7 +39,7 @@ def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.T
 Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

 Return:
-indices (`torch.Tensor):
+indices (`torch.Tensor`):
 The indices of non-masked tokens from the flattened input sequence.
 cu_seqlens (`torch.Tensor`):
 The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).

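The three return values fall out of the mask directly. A minimal sketch consistent with the documented returns (the real helper may differ in details):

```python
import torch
import torch.nn.functional as F

def get_unpad_data_sketch(attention_mask: torch.Tensor):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    # flatten (batch, seq) and keep the positions where the mask is non-zero
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max())
    # prepend 0 so cu_seqlens[i]:cu_seqlens[i+1] slices out sequence i
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch

mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
print(get_unpad_data_sketch(mask))  # indices=[0,1,2,4,5], cu_seqlens=[0,3,5], max=3
```
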
@@ -83,7 +83,7 @@ def _upad_input(
 Target length.

 Return:
-query_layer (`torch.Tensor):
+query_layer (`torch.Tensor`):
 Query state without padding. Shape: (total_target_length, num_heads, head_dim).
 key_layer (`torch.Tensor`):
 Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).

@@ -149,7 +149,7 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids):
 Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

 Return:
-query (`torch.Tensor):
+query (`torch.Tensor`):
 Query state without padding. Shape: (total_target_length, num_heads, head_dim).
 key (`torch.Tensor`):
 Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).

@@ -1444,7 +1444,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
 Args:
 dataset (`Any`):
 A [~`datasets.Dataset`] to be wrapped as a `tf.data.Dataset`.
-batch_size (`int`, defaults to 8):
+batch_size (`int`, *optional*, defaults to 8):
 The size of batches to return.
 shuffle (`bool`, defaults to `True`):
 Whether to return samples from the dataset in random order. Usually `True` for training datasets and

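This docstring appears to belong to `prepare_tf_dataset`. A minimal usage sketch (`tokenized_dataset` is assumed to be a tokenized `datasets.Dataset`):

```python
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

tf_train = model.prepare_tf_dataset(
    tokenized_dataset,
    batch_size=8,      # the argument documented above
    shuffle=True,      # True for training, False for evaluation
    tokenizer=tokenizer,
)
model.compile(optimizer="adam")  # HF TF models can pick a default loss
model.fit(tf_train, epochs=1)
```
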
@@ -3442,7 +3442,7 @@ class TFSequenceSummary(keras.layers.Layer):
 - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
 - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.

-initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
+initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation to use to initialize the weights.
 kwargs (`Dict[str, Any]`, *optional*):
 Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
 """

@@ -105,10 +105,10 @@ class AutoformerConfig(PretrainedConfig):
 label_length (`int`, *optional*, defaults to 10):
 Start token length of the Autoformer decoder, which is used for direct multi-step prediction (i.e.
 non-autoregressive generation).
-moving_average (`int`, defaults to 25):
+moving_average (`int`, *optional*, defaults to 25):
 The window size of the moving average. In practice, it's the kernel size in AvgPool1d of the Decomposition
 Layer.
-autocorrelation_factor (`int`, defaults to 3):
+autocorrelation_factor (`int`, *optional*, defaults to 3):
 "Attention" (i.e. AutoCorrelation mechanism) factor which is used to find top k autocorrelations delays.
 It's recommended in the paper to set it to a number between 1 and 5.

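Defaults like these are overridden at construction time. A quick sketch (the `prediction_length` value is an illustrative assumption):

```python
from transformers import AutoformerConfig, AutoformerModel

config = AutoformerConfig(
    prediction_length=24,        # assumed forecasting horizon
    label_length=10,
    moving_average=25,           # kernel size of the decomposition AvgPool1d
    autocorrelation_factor=3,    # the paper recommends a value in [1, 5]
)
model = AutoformerModel(config)
```
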
@@ -1219,7 +1219,7 @@ class BertForPreTraining(BertPreTrainedModel):

 - 0 indicates sequence B is a continuation of sequence A,
 - 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
 Used to hide legacy arguments that have been deprecated.

 Returns:

@@ -1291,7 +1291,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):

 - 0 indicates sequence B is a continuation of sequence A,
 - 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
 Used to hide legacy arguments that have been deprecated.

 Return:

@@ -2290,7 +2290,7 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):

 - 0 indicates sequence B is a continuation of sequence A,
 - 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
 Used to hide legacy arguments that have been deprecated.

 Returns:

@@ -57,7 +57,7 @@ def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torc
 Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
 attention_mask (`torch.Tensor`):
 Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
-num_heads (`int`, *required*):
+num_heads (`int`):
 number of heads
 dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
 dtype of the output tensor

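ALiBi biases attention scores with per-head slopes times token position. A simplified sketch for a power-of-two `num_heads` (the real `build_alibi_tensor` also handles other head counts):

```python
import torch

def build_alibi_sketch(attention_mask: torch.Tensor, num_heads: int, dtype=torch.bfloat16):
    batch_size, seq_len = attention_mask.shape
    # per-head slopes: 2^(-8i/num_heads) for i = 1..num_heads (power-of-two case)
    slopes = torch.pow(2.0, -8.0 * torch.arange(1, num_heads + 1) / num_heads)
    # distance of each position from the start; padded positions contribute 0
    position = (attention_mask.cumsum(dim=-1) - 1) * attention_mask
    alibi = slopes[None, :, None] * position[:, None, :]
    return alibi.reshape(batch_size * num_heads, 1, seq_len).to(dtype)

mask = torch.ones(2, 5)
print(build_alibi_sketch(mask, num_heads=4).shape)  # torch.Size([8, 1, 5])
```
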
@@ -94,13 +94,13 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
 Dropout add function

 Args:
-x (`torch.tensor`, *required*):
+x (`torch.tensor`):
 input tensor
-residual (`torch.tensor`, *required*):
+residual (`torch.tensor`):
 residual tensor
-prob (`float`, *required*):
+prob (`float`):
 dropout probability
-training (`bool`, *required*):
+training (`bool`):
 training mode
 """
 out = F.dropout(x, p=prob, training=training)

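The function body continues past the hunk; completing it is a one-liner. A self-contained sketch (the `return residual + out` line is assumed from context):

```python
import torch
import torch.nn.functional as F

def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
    # apply dropout to x, then add the residual branch (matches the hunk above)
    out = F.dropout(x, p=prob, training=training)
    return residual + out

h = torch.randn(2, 4)
print(dropout_add(h, residual=torch.zeros_like(h), prob=0.1, training=True).shape)
```
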
@@ -114,7 +114,7 @@ def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor:
 make the model jitable.

 Args:
-x (`torch.tensor`, *required*):
+x (`torch.tensor`):
 input hidden states
 """
 return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

@@ -126,9 +126,9 @@ def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
 0.3989423 * x * torch.exp(-0.5 * x * x)

 Args:
-g (`torch.tensor`, *required*):
+g (`torch.tensor`):
 gradient output tensor
-x (`torch.tensor`, *required*):
+x (`torch.tensor`):
 input tensor
 """
 x = x[0]  # x is a tuple of 1 element, needs to unpack it first

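For reference, the tanh-approximate GELU and its hand-written backward pair up as follows; the backward constants follow from differentiating the forward shown in the previous hunk:

```python
import torch

def gelu_forward(x: torch.Tensor) -> torch.Tensor:
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

def gelu_backward(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # product rule over 0.5*x*(1 + tanh(u(x))); 0.1070322243 = 3*0.044715*0.79788456
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
    return ff * g

x = torch.randn(4, requires_grad=True)
gelu_forward(x).sum().backward()
print(torch.allclose(x.grad, gelu_backward(torch.ones_like(x), x)))  # True
```
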
@@ -210,7 +210,7 @@ class BloomAttention(nn.Module):
 without making any copies, results share same memory storage as `fused_qkv`

 Args:
-fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
+fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

 Returns:
 query: [batch_size, num_heads, seq_length, head_dim]

@@ -229,7 +229,7 @@ class BloomAttention(nn.Module):
 Merge heads together over the last dimension

 Args:
-x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
+x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

 Returns:
 torch.tensor: [batch_size, seq_length, num_heads * head_dim]

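Both helpers are pure `view`/`permute` reshapes. A copy-free sketch matching the documented shapes (simplified relative to the Bloom module itself):

```python
import torch

def split_heads(fused_qkv: torch.Tensor, num_heads: int, head_dim: int):
    batch_size, seq_length, _ = fused_qkv.shape
    # (batch, seq, num_heads * 3 * head_dim) -> (batch, seq, num_heads, 3, head_dim)
    fused_qkv = fused_qkv.view(batch_size, seq_length, num_heads, 3, head_dim)
    return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]

def merge_heads(x: torch.Tensor, num_heads: int, head_dim: int) -> torch.Tensor:
    # (batch * num_heads, seq, head_dim) -> (batch, seq, num_heads * head_dim)
    batch_size = x.shape[0] // num_heads
    x = x.view(batch_size, num_heads, x.shape[1], head_dim)
    return x.permute(0, 2, 1, 3).reshape(batch_size, x.shape[2], num_heads * head_dim)

q, k, v = split_heads(torch.randn(2, 5, 4 * 3 * 16), num_heads=4, head_dim=16)
print(q.shape)  # torch.Size([2, 5, 4, 16])
```
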
@@ -247,7 +247,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
 Image to resize.
 size (`Dict[str, int]`):
 Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
-size_divisor (`int`, defaults to 32):
+size_divisor (`int`, *optional*, defaults to 32):
 The image is resized to a size that is a multiple of this value.
 resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
 Resampling filter to use when resiizing the image.

@@ -972,7 +972,7 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
 Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
 config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
 loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
 Used to hide legacy arguments that have been deprecated.
 """
 return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -173,7 +173,7 @@ class ClvpFeatureExtractor(SequenceFeatureExtractor):
 - `'tf'`: Return TensorFlow `tf.constant` objects.
 - `'pt'`: Return PyTorch `torch.Tensor` objects.
 - `'np'`: Return Numpy `np.ndarray` objects.
-padding_value (`float`, defaults to 0.0):
+padding_value (`float`, *optional*, defaults to 0.0):
 The value that is used to fill the padding values / vectors.
 max_length (`int`, *optional*):
 The maximum input length of the inputs.

@@ -41,9 +41,9 @@ class ConvNextConfig(BackboneConfigMixin, PretrainedConfig):
 Args:
 num_channels (`int`, *optional*, defaults to 3):
 The number of input channels.
-patch_size (`int`, optional, defaults to 4):
+patch_size (`int`, *optional*, defaults to 4):
 Patch size to use in the patch embedding layer.
-num_stages (`int`, optional, defaults to 4):
+num_stages (`int`, *optional*, defaults to 4):
 The number of stages in the model.
 hidden_sizes (`List[int]`, *optional*, defaults to [96, 192, 384, 768]):
 Dimensionality (hidden size) at each stage.

@@ -35,9 +35,9 @@ class ConvNextV2Config(BackboneConfigMixin, PretrainedConfig):
 Args:
 num_channels (`int`, *optional*, defaults to 3):
 The number of input channels.
-patch_size (`int`, optional, defaults to 4):
+patch_size (`int`, *optional*, defaults to 4):
 Patch size to use in the patch embedding layer.
-num_stages (`int`, optional, defaults to 4):
+num_stages (`int`, *optional*, defaults to 4):
 The number of stages in the model.
 hidden_sizes (`List[int]`, *optional*, defaults to `[96, 192, 384, 768]`):
 Dimensionality (hidden size) at each stage.

@@ -175,7 +175,7 @@ class TFConvNextV2Layer(keras.layers.Layer):
 Model configuration class.
 dim (`int`):
 Number of input channels.
-drop_path (`float`, defaults to 0.0):
+drop_path (`float`, *optional*, defaults to 0.0):
 Stochastic depth rate.
 """

@@ -1077,7 +1077,7 @@ class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
 Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
 config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
 loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to *{}*):
 Used to hide legacy arguments that have been deprecated.
 """
 return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -37,8 +37,8 @@ class DbrxAttentionConfig(PretrainedConfig):
 The dropout probability for the attention layers.
 clip_qkv (`float`, *optional*):
 If set, clip the queries, keys, and values in the attention layer to this value.
-kv_n_heads (`Optional[int]`, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
-rope_theta (`float`, defaults to 10000.0): The base frequency for rope.
+kv_n_heads (`int`, *optional*, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
+rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
 """

 def __init__(

@@ -92,11 +92,11 @@ class DbrxFFNConfig(PretrainedConfig):
 ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN.
 The dict should have a key 'name' with the value being the name of the activation function along with
 any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
-ffn_hidden_size (`int`, defaults to 3584): The hidden size of the feedforward network.
-moe_num_experts (`int`, defaults to 4): The number of experts in the mixture of experts layer.
-moe_top_k (`int`, defaults to 1): The number of experts to use in the mixture of experts layer.
+ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
+moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture of experts layer.
+moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture of experts layer.
 moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer.
-moe_loss_weight (`float`, defaults to 0.01): The loss weight for the mixture of experts layer.
+moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture of experts layer.
 moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
 """

@@ -144,7 +144,7 @@ def load_balancing_loss_func(
 Number of experts.
 top_k (`int`):
 The number of experts each token is routed to.
-attention_mask (`torch.Tensor`, None):
+attention_mask (`torch.Tensor`, *optional*):
 The attention_mask used in forward function
 shape [batch_size X sequence_length] if not None.

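The auxiliary loss these docstrings describe multiplies, per expert, the fraction of tokens routed to it by its mean router probability. A minimal sketch that ignores the attention-mask handling:

```python
import torch
import torch.nn.functional as F

def load_balancing_loss_sketch(router_logits: torch.Tensor, num_experts: int, top_k: int):
    # router_logits: (num_tokens, num_experts), concatenated over layers
    routing_weights = F.softmax(router_logits, dim=-1)
    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
    expert_mask = F.one_hot(selected_experts, num_experts)       # (tokens, top_k, experts)
    tokens_per_expert = expert_mask.float().mean(dim=0)          # fraction routed per expert
    router_prob_per_expert = routing_weights.mean(dim=0)         # mean router probability
    return num_experts * torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))

logits = torch.randn(128, 8)
print(load_balancing_loss_sketch(logits, num_experts=8, top_k=2))
```
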
@@ -757,16 +757,16 @@ class DbrxBlock(nn.Module):
 Args:
 hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
 position_ids (`torch.LongTensor`): position ids of shape `(batch, seq_len)`
-attention_mask (`torch.Tensor`, optional): attention mask of size (batch_size, sequence_length)
+attention_mask (`torch.Tensor`, *optional*): attention mask of size (batch_size, sequence_length)
 if flash attention is used or (batch_size, 1, query_sequence_length, key_sequence_length)
 if default attention is used.
-past_key_value (`Tuple(torch.Tensor)`, optional): cached past key and value projection states
-output_attentions (`bool`, optional): Whether or not to return the attentions tensors of all
+past_key_value (`Tuple(torch.Tensor)`, *optional*): cached past key and value projection states
+output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all
 attention layers. See `attentions` under returned tensors for more detail.
-output_router_logits (`bool`, optional): Whether or not to return the router logits.
-use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are
+output_router_logits (`bool`, *optional*): Whether or not to return the router logits.
+use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are
 returned and can be used to speed up decoding (see `past_key_values`).
-cache_position (`torch.LongTensor`, optional): position ids of the cache
+cache_position (`torch.LongTensor`, *optional*): position ids of the cache
 """

 # Norm + Attention + Norm

@@ -80,7 +80,7 @@ class DebertaConfig(PretrainedConfig):
 pos_att_type (`List[str]`, *optional*):
 The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
 `["p2c", "c2p"]`.
-layer_norm_eps (`float`, optional, defaults to 1e-12):
+layer_norm_eps (`float`, *optional*, defaults to 1e-12):
 The epsilon used by the layer normalization layers.

 Example:

@@ -602,10 +602,10 @@ class DisentangledSelfAttention(nn.Module):
 sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
 th token.

-output_attentions (`bool`, optional):
+output_attentions (`bool`, *optional*):
 Whether return the attention matrix.

-query_states (`torch.FloatTensor`, optional):
+query_states (`torch.FloatTensor`, *optional*):
 The *Q* state in *Attention(Q,K,V)*.

 relative_pos (`torch.LongTensor`):

@@ -669,10 +669,10 @@ class TFDebertaDisentangledSelfAttention(keras.layers.Layer):
 sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
 th token.

-return_att (`bool`, optional):
+return_att (`bool`, *optional*):
 Whether return the attention matrix.

-query_states (`tf.Tensor`, optional):
+query_states (`tf.Tensor`, *optional*):
 The *Q* state in *Attention(Q,K,V)*.

 relative_pos (`tf.Tensor`):

@@ -80,7 +80,7 @@ class DebertaV2Config(PretrainedConfig):
 pos_att_type (`List[str]`, *optional*):
 The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
 `["p2c", "c2p"]`, `["p2c", "c2p"]`.
-layer_norm_eps (`float`, optional, defaults to 1e-12):
+layer_norm_eps (`float`, *optional*, defaults to 1e-12):
 The epsilon used by the layer normalization layers.

 Example:

@@ -678,10 +678,10 @@ class DisentangledSelfAttention(nn.Module):
 sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
 th token.

-output_attentions (`bool`, optional):
+output_attentions (`bool`, *optional*):
 Whether return the attention matrix.

-query_states (`torch.FloatTensor`, optional):
+query_states (`torch.FloatTensor`, *optional*):
 The *Q* state in *Attention(Q,K,V)*.

 relative_pos (`torch.LongTensor`):

@@ -738,10 +738,10 @@ class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer):
 sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
 th token.

-return_att (`bool`, optional):
+return_att (`bool`, *optional*):
 Whether return the attention matrix.

-query_states (`tf.Tensor`, optional):
+query_states (`tf.Tensor`, *optional*):
 The *Q* state in *Attention(Q,K,V)*.

 relative_pos (`tf.Tensor`):

@@ -1019,7 +1019,7 @@ class ErnieForPreTraining(ErniePreTrainedModel):

 - 0 indicates sequence B is a continuation of sequence A,
 - 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
 Used to hide legacy arguments that have been deprecated.

 Returns:

@@ -993,7 +993,7 @@ class EsmForMaskedLM(EsmPreTrainedModel):
 Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
 config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
 loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
 Used to hide legacy arguments that have been deprecated.
 """
 return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -1232,7 +1232,7 @@ class TFEsmForMaskedLM(TFEsmPreTrainedModel, TFMaskedLanguageModelingLoss):
 Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
 config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
 loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
 Used to hide legacy arguments that have been deprecated.
 """
 return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -229,13 +229,13 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
 Dropout add function

 Args:
-x (`torch.tensor`, *required*):
+x (`torch.tensor`):
 input tensor
-residual (`torch.tensor`, *required*):
+residual (`torch.tensor`):
 residual tensor
-prob (`float`, *required*):
+prob (`float`):
 dropout probability
-training (`bool`, *required*):
+training (`bool`):
 training mode
 """
 out = F.dropout(x, p=prob, training=training)

@@ -315,7 +315,7 @@ class FalconAttention(nn.Module):
 Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv`

 Args:
-fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
+fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

 Returns:
 query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]

@@ -347,7 +347,7 @@ class FalconAttention(nn.Module):
 Merge heads together over the last dimension

 Args:
-x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
+x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

 Returns:
 torch.tensor: [batch_size, seq_length, num_heads * head_dim]

@@ -389,16 +389,16 @@ class FlavaImageCodebookConfig(PretrainedConfig):
 documentation from [`PretrainedConfig`] for more information.

 Args:
-num_groups (`int`, defaults to 4):
+num_groups (`int`, *optional*, defaults to 4):
 Number of groups to be created. This parameter as of now doesn't affect the model and is used for some
 internal calculation and estimations.
-input_channels (`int`, defaults to 3):
+input_channels (`int`, *optional*, defaults to 3):
 Number of channels in the image to be passed.
-num_blocks_per_group (`int`, defaults to 2):
+num_blocks_per_group (`int`, *optional*, defaults to 2):
 Number of conv-based blocks per group.
-hidden_size (`int`, defaults to 256):
+hidden_size (`int`, *optional*, defaults to 256):
 Size of hidden dim for the blocks.
-vocab_size (`int`, defaults to 8192):
+vocab_size (`int`, *optional*, defaults to 8192):
 Size of the output vocabulary for the codebook.
 freeze (`bool`, defaults to `True`):
 Whether to freeze the weights of the model.

@@ -176,7 +176,7 @@ class FlavaForPreTrainingOutput(ModelOutput):
 The output of the [`FlavaTextModel`].
 multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
 The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
-multimodal_masked_output (`BaseModelOutputWithPooling`, returned when `input_ids_masked` and `pixel_values` are present):
+multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
 The output of the [`FlavaMultimodalModel`].

 mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):

@@ -651,7 +651,7 @@ class FNetForPreTraining(FNetPreTrainedModel):

 - 0 indicates sequence B is a continuation of sequence A,
 - 1 indicates sequence B is a random sequence.
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
 Used to hide legacy arguments that have been deprecated.

 Returns:

@@ -501,9 +501,9 @@ class FSMTEncoder(nn.Module):
 BaseModelOutput or Tuple comprised of:

 - **x** (`torch.Tensor`): the last encoder layer's output of shape *(src_len, batch, embed_dim)*
-- **encoder_states** (`Tuple(torch.FloatTensor`)): all intermediate hidden states of shape *(src_len,
+- **encoder_states** (`Tuple(torch.FloatTensor)`): all intermediate hidden states of shape *(src_len,
 batch, embed_dim)*. Only populated if *output_hidden_states:* is True.
-- **all_attentions** (`Tuple(torch.FloatTensor`)): Attention weights for each layer.
+- **all_attentions** (`Tuple(torch.FloatTensor)`): Attention weights for each layer.
 During training might not be of length n_layers because of layer dropout.
 """
 # check attention mask and invert

@@ -839,7 +839,7 @@ PARALLELIZE_DOCSTRING = r"""
 it will evenly distribute blocks across all devices.

 Args:
-device_map (`Dict[int, list]`, optional, defaults to None):
+device_map (`Dict[int, list]`, *optional*):
 A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
 automatically mapped to the first device (for esoteric reasons). That means that the first device should
 have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the

@@ -587,7 +587,7 @@ PARALLELIZE_DOCSTRING = r"""
 across all devices.

 Args:
-device_map (`Dict[int, list]`, optional, defaults to None):
+device_map (`Dict[int, list]`, *optional*):
 A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
 automatically mapped to the first device (for esoteric reasons). That means that the first device should
 have fewer attention modules mapped to it than other devices. For reference, the GPT-J models have the

@@ -892,7 +892,7 @@ class IBertForMaskedLM(IBertPreTrainedModel):
 Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
 config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
 loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
 Used to hide legacy arguments that have been deprecated.
 """
 return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -165,7 +165,7 @@ class IdeficsConfig(PretrainedConfig):
 documentation from [`PretrainedConfig`] for more information.

 Args:
-additional_vocab_size (`int`, *optional`, defaults to 0):
+additional_vocab_size (`int`, *optional*, defaults to 0):
 Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
 are always trainable whereas regular vocab tokens can be frozen or not.
 vocab_size (`int`, *optional*, defaults to 32000):

@@ -97,7 +97,7 @@ def load_balancing_loss_func(
 router_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
 Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
 shape [batch_size X sequence_length, num_experts].
-attention_mask (`torch.Tensor`, None):
+attention_mask (`torch.Tensor`, *optional*):
 The attention_mask used in forward function
 shape [batch_size X sequence_length] if not None.
 num_experts (`int`, *optional*):

@@ -69,7 +69,7 @@ def load_balancing_loss_func(
 gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
 Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
 shape [batch_size X sequence_length, num_experts].
-attention_mask (`torch.Tensor`, None):
+attention_mask (`torch.Tensor`, *optional*):
 The attention_mask used in forward function
 shape [batch_size X sequence_length] if not None.
 num_experts (`int`, *optional*):

@@ -133,7 +133,7 @@ class Kosmos2Processor(ProcessorMixin):
 Args:
 bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
 The bounding bboxes associated to `texts`.
-num_image_tokens (`int`, defaults to 64):
+num_image_tokens (`int`, *optional* defaults to 64):
 The number of (consecutive) places that are used to mark the placeholders to store image information.
 This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
 first_image_token_id (`int`, *optional*):

@@ -79,7 +79,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
 Calculate the number of patches after the preprocessing for images of any resolution.

 Args:
-image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]):
+image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
 The size of the input image in the format (height, width). ?
 grid_pinpoints (`List`):
 A list containing possible resolutions. Each item in the list should be a tuple or list

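A sketch of what this helper computes: pick the grid resolution best matching the input, then count `patch_size` tiles plus one for the base image. The best-resolution selection here is a simplified stand-in for the library's own logic:

```python
def image_size_to_num_patches_sketch(image_size, grid_pinpoints, patch_size: int) -> int:
    height, width = image_size
    # pick the pinpoint resolution with the closest area (simplified stand-in
    # for the real best-resolution selection)
    best_h, best_w = min(
        grid_pinpoints, key=lambda hw: abs(hw[0] * hw[1] - height * width)
    )
    # one patch per (patch_size x patch_size) tile, plus one for the base image
    return (best_h // patch_size) * (best_w // patch_size) + 1

print(image_size_to_num_patches_sketch((640, 480), [(672, 672), (336, 1344)], 336))  # 5
```
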
@@ -85,7 +85,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
 Calculate the number of patches after the preprocessing for images of any resolution.

 Args:
-image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]):
+image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
 The size of the input image in the format (height, width). ?
 grid_pinpoints (`List`):
 A list containing possible resolutions. Each item in the list should be a tuple or list

@@ -1790,7 +1790,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.

         Returns:
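The `-100` convention above means only masked positions contribute to the loss. A minimal sketch of building such labels (the mask token id `50264` is RoBERTa-style and is an assumption here):

```python
import torch

mask_token_id = 50264  # assumed RoBERTa-style <mask> id
input_ids = torch.tensor([[0, 713, 50264, 16, 2270, 2]])
labels = input_ids.clone()
labels[input_ids != mask_token_id] = -100  # ignore everything except masked slots
```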
@@ -1810,7 +1810,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
         encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the
             cross(masked)-attention of the decoder.
-        feature_size_list (`List[torch.Size]` ):
+        feature_size_list (`List[torch.Size]`):
             This is a list containing shapes (height & width) of multi-scale features from the Pixel Decoder.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under
@@ -1049,7 +1049,7 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):

             - 0 indicates sequence B is a continuation of sequence A,
             - 1 indicates sequence B is a random sequence.
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.

         Returns:
@@ -84,7 +84,7 @@ def load_balancing_loss_func(
     gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
         Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
         shape [batch_size X sequence_length, num_experts].
-    attention_mask (`torch.Tensor`, None):
+    attention_mask (`torch.Tensor`, *optional*):
         The attention_mask used in forward function
         shape [batch_size X sequence_length] if not None.
     num_experts (`int`, *optional*):
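A rough sketch of how a load-balancing auxiliary loss of this shape can be computed from the per-layer gate logits (illustrative only; `top_k` and the exact reduction are assumptions, not the library's implementation):

```python
import torch
import torch.nn.functional as F

def aux_loss_sketch(gate_logits, num_experts, top_k=2):
    # gate_logits: tuple of [batch * seq_len, num_experts] tensors, one per layer
    logits = torch.cat(gate_logits, dim=0)
    routing_weights = F.softmax(logits, dim=-1)
    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
    expert_mask = F.one_hot(selected_experts, num_experts).float()
    tokens_per_expert = expert_mask.mean(dim=(0, 1))      # fraction routed to each expert
    router_prob_per_expert = routing_weights.mean(dim=0)  # mean router probability
    return num_experts * torch.sum(tokens_per_expert * router_prob_per_expert)
```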
@@ -67,7 +67,7 @@ PARALLELIZE_DOCSTRING = r"""
     it will evenly distribute blocks across all devices.

     Args:
-        device_map (`Dict[int, list]`, optional, defaults to None):
+        device_map (`Dict[int, list]`, *optional*):
             A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
             automatically mapped to the first device (for esoteric reasons). That means that the first device should
             have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the
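The `device_map` described above maps a device index to the list of block indices it should host. A hypothetical layout for `t5-small` (6 encoder blocks) on two GPUs, keeping device 0 lighter because it also holds the embeddings and LM head:

```python
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")
device_map = {0: [0, 1], 1: [2, 3, 4, 5]}  # device 0 gets fewer blocks
model.parallelize(device_map)  # assumes two visible CUDA devices
```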
@@ -1160,7 +1160,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
         Args:
             outputs ([`OneFormerForUniversalSegmentationOutput`]):
                 The outputs from [`OneFormerForUniversalSegmentationOutput`].
-            task_type (`str`, *optional)*, defaults to "instance"):
+            task_type (`str`, *optional*, defaults to "instance"):
                 The post processing depends on the task token input. If the `task_type` is "panoptic", we need to
                 ignore the stuff predictions.
             is_demo (`bool`, *optional*, defaults to `True`):
@@ -117,7 +117,7 @@ def _preprocess_resize_output_shape(image, output_shape):
     channels is preserved.

     Returns
-        image (`np.ndarray):
+        image (`np.ndarray`):
             The input image, but with additional singleton dimensions appended in the case where `len(output_shape) >
             input.ndim`.
         output_shape (`Tuple`):
@@ -162,7 +162,7 @@ class PatchTSMixerNormLayer(nn.Module):
     """Normalization block

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.
     """
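All of the PatchTSMixer hunks below share this pattern: every sub-module receives the whole `PatchTSMixerConfig` rather than individual keyword arguments. A small sketch (parameter values are arbitrary):

```python
from transformers import PatchTSMixerConfig, PatchTSMixerModel

config = PatchTSMixerConfig(context_length=32, patch_length=8, num_input_channels=3)
model = PatchTSMixerModel(config)  # sub-blocks read everything they need off `config`
```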
@@ -234,7 +234,7 @@ class PatchTSMixerChannelFeatureMixerBlock(nn.Module):
     """This module mixes the features in the channel dimension.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.
     """
@@ -441,7 +441,7 @@ class PatchMixerBlock(nn.Module):
     """This module mixes the patch dimension.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.
     """
@@ -510,7 +510,7 @@ class FeatureMixerBlock(nn.Module):
     """This module mixes the hidden feature dimension.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.

     """
@@ -556,7 +556,7 @@ class PatchTSMixerLayer(nn.Module):
     The `PatchTSMixer` layer that does all three kinds of mixing.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.

     """
@@ -593,7 +593,7 @@ class PatchTSMixerBlock(nn.Module):
     """The main computing framework of the `PatchTSMixer` model.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.
     """
@@ -634,7 +634,8 @@ class PatchTSMixerForPredictionHead(nn.Module):
     """Prediction Head for Forecasting

     Args:
-        config (`PatchTSMixerConfig`, *required*): Configuration.
+        config (`PatchTSMixerConfig`):
+            Configuration.
     """

     def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
@@ -689,8 +690,8 @@ class PatchTSMixerLinearHead(nn.Module):
     """Linear head for Classification and Regression.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
+            Configuration.
     """

     def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
@@ -785,7 +786,7 @@ class PatchTSMixerPretrainHead(nn.Module):
     """Pretraining head.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.
     """
@@ -1189,7 +1190,7 @@ class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel):
     Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.
     """
@@ -1411,7 +1412,7 @@ class PatchTSMixerForPretraining(PatchTSMixerPreTrainedModel):
     `PatchTSMixer` for mask pretraining.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.

     Returns:
@@ -1593,7 +1594,7 @@ class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel):
     `PatchTSMixer` for forecasting application.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.

     Returns:
@@ -1826,7 +1827,7 @@ class PatchTSMixerForTimeSeriesClassification(PatchTSMixerPreTrainedModel):
     `PatchTSMixer` for classification application.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.

     Returns:
@@ -1997,7 +1998,7 @@ class PatchTSMixerForRegression(PatchTSMixerPreTrainedModel):
     `PatchTSMixer` for regression application.

     Args:
-        config (`PatchTSMixerConfig`, *required*):
+        config (`PatchTSMixerConfig`):
             Configuration.

     Returns:
@@ -258,7 +258,7 @@ class PersimmonAttention(nn.Module):
         storage as `fused_qkv`

         Args:
-            fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
+            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

         Returns:
             query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
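The shapes in the hunk above describe splitting a fused projection into separate query/key/value tensors. A self-contained sketch of that reshape, mirroring the documented shapes rather than copying the model code:

```python
import torch

batch_size, seq_length, num_heads, head_dim = 2, 5, 4, 8
fused_qkv = torch.randn(batch_size, seq_length, num_heads * 3 * head_dim)

# expose the per-head (q, k, v) triplet as its own axis, then index it
fused = fused_qkv.view(batch_size, seq_length, num_heads, 3, head_dim)
query, key, value = fused[..., 0, :], fused[..., 1, :], fused[..., 2, :]
print(query.shape)  # torch.Size([2, 5, 4, 8])
```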
@@ -75,7 +75,7 @@ def load_balancing_loss_func(
     gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
         Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
         shape [batch_size X sequence_length, num_experts].
-    attention_mask (`torch.Tensor`, None):
+    attention_mask (`torch.Tensor`, *optional*):
         The attention_mask used in forward function
         shape [batch_size X sequence_length] if not None.
     num_experts (`int`, *optional*):
@@ -792,7 +792,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
         reduce_loss (`bool`, *optional*):
             Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
             operation.
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Legacy dictionary, which is required so that model can use *generate()* function.

         Returns:
@@ -1261,7 +1261,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
         reduce_loss (`bool`, *optional*):
             Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
             operation.
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Legacy dictionary, which is required so that model can use *generate()* function.

         Returns:
@@ -886,7 +886,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss):
         reduce_loss (`bool`, *optional*):
             Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
             operation.
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Legacy dictionary, which is required so that model can use *generate()* function.

         Returns:
@@ -1400,7 +1400,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss):
         reduce_loss (`bool`, *optional*):
             Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
             operation.
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Legacy dictionary, which is required so that model can use *generate()* function.

         Returns:
@@ -1073,7 +1073,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1075,7 +1075,7 @@ class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1151,7 +1151,7 @@ class RoCBertForPreTraining(RoCBertPreTrainedModel):
             ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
             config.vocab_size]`

-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.

         Returns:
@@ -59,7 +59,7 @@ class SegGptEncoderOutput(ModelOutput):
         attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
             Tuple of *torch.FloatTensor* (one for each layer) of shape
             `(batch_size, num_heads, seq_len, seq_len)`.
-        intermediate_hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.intermediate_hidden_state_indices` is set):
+        intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
             Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
             Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
             Additionally, each feature passes through a LayerNorm.
@@ -77,7 +77,7 @@ class SegGptImageSegmentationOutput(ModelOutput):
     Output type of [`SegGptImageSegmentationOutput`].

     Args:
-        loss (`torch.FloatTensor`, `optional`, returned when `labels` is provided):
+        loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
             The loss value.
         pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
             The predicted masks.
@@ -745,10 +745,10 @@ class DisentangledSelfAttention(nn.Module):
             sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
             th token.

-        output_attentions (`bool`, optional):
+        output_attentions (`bool`, *optional*):
             Whether to return the attention matrix.

-        query_states (`torch.FloatTensor`, optional):
+        query_states (`torch.FloatTensor`, *optional*):
             The *Q* state in *Attention(Q,K,V)*.

         relative_pos (`torch.LongTensor`):
@@ -220,7 +220,7 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
             sampling_rate (`int`, *optional*):
                 The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                 `sampling_rate` at the forward call to prevent silent errors.
-            padding_value (`float`, defaults to 0.0):
+            padding_value (`float`, *optional*, defaults to 0.0):
                 The value that is used to fill the padding values / vectors.
         """
@@ -181,7 +181,7 @@ PARALLELIZE_DOCSTRING = r"""
     it will evenly distribute blocks across all devices.

     Args:
-        device_map (`Dict[int, list]`, optional, defaults to None):
+        device_map (`Dict[int, list]`, *optional*):
             A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
             automatically mapped to the first device (for esoteric reasons). That means that the first device should
             have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
@@ -1249,7 +1249,7 @@ class TapasTokenizer(PreTrainedTokenizer):
                 Total number of table columns
             max_length (`int`):
                 Total maximum length.
-            truncation_strategy (`str` or [`TapasTruncationStrategy`]):
+            truncation_strategy (`str` or [`TapasTruncationStrategy]`):
                 Truncation strategy to use. Seeing as this method should only be called when truncating, the only
                 available strategy is the `"drop_rows_to_fit"` strategy.
@@ -833,7 +833,7 @@ class UdopTokenizer(PreTrainedTokenizer):
         </Tip>

         Args:
-            text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
+            text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`):
                 The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                 `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method).
@@ -814,7 +814,7 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
         </Tip>

         Args:
-            text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
+            text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`):
                 The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                 `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method).
@@ -243,7 +243,7 @@ class ViltImageProcessor(BaseImageProcessor):
                 Image to resize.
             size (`Dict[str, int]`):
                 Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
-            size_divisor (`int`, defaults to 32):
+            size_divisor (`int`, *optional*, defaults to 32):
                 The image is resized to a size that is a multiple of this value.
             resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
                 Resampling filter to use when resizing the image.
@@ -182,7 +182,7 @@ def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_size, k_size):
         Relative position embeddings (Lw, num_channels) for width axis.
     q_size (`Tuple[int]`):
         Spatial sequence size of query q with (queries_height, queries_width).
-    k_size (`Tuple[int]`]):
+    k_size (`Tuple[int]`):
         Spatial sequence size of key k with (keys_height, keys_width).

     Returns:
@@ -36,11 +36,11 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
     most of the main methods. Users should refer to this superclass for more information regarding those methods.

     Args:
-        feature_size (`int`, defaults to 1):
+        feature_size (`int`, *optional*, defaults to 1):
             The feature dimension of the extracted features.
-        sampling_rate (`int`, defaults to 16000):
+        sampling_rate (`int`, *optional*, defaults to 16000):
             The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
-        padding_value (`float`, defaults to 0.0):
+        padding_value (`float`, *optional*, defaults to 0.0):
             The value that is used to fill the padding values.
         do_normalize (`bool`, *optional*, defaults to `True`):
             Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
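Tying the defaults above together, a minimal usage sketch (one second of silence stands in for real audio):

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0)
speech = np.zeros(16000, dtype=np.float32)  # 1 s of silence at 16 kHz
inputs = extractor(speech, sampling_rate=16000, return_tensors="np")
print(inputs.input_values.shape)  # (1, 16000)
```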
@@ -166,7 +166,7 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
             sampling_rate (`int`, *optional*):
                 The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                 `sampling_rate` at the forward call to prevent silent errors.
-            padding_value (`float`, defaults to 0.0):
+            padding_value (`float`, *optional*, defaults to 0.0):
         """

         if sampling_rate is not None:
@@ -184,9 +184,9 @@ class Wav2Vec2ConformerConfig(PretrainedConfig):
             If `"rotary"` position embeddings are used, defines the size of the embedding base.
         max_source_positions (`int`, *optional*, defaults to 5000):
             if `"relative"` position embeddings are used, defines the maximum source input positions.
-        conv_depthwise_kernel_size (`int`, defaults to 31):
+        conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
             Kernel size of convolutional depthwise 1D layer in Conformer blocks.
-        conformer_conv_dropout (`float`, defaults to 0.1):
+        conformer_conv_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all convolutional layers in Conformer blocks.

         Example:
@@ -44,16 +44,16 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
     Fourier Transform` which should match pytorch's `torch.stft` equivalent.

     Args:
-        feature_size (`int`, defaults to 80):
+        feature_size (`int`, *optional*, defaults to 80):
             The feature dimension of the extracted features.
-        sampling_rate (`int`, defaults to 16000):
+        sampling_rate (`int`, *optional*, defaults to 16000):
             The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
-        hop_length (`int`, defaults to 160):
+        hop_length (`int`, *optional*, defaults to 160):
             Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
-        chunk_length (`int`, defaults to 30):
+        chunk_length (`int`, *optional*, defaults to 30):
             The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
             sequences.
-        n_fft (`int`, defaults to 400):
+        n_fft (`int`, *optional*, defaults to 400):
             Size of the Fourier transform.
         padding_value (`float`, *optional*, defaults to 0.0):
             Padding value used to pad the audio. Should correspond to silences.
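The Whisper defaults above are mutually consistent: a 30 s chunk at 16 kHz sliced with a hop of 160 samples yields exactly the 3000 log-mel frames that `num_segment_frames` defaults to further below:

```python
sampling_rate, chunk_length, hop_length = 16000, 30, 160

n_samples = chunk_length * sampling_rate  # 480000 samples per padded chunk
n_frames = n_samples // hop_length        # 3000 frames -> num_segment_frames default
print(n_samples, n_frames)
```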
@@ -231,7 +231,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
                 The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                 `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                 pipeline.
-            padding_value (`float`, defaults to 0.0):
+            padding_value (`float`, *optional*, defaults to 0.0):
                 The value that is used to fill the padding values / vectors.
             do_normalize (`bool`, *optional*, defaults to `False`):
                 Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
@@ -1368,7 +1368,7 @@ class WhisperGenerationMixin:
                 priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                 configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                 default values, whose documentation should be checked to parameterize generation.
-            num_segment_frames (`int`, defaults to 3000):
+            num_segment_frames (`int`, *optional*, defaults to 3000):
                 The number of log-mel frames the model expects

         Return:
@@ -565,7 +565,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
         Args:
             token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
-            time_precision (`float`, `optional`, defaults to 0.02):
+            time_precision (`float`, *optional*, defaults to 0.02):
                 The time ratio to convert from token to time.
         """
         offsets = []
@@ -615,7 +615,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
         Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.

         Args:
-            time_precision (`float`, `optional`, defaults to 0.02):
+            time_precision (`float`, *optional*, defaults to 0.02):
                 The time ratio to convert from token to time.
         """
         return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
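The expression in the hunk above enumerates Whisper's 1501 timestamp tokens, covering 0.00-30.00 s in `time_precision` steps; reproduced standalone:

```python
time_precision = 0.02
timestamp_tokens = ["<|%.2f|>" % (i * time_precision) for i in range(1500 + 1)]
print(timestamp_tokens[0], timestamp_tokens[-1])  # <|0.00|> <|30.00|>
```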
@@ -671,7 +671,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
             output_offsets (`bool`, *optional*, defaults to `False`):
                 Whether or not to output the offsets of the tokens. This should only be set if the model predicted
                 timestamps.
-            time_precision (`float`, `optional`, defaults to 0.02):
+            time_precision (`float`, *optional*, defaults to 0.02):
                 The time ratio to convert from token to time.
             decode_with_timestamps (`bool`, *optional*, defaults to `False`):
                 Whether or not to decode with timestamps included in the raw text.
|
@ -207,7 +207,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
Args:
|
Args:
|
||||||
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
|
token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
|
||||||
List of tokenized input ids. Can be obtained using the `__call__` method.
|
List of tokenized input ids. Can be obtained using the `__call__` method.
|
||||||
time_precision (`float`, `optional`, defaults to 0.02):
|
time_precision (`float`, *optional*, defaults to 0.02):
|
||||||
The time ratio to convert from token to time.
|
The time ratio to convert from token to time.
|
||||||
"""
|
"""
|
||||||
offsets = []
|
offsets = []
|
||||||
@@ -258,7 +258,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
         Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.

         Args:
-            time_precision (`float`, `optional`, defaults to 0.02):
+            time_precision (`float`, *optional*, defaults to 0.02):
                 The time ratio to convert from token to time.
         """
         return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
@@ -317,7 +317,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
             output_offsets (`bool`, *optional*, defaults to `False`):
                 Whether or not to output the offsets of the tokens. This should only be set if the model predicted
                 timestamps.
-            time_precision (`float`, `optional`, defaults to 0.02):
+            time_precision (`float`, *optional*, defaults to 0.02):
                 The time ratio to convert from token to time.
             decode_with_timestamps (`bool`, *optional*, defaults to `False`):
                 Whether or not to decode with timestamps included in the raw text.
@@ -1081,7 +1081,7 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1039,7 +1039,7 @@ class XLMRobertaXLForMaskedLM(XLMRobertaXLPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1173,7 +1173,7 @@ class XmodForMaskedLM(XmodPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -647,8 +647,9 @@ class YolosModel(YolosPreTrainedModel):
         Prunes heads of the model.

         Args:
-            heads_to_prune (`dict` of {layer_num: list of heads to prune in this layer}):
-                See base class `PreTrainedModel`.
+            heads_to_prune (`dict`):
+                See base class `PreTrainedModel`. The input dictionary must have the following format: {layer_num:
+                list of heads to prune in this layer}
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
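The `heads_to_prune` format documented in the hunk above maps a layer index to the head indices to drop. An illustrative call (checkpoint name and indices are assumptions):

```python
from transformers import YolosModel

model = YolosModel.from_pretrained("hustvl/yolos-tiny")
# prune heads 0 and 1 in layer 0, and head 2 in layer 2
model.prune_heads({0: [0, 1], 2: [2]})
```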
@@ -218,7 +218,7 @@ def infer_framework_load_model(
     If both frameworks are installed and available for `model`, PyTorch is selected.

     Args:
-        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
+        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
             The model to infer the framework from. If `str`, a checkpoint name.
         config ([`AutoConfig`]):
             The config associated with the model to help using the correct class
@@ -322,7 +322,7 @@ def infer_framework_from_model(
     If both frameworks are installed and available for `model`, PyTorch is selected.

     Args:
-        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
+        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
             The model to infer the framework from. If `str`, a checkpoint name.
         model_classes (dictionary `str` to `type`, *optional*):
             A mapping framework to class.
@@ -349,7 +349,7 @@ def get_framework(model, revision: Optional[str] = None):
     Select framework (TensorFlow or PyTorch) to use.

     Args:
-        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
+        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
             If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
             the model name). If no specific model is provided, defaults to using PyTorch.
     """
@@ -385,7 +385,7 @@ def get_default_model_and_revision(
     Select a default model to use for a given task. Defaults to pytorch if ambiguous.

     Args:
-        targeted_task (`Dict` ):
+        targeted_task (`Dict`):
             Dictionary representing the given task, that should contain default models

         framework (`str`, None)
@@ -22,7 +22,7 @@ logger = logging.get_logger(__name__)
 @add_end_docstrings(
     build_pipeline_init_args(has_tokenizer=True),
     r"""
-        top_k (`int`, defaults to 5):
+        top_k (`int`, *optional*, defaults to 5):
             The number of predictions to return.
         targets (`str` or `List[str]`, *optional*):
             When passed, the model will limit the scores to the passed targets instead of looking up in the whole
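For the `top_k` argument above, a short fill-mask pipeline sketch (the model choice is illustrative):

```python
from transformers import pipeline

unmasker = pipeline("fill-mask", model="distilroberta-base")
for prediction in unmasker("The capital of France is <mask>.", top_k=3):
    print(prediction["token_str"], round(prediction["score"], 3))
```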
@@ -31,7 +31,7 @@ class PipelineIterator(IterableDataset):
     ```

     Arguments:
-        loader (`torch.utils.data.DataLoader` or any iterator):
+        loader (`torch.utils.data.DataLoader` or `Iterable`):
             The iterator that will be used to apply `infer` on.
         infer (any function):
             The function to apply to each element of `loader`.
@@ -163,7 +163,7 @@ class PipelineChunkIterator(PipelineIterator):
     ```

     Arguments:
-        loader (`torch.utils.data.DataLoader` or any iterator):
+        loader (`torch.utils.data.DataLoader` or `Iterable`):
             The iterator that will be used to apply `infer` on.
         infer (any function):
             The function to apply to each element of `loader`.
@@ -224,7 +224,7 @@ class PipelinePackIterator(PipelineIterator):
     ```

     Arguments:
-        loader (`torch.utils.data.DataLoader` or any iterator):
+        loader (`torch.utils.data.DataLoader` or `Iterable`):
             The iterator that will be used to apply `infer` on.
         infer (any function):
             The function to apply to each element of `loader`.
@@ -3200,7 +3200,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         </Tip>

         Args:
-            text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
+            text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`):
                 The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                 `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method).
@@ -745,7 +745,7 @@ class Trainer:
         Add a callback to the current list of [`~transformers.TrainerCallback`].

         Args:
-            callback (`type` or [`~transformers.TrainerCallback`]):
+            callback (`type` or [`~transformers.TrainerCallback]`):
                 A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
                 first case, will instantiate a member of that class.
         """
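Both forms documented above are accepted: a `TrainerCallback` subclass (which the `Trainer` instantiates) or an already-configured instance. A sketch, assuming `trainer` was built elsewhere:

```python
from transformers import EarlyStoppingCallback

trainer.add_callback(EarlyStoppingCallback)  # pass the class: Trainer instantiates it
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))  # or an instance
```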
@@ -758,7 +758,7 @@ class Trainer:
         If the callback is not found, returns `None` (and no error is raised).

         Args:
-            callback (`type` or [`~transformers.TrainerCallback`]):
+            callback (`type` or [`~transformers.TrainerCallback]`):
                 A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
                 first case, will pop the first member of that class found in the list of callbacks.
@@ -772,7 +772,7 @@ class Trainer:
         Remove a callback from the current list of [`~transformers.TrainerCallback`].

         Args:
-            callback (`type` or [`~transformers.TrainerCallback`]):
+            callback (`type` or [`~transformers.TrainerCallback]`):
                 A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
                 first case, will remove the first member of that class found in the list of callbacks.
         """
|
@ -80,7 +80,7 @@ class Seq2SeqTrainer(Trainer):
|
|||||||
Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments.
|
Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
gen_config_arg (`str` or [`~generation.GenerationConfig`]):
|
gen_config_arg (`str` or [`~generation.GenerationConfig]`):
|
||||||
`Seq2SeqTrainingArguments.generation_config` argument.
|
`Seq2SeqTrainingArguments.generation_config` argument.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
@@ -1605,7 +1605,7 @@ def direct_transformers_import(path: str, file="__init__.py") -> ModuleType:

     Args:
         path (`str`): The path to the source file
-        file (`str`, optional): The file to join with the path. Defaults to "__init__.py".
+        file (`str`, *optional*): The file to join with the path. Defaults to "__init__.py".

     Returns:
         `ModuleType`: The resulting imported module