mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-30 17:52:35 +06:00
Fix some docs what layerdrop does (#23691)
* Fix some docs what layerdrop does

* Update src/transformers/models/data2vec/configuration_data2vec_audio.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Fix more docs

---------

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 357f281ba2
commit 003a0cf8cc
@@ -253,7 +253,7 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
- loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+ loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
@@ -678,7 +678,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more information.

Args:
- prefix_allowed_tokens_fn: (`Callable[[int, torch.Tensor], List[int]]`):
+ prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`):
This function constrains the beam search to allowed tokens only at each step. This function takes 2
arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the
next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID
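To illustrate the signature documented above, here is a minimal, hypothetical `prefix_allowed_tokens_fn`; the token IDs and vocabulary size are made up for the example:

```python
import torch
from typing import List

FORCED_PREFIX = [101, 2054, 2003]  # hypothetical token IDs from some tokenizer
VOCAB_SIZE = 30522                 # assumed vocabulary size

def prefix_allowed_tokens_fn(batch_id: int, input_ids: torch.Tensor) -> List[int]:
    # While decoding is still inside the forced prefix, allow only the next
    # prefix token; afterwards, allow any token in the vocabulary.
    step = input_ids.shape[-1]
    if step < len(FORCED_PREFIX):
        return [FORCED_PREFIX[step]]
    return list(range(VOCAB_SIZE))
```

Passed via `model.generate(..., prefix_allowed_tokens_fn=prefix_allowed_tokens_fn)`, this constrains every beam to begin with the forced prefix.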
@@ -1522,7 +1522,7 @@ class Seq2SeqTSModelOutput(ModelOutput):
scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
Scaling values of each time series' context window which is used to give the model inputs of the same
magnitude and then used to rescale back to the original magnitude.
- static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+ static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
Static features of each time series in a batch which are copied to the covariates at inference time.
"""
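A minimal sketch of the scaling described above, assuming mean-absolute scaling and illustrative shapes (none of these names come from the diff):

```python
import torch

batch_size, context_length, prediction_length = 8, 24, 12
past_values = torch.rand(batch_size, context_length)

# Normalize each series' context window to a common magnitude ...
scale = past_values.abs().mean(dim=1, keepdim=True)  # (batch_size, 1)
model_inputs = past_values / scale

# ... and rescale the model's raw output back to the original magnitude.
raw_predictions = torch.rand(batch_size, prediction_length)  # stand-in for decoder output
predictions = raw_predictions * scale
```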
@@ -1593,7 +1593,7 @@ class Seq2SeqTSPredictionOutput(ModelOutput):
scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
Scaling values of each time series' context window which is used to give the model inputs of the same
magnitude and then used to rescale back to the original magnitude.
- static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+ static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
Static features of each time series in a batch which are copied to the covariates at inference time.
"""
@@ -912,7 +912,7 @@ class ModuleUtilsMixin:
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
num_hidden_layers (`int`):
The number of hidden layers in the model.
- is_attention_chunked: (`bool`, *optional*, defaults to `False`):
+ is_attention_chunked (`bool`, *optional*, defaults to `False`):
Whether or not the attention scores are computed by chunks.

Returns:
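For context, a short sketch of the head-mask convention this docstring describes (`get_head_mask` is the method being documented; the sizes are illustrative):

```python
import torch

num_hidden_layers, num_attention_heads = 12, 12

# 1.0 keeps a head, 0.0 discards it; here we drop head 0 in every layer.
head_mask = torch.ones(num_hidden_layers, num_attention_heads)
head_mask[:, 0] = 0.0

# model.get_head_mask(head_mask, num_hidden_layers) would broadcast this to a
# shape each attention layer can multiply its attention probabilities with.
```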
@@ -184,7 +184,7 @@ class AlignVisionConfig(PretrainedConfig):
List of output channel sizes to be used in each block for convolutional layers.
depthwise_padding (`List[int]`, *optional*, defaults to `[]`):
List of block indices with square padding.
- strides: (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
+ strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
List of stride sizes to be used in each block for convolutional layers.
num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
List of the number of times each block is to be repeated.
@@ -613,7 +613,7 @@ class BlipTextModel(BlipTextPreTrainedModel):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
- device: (`torch.device`):
+ device (`torch.device`):
The device of the input to the model.

Returns:
@@ -633,7 +633,7 @@ class TFBlipTextModel(TFBlipTextPreTrainedModel):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
- is_decoder: (`bool`):
+ is_decoder (`bool`):
Whether the model is used as a decoder.

Returns:
@@ -1059,7 +1059,7 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
- device: (`torch.device`):
+ device (`torch.device`):
The device of the input to the model.

Returns:
@@ -256,7 +256,7 @@ class BloomAttention(nn.Module):
Merge heads together over the last dimension

Args:
- x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
+ x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]

Returns:
torch.tensor: [batch_size, seq_length, num_heads * head_dim]
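The shape contract above can be made concrete with a small standalone sketch of the merge (an illustration of the transformation, not the module's actual code):

```python
import torch

def merge_heads(x: torch.Tensor, num_heads: int) -> torch.Tensor:
    """[batch_size * num_heads, seq_length, head_dim] -> [batch_size, seq_length, num_heads * head_dim]"""
    bnh, seq_length, head_dim = x.shape
    batch_size = bnh // num_heads
    x = x.view(batch_size, num_heads, seq_length, head_dim)
    x = x.permute(0, 2, 1, 3)  # bring the head axis next to head_dim
    return x.reshape(batch_size, seq_length, num_heads * head_dim)

out = merge_heads(torch.rand(2 * 8, 5, 64), num_heads=8)
assert out.shape == (2, 5, 8 * 64)
```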
@@ -62,6 +62,9 @@ class Data2VecAudioConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`Data2VecAudioForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1):
+ The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+ details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
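Since this commit is about documenting `layerdrop`, a minimal sketch of what LayerDrop does may help: during training each transformer layer is skipped with probability `layerdrop`, which regularizes the network and lets layers be pruned at inference. This is a sketch only; the library's audio encoders implement it internally:

```python
import torch

def encoder_forward(hidden_states, layers, layerdrop=0.1, training=True):
    # LayerDrop (https://arxiv.org/abs/1909.11556): stochastically skip whole
    # layers during training; at inference every layer runs.
    for layer in layers:
        if training and torch.rand(()).item() < layerdrop:
            continue  # drop this layer for the current forward pass
        hidden_states = layer(hidden_states)
    return hidden_states
```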
@@ -77,7 +77,7 @@ class DeformableDetrConfig(PretrainedConfig):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
init_xavier_std (`float`, *optional*, defaults to 1):
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
- encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`):
@@ -71,7 +71,7 @@ class DetaConfig(PretrainedConfig):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
init_xavier_std (`float`, *optional*, defaults to 1):
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
- encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`):
@@ -60,7 +60,7 @@ class EfficientNetConfig(PretrainedConfig):
List of output channel sizes to be used in each block for convolutional layers.
depthwise_padding (`List[int]`, *optional*, defaults to `[]`):
List of block indices with square padding.
- strides: (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
+ strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
List of stride sizes to be used in each block for convolutional layers.
num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
List of the number of times each block is to be repeated.
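The per-block lists above line up index by index; a hypothetical override (values chosen for illustration) would look like:

```python
from transformers import EfficientNetConfig

# Each list has one entry per block; entries at the same index describe the same block.
config = EfficientNetConfig(
    strides=[1, 2, 2, 2, 1, 2, 1],            # stride used by each block
    num_block_repeats=[1, 2, 2, 3, 3, 4, 1],  # how many times each block repeats
)
```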
@@ -62,6 +62,9 @@ class HubertConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1):
+ The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+ details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -111,7 +111,7 @@ class LxmertForQuestionAnsweringOutput(ModelOutput):
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
- question_answering_score: (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
+ question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
@@ -153,10 +153,10 @@ class LxmertForPreTrainingOutput(ModelOutput):
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- cross_relationship_score: (`torch.FloatTensor` of shape `(batch_size, 2)`):
+ cross_relationship_score (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
- question_answering_score: (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
+ question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
@@ -828,12 +828,12 @@ LXMERT_INPUTS_DOCSTRING = r"""
[`PreTrainedTokenizer.__call__`] for details.

[What are input IDs?](../glossary#input-ids)
- visual_feats: (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
+ visual_feats (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
This input represents visual features. They are ROI-pooled object features from bounding boxes extracted
with a faster-RCNN model.

These are currently not provided by the transformers library.
- visual_pos: (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`):
+ visual_pos (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`):
This input represents spatial features corresponding to their relative (via index) visual features. The
pre-trained LXMERT model expects these spatial features to be normalized bounding boxes on a scale of 0 to
1.
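To make the two visual inputs concrete, here are illustrative tensors; the 36 regions, 2048-d features, and 4-d boxes are assumptions for the example, not values from this diff:

```python
import torch

batch_size, num_visual_features = 2, 36
visual_feats = torch.rand(batch_size, num_visual_features, 2048)  # ROI-pooled region features
visual_pos = torch.rand(batch_size, num_visual_features, 4)       # boxes normalized to [0, 1]
```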
@@ -1171,7 +1171,7 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
- obj_labels: (`Dict[Str: Tuple[Torch.FloatTensor, Torch.FloatTensor]]`, *optional*):
+ obj_labels (`Dict[Str: Tuple[Torch.FloatTensor, Torch.FloatTensor]]`, *optional*):
Each key is named after one of the visual losses, and each element of the tuple is of the shape
`(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for the label id and
the label score respectively
@@ -1398,7 +1398,7 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[LxmertForQuestionAnsweringOutput, Tuple[torch.FloatTensor]]:
r"""
- labels: (`Torch.Tensor` of shape `(batch_size)`, *optional*):
+ labels (`Torch.Tensor` of shape `(batch_size)`, *optional*):
A one-hot representation of the correct answer
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -111,10 +111,10 @@ class TFLxmertForPreTrainingOutput(ModelOutput):
(classification) loss.
prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- cross_relationship_score: (`tf.Tensor` of shape `(batch_size, 2)`):
+ cross_relationship_score (`tf.Tensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
- question_answering_score: (`tf.Tensor` of shape `(batch_size, n_qa_answers)`):
+ question_answering_score (`tf.Tensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
@@ -873,12 +873,12 @@ LXMERT_INPUTS_DOCSTRING = r"""
[`PreTrainedTokenizer.encode`] for details.

[What are input IDs?](../glossary#input-ids)
- visual_feats: (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
+ visual_feats (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
This input represents visual features. They are ROI-pooled object features from bounding boxes extracted
with a faster-RCNN model.

These are currently not provided by the transformers library.
- visual_pos: (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
+ visual_pos (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
This input represents spatial features corresponding to their relative (via index) visual features. The
pre-trained LXMERT model expects these spatial features to be normalized bounding boxes on a scale of 0 to
1.
@@ -1297,7 +1297,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
- obj_labels: (`Dict[Str: Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to `None`):
+ obj_labels (`Dict[Str: Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to `None`):
Each key is named after one of the visual losses, and each element of the tuple is of the shape
`(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for the label id and
the label score respectively
@@ -1767,7 +1767,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
of the predicted mask for each query, instead of attending to the full feature map.

Args:
- config: (`Mask2FormerConfig`):
+ config (`Mask2FormerConfig`):
Configuration used to instantiate Mask2FormerMaskedAttentionDecoder.
"""
@@ -2003,7 +2003,7 @@ class Mask2FormerMaskPredictor(nn.Module):
The feature dimension of the Mask2FormerMaskedAttentionDecoder
num_heads (`int`):
The number of heads used in the Mask2FormerMaskedAttentionDecoder
- mask_feature_size: (`torch.Tensor`):
+ mask_feature_size (`torch.Tensor`):
one of the output dimensions of the predicted masks for each query
"""
super().__init__()
@@ -119,7 +119,7 @@ class MPNetTokenizer(PreTrainedTokenizer):

This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
- strip_accents: (`bool`, *optional*):
+ strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
"""
@@ -98,7 +98,7 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
issue](https://github.com/huggingface/transformers/issues/328)).
- strip_accents: (`bool`, *optional*):
+ strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
"""
@@ -67,7 +67,7 @@ class OPTConfig(PretrainedConfig):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- layerdrop: (`float`, *optional*, defaults to 0.0):
+ layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
details.
init_std (`float`, *optional*, defaults to 0.02):
@@ -70,10 +70,10 @@ class PegasusXConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
- decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
use_cache (`bool`, *optional*, defaults to `True`):
@@ -1430,7 +1430,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
- prefix_allowed_tokens_fn: (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
+ prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
If provided, this function constrains the beam search to allowed tokens only at each step. If not
provided, no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID
`batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on
@@ -573,10 +573,10 @@ class RagRetriever:
Retrieves documents for specified `question_hidden_states`.

Args:
- question_input_ids: (`List[List[int]]`) batch of input ids
+ question_input_ids (`List[List[int]]`) batch of input ids
question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
A batch of query vectors to retrieve with.
- prefix: (`str`, *optional*):
+ prefix (`str`, *optional*):
The prefix used by the generator's tokenizer.
n_docs (`int`, *optional*):
The number of docs retrieved per query.
@@ -726,7 +726,7 @@ class RealmReaderOutput(ModelOutput):
The index of the retrieved span candidates in which the predicted answer is most likely.
start_pos (`torch.IntTensor` of shape `()`):
Predicted answer starting position in *RealmReader*'s inputs.
- end_pos: (`torch.IntTensor` of shape `()`):
+ end_pos (`torch.IntTensor` of shape `()`):
Predicted answer ending position in *RealmReader*'s inputs.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
@@ -63,6 +63,9 @@ class SEWConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`SEWForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1):
+ The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+ details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -65,6 +65,9 @@ class UniSpeechConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`UniSpeechForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1):
+ The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+ details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -66,6 +66,9 @@ class UniSpeechSatConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`UniSpeechSatForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1):
+ The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+ details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -63,6 +63,9 @@ class Wav2Vec2Config(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1):
+ The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+ details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -65,6 +65,9 @@ class Wav2Vec2ConformerConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`Wav2Vec2ConformerForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1):
+ The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+ details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -62,6 +62,9 @@ class WavLMConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`WavLMForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1):
+ The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+ details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -117,9 +117,9 @@ def create_optimizer(
The beta2 to use in Adam.
adam_epsilon (`float`, *optional*, defaults to 1e-8):
The epsilon to use in Adam.
- adam_clipnorm: (`float`, *optional*, defaults to `None`):
+ adam_clipnorm (`float`, *optional*, defaults to `None`):
If not `None`, clip the gradient norm for each weight tensor to this value.
- adam_global_clipnorm: (`float`, *optional*, defaults to `None`)
+ adam_global_clipnorm (`float`, *optional*, defaults to `None`)
If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
weight tensors, as if they were concatenated into a single vector.
weight_decay_rate (`float`, *optional*, defaults to 0):
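A hedged usage sketch of the TF `create_optimizer` helper documented above; the learning rate and step counts are placeholders:

```python
from transformers import create_optimizer

# Returns the optimizer together with its learning-rate schedule.
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=10_000,
    num_warmup_steps=500,
    adam_clipnorm=1.0,  # per-tensor gradient-norm clipping, as described above
)
```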
@@ -119,7 +119,7 @@ def ffmpeg_microphone_live(
The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of
an audio sample but without using that part to actually make the prediction. Setting this does not change
the length of the chunk.
- format_for_conversion: (`str`, defaults to `f32le`)
+ format_for_conversion (`str`, defaults to `f32le`)
The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`; `s16le`
could also be used.
Return:
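A hedged usage sketch for this helper (it requires `ffmpeg` on the system; the parameter values are illustrative):

```python
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

# Stream microphone audio as float32 chunks suitable for an ASR pipeline.
for chunk in ffmpeg_microphone_live(
    sampling_rate=16_000,
    chunk_length_s=5.0,
    stride_length_s=1.0,
    format_for_conversion="f32le",
):
    process(chunk)  # `process` is a placeholder for downstream handling
```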
@@ -514,7 +514,7 @@ class PipelineDataFormat:
Creates an instance of the right subclass of [`~pipelines.PipelineDataFormat`] depending on `format`.

Args:
- format: (`str`):
+ format (`str`):
The format of the desired pipeline. Acceptable values are `"json"`, `"csv"` or `"pipe"`.
output_path (`str`, *optional*):
Where to save the outgoing data.
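For instance, a sketch of the factory call this docstring belongs to (the file names are made up):

```python
from transformers.pipelines import PipelineDataFormat

data_format = PipelineDataFormat.from_str(
    format="json",
    output_path="predictions.json",  # where outgoing data is saved
    input_path="inputs.json",
    column="text",
)
```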
@@ -2093,7 +2093,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):

If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a value
error is raised.
- filename_prefix: (`str`, *optional*):
+ filename_prefix (`str`, *optional*):
A prefix to add to the names of the files saved by the tokenizer.
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
@@ -66,7 +66,7 @@ class TFTrainingArguments(TrainingArguments):
The batch size per GPU/TPU core/CPU for training.
per_device_eval_batch_size (`int`, *optional*, defaults to 8):
The batch size per GPU/TPU core/CPU for evaluation.
- gradient_accumulation_steps: (`int`, *optional*, defaults to 1):
+ gradient_accumulation_steps (`int`, *optional*, defaults to 1):
Number of update steps to accumulate the gradients for, before performing a backward/update pass.

<Tip warning={true}>
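To illustrate what accumulating gradients means, a minimal PyTorch-style sketch (TensorFlow's control flow differs, but the accounting is the same; all names are placeholders):

```python
gradient_accumulation_steps = 4  # one optimizer update every 4 micro-batches

for step, batch in enumerate(dataloader):
    loss = model(batch).loss / gradient_accumulation_steps  # average across micro-batches
    loss.backward()                                          # gradients accumulate in-place
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```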
@@ -107,10 +107,10 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
- decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
use_cache (`bool`, *optional*, defaults to `True`):