torchcodec in docstrings and testing utils

Quentin Lhoest 2025-07-01 18:28:34 +02:00 committed by ydshieh
parent d20ab64120
commit 986698a6a7
38 changed files with 216 additions and 175 deletions

View File

@@ -64,15 +64,15 @@ predicted token ids.
>>> import torch
>>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
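The recurring change in the doctests above and below: `soundfile.read` is replaced by torchcodec's `AudioDecoder`. A minimal, self-contained sketch of the new pattern (the file path is an illustrative placeholder):

```python
# Sketch of the torchcodec loading pattern adopted in these doctests.
from torchcodec.decoders import AudioDecoder

decoder = AudioDecoder("audio.wav")  # hypothetical local file
samples = decoder.get_all_samples()  # AudioSamples with .data and .sample_rate
waveform = samples.data              # torch.Tensor of shape (num_channels, num_samples)
print(samples.sample_rate, waveform.shape)
```

Note that `soundfile.read` returns a NumPy array shaped `(num_samples,)` for mono files, while `get_all_samples().data` is a channels-first `torch.Tensor`, so downstream code expecting a 1-D array may need a `squeeze(0)` and a `.numpy()` conversion.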

View File

@@ -436,9 +436,10 @@ class ASTModel(ASTPreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
Float values of mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~ASTFeatureExtractor.__call__`]
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -526,9 +527,10 @@ class ASTForAudioClassification(ASTPreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
Float values of mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~ASTFeatureExtractor.__call__`]
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the audio classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
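As a worked example of the loading path the AST docstrings describe, here is a hedged sketch; the checkpoint name and file path are assumptions, and note that AST's feature extractor returns the prepared tensor under the `input_values` key:

```python
# Illustrative only: decode with torchcodec, then let the feature
# extractor compute the mel features tensor for AST.
from torchcodec.decoders import AudioDecoder
from transformers import AutoFeatureExtractor

decoder = AudioDecoder("audio.wav")  # hypothetical 16 kHz mono file
samples = decoder.get_all_samples()

feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
inputs = feature_extractor(
    samples.data.squeeze(0).numpy(),    # channels-first tensor -> 1-D mono array
    sampling_rate=samples.sample_rate,  # the extractor validates the sampling rate
    return_tensors="pt",
)
print(inputs["input_values"].shape)  # (1, max_length, num_mel_bins)
```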

View File

@@ -1053,9 +1053,10 @@ class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Data2VecAudioProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1167,9 +1168,10 @@ class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Data2VecAudioProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1349,9 +1351,10 @@ class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Data2VecAudioProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
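The `input_values` docstrings follow the same recipe via [`AutoProcessor`]; a hedged sketch of batching two decoded waveforms with padding (the checkpoint and file paths are assumptions):

```python
# Illustrative: decode two files with torchcodec and let the processor
# pad them into a single `input_values` batch.
from torchcodec.decoders import AudioDecoder
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("facebook/data2vec-audio-base-960h")

waveforms = [
    AudioDecoder(path).get_all_samples().data.squeeze(0).numpy()
    for path in ["a.wav", "b.wav"]  # hypothetical 16 kHz mono files
]
batch = processor(waveforms, sampling_rate=16000, padding=True, return_tensors="pt")
print(batch["input_values"].shape)  # padded to the longer of the two clips
```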

View File

@@ -985,15 +985,15 @@ class HubertModel(HubertPreTrainedModel):
```python
>>> from transformers import AutoProcessor, HubertModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@@ -1261,9 +1261,10 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`HubertProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -1459,15 +1459,15 @@ class TFHubertModel(TFHubertPreTrainedModel):
```python
>>> from transformers import AutoProcessor, TFHubertModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@@ -1571,15 +1571,15 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
>>> import tensorflow as tf
>>> from transformers import AutoProcessor, TFHubertForCTC
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch

View File

@@ -239,15 +239,15 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
```python
>>> from transformers import AutoProcessor, HubertModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch

View File

@@ -563,7 +563,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1028,7 +1029,8 @@ class MoonshineModel(MoonshinePreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1204,7 +1206,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

View File

@@ -587,7 +587,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -844,7 +845,8 @@ class MoonshineModel(WhisperModel):
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1004,7 +1006,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

View File

@@ -797,7 +797,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -1789,9 +1790,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`, *optional*):
The tensors corresponding to the input videos. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses

View File

@@ -1782,7 +1782,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -2236,9 +2237,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`, *optional*):
The tensors corresponding to the input videos. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses

View File

@@ -360,7 +360,8 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
attention_mask (`torch.Tensor`, *optional*):
@@ -740,9 +741,10 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

View File

@@ -1077,9 +1077,10 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`SEWProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -1597,9 +1597,10 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`SEWDProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`SEWDProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -86,8 +86,9 @@ SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip
install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
[`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
`torch.FloatTensor`.
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -339,8 +339,9 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
r"""
inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip
install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
[`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
`torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):

View File

@@ -620,7 +620,8 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, sequence_length, feature_size)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
padding and conversion into a tensor of type `torch.FloatTensor`. See
[`~Speech2TextFeatureExtractor.__call__`]

View File

@@ -848,7 +848,8 @@ class TFSpeech2TextEncoder(keras.layers.Layer):
input_features (`tf.Tensor` of shape `(batch_size, sequence_length, feature_size)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
padding and conversion into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`]
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1469,7 +1470,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
>>> import tensorflow as tf
>>> from transformers import Speech2TextProcessor, TFSpeech2TextForConditionalGeneration
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> model = TFSpeech2TextForConditionalGeneration.from_pretrained(
... "facebook/s2t-small-librispeech-asr", from_pt=True
@@ -1478,8 +1479,8 @@
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch

View File

@@ -1486,9 +1486,10 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`UniSpeechProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -1481,9 +1481,10 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`UniSpeechSatProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1595,9 +1596,10 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`UniSpeechSatProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1777,9 +1779,10 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`UniSpeechSatProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -1064,15 +1064,15 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
```python
>>> from transformers import AutoProcessor, FlaxWav2Vec2Model
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-lv60")
>>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@@ -1183,15 +1183,15 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
>>> import jax.numpy as jnp
>>> from transformers import AutoProcessor, FlaxWav2Vec2ForCTC
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
>>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@@ -1384,15 +1384,15 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
>>> from transformers import AutoFeatureExtractor, FlaxWav2Vec2ForPreTraining
>>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60")
>>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch

View File

@@ -1530,15 +1530,15 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
```python
>>> from transformers import AutoProcessor, TFWav2Vec2Model
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@@ -1642,15 +1642,15 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
>>> import tensorflow as tf
>>> from transformers import AutoProcessor, TFWav2Vec2ForCTC
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch

View File

@@ -2012,9 +2012,10 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2Processor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -2126,9 +2127,10 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2Processor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -2308,9 +2310,10 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2Processor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -1579,9 +1579,10 @@ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedMode
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2ConformerProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1681,9 +1682,10 @@ class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedMo
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2ConformerProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1851,9 +1853,10 @@ class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2ConformerProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -1328,9 +1328,10 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1442,9 +1443,10 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1624,9 +1626,10 @@ class WavLMForXVector(WavLMPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -102,9 +102,10 @@ WHISPER_INPUTS_DOCSTRING = r"""
input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a
tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting
the features, padding and conversion into a tensor of type `numpy.ndarray`.
See [`~WhisperFeatureExtractor.__call__`]
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Whisper does not support masking of the `input_features`; this argument is preserved for compatibility, but
is not used. By default the silence in the input log mel spectrogram is ignored.
@@ -139,9 +140,10 @@ WHISPER_ENCODE_INPUTS_DOCSTRING = r"""
input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`].
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting
the mel features, padding and conversion into a tensor of type `numpy.ndarray`.
See [`~WhisperFeatureExtractor.__call__`].
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Whisper does not support masking of the `input_features`; this argument is preserved for compatibility, but
is not used. By default the silence in the input log mel spectrogram is ignored.

View File

@@ -601,9 +601,10 @@ WHISPER_INPUTS_DOCSTRING = r"""
input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.*
via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a
tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
via the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
fbank features, padding and conversion into a tensor of type `tf.Tensor`.
See [`~WhisperFeatureExtractor.__call__`]
decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
@@ -729,7 +730,8 @@ class TFWhisperEncoder(keras.layers.Layer):
input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
padding and conversion into a tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):

View File

@@ -651,7 +651,8 @@ class WhisperEncoder(WhisperPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
attention_mask (`torch.Tensor`, *optional*):
@@ -1096,9 +1097,10 @@ class WhisperModel(WhisperPreTrainedModel):
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
@@ -1266,9 +1268,10 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedM
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
@@ -1600,9 +1603,10 @@ class WhisperForAudioClassification(WhisperPreTrainedModel):
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -130,7 +130,6 @@ from .utils import (
is_scipy_available,
is_sentencepiece_available,
is_seqio_available,
is_soundfile_available,
is_spacy_available,
is_speech_available,
is_spqr_available,
@ -656,7 +655,7 @@ def require_torchcodec(test_case):
These tests are skipped when Torchcodec isn't installed.
"""
return unittest.skipUnless(is_torchcodec_available(), "test requires Torchvision")(test_case)
return unittest.skipUnless(is_torchcodec_available(), "test requires Torchcodec")(test_case)
def require_torch_or_tf(test_case):
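A short usage sketch of the corrected decorator; the test class below is illustrative, not from the repository:

import unittest

from transformers.testing_utils import require_torchcodec


@require_torchcodec  # now skips with "test requires Torchcodec" when the package is missing
class AudioDecodingTest(unittest.TestCase):
    def test_decoder_importable(self):
        # safe to import here: the decorator guarantees torchcodec is installed
        from torchcodec.decoders import AudioDecoder

        self.assertTrue(callable(AudioDecoder))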
@ -1268,16 +1267,6 @@ def require_clearml(test_case):
return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case)
def require_soundfile(test_case):
"""
Decorator marking a test that requires soundfile
These tests are skipped when soundfile isn't installed.
"""
return unittest.skipUnless(is_soundfile_available(), "test requires soundfile")(test_case)
def require_deepspeed(test_case):
"""
Decorator marking a test that requires deepspeed

View File

@ -248,9 +248,10 @@ class ModelArgs:
input_values = {
"description": """
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`{processor_class}.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`{processor_class}.__call__`] for details.
""",
"shape": "of shape `(batch_size, sequence_length)`",
}
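For comparison with the template above, a hedged sketch of producing `input_values` for a raw-waveform model; the `facebook/wav2vec2-base-960h` checkpoint and file name are illustrative:

>>> from torchcodec.decoders import AudioDecoder
>>> from transformers import AutoProcessor

>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
>>> samples = AudioDecoder("speech.wav", sample_rate=16000).get_all_samples()
>>> # padding and conversion into a torch.FloatTensor happen inside the processor
>>> inputs = processor(samples.data.squeeze(0).numpy(), sampling_rate=16000, return_tensors="pt")
>>> list(inputs.input_values.shape)  # [batch_size, sequence_length]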

View File

@ -216,6 +216,7 @@ _torchaudio_available = _is_package_available("torchaudio")
_torchao_available, _torchao_version = _is_package_available("torchao", return_version=True)
_torchdistx_available = _is_package_available("torchdistx")
_torchvision_available, _torchvision_version = _is_package_available("torchvision", return_version=True)
_torchcodec_available, _torchcodec_version = _is_package_available("torchcodec", return_version=True)
_mlx_available = _is_package_available("mlx")
_num2words_available = _is_package_available("num2words")
_hqq_available, _hqq_version = _is_package_available("hqq", return_version=True)
@ -457,6 +458,10 @@ def is_torchvision_available():
return _torchvision_available
def is_torchcodec_available():
return _torchcodec_available
def is_torchvision_v2_available():
if not is_torchvision_available():
return False
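The new flag follows the same cached-availability pattern as the surrounding backends; a minimal sketch of the guarded-import idiom it supports (the `decode_audio` helper is hypothetical):

from transformers.utils import is_torchcodec_available

if is_torchcodec_available():
    from torchcodec.decoders import AudioDecoder  # imported only when the optional backend is present


def decode_audio(path):
    # hypothetical helper: fail with an actionable message instead of an opaque ImportError
    if not is_torchcodec_available():
        raise ImportError("decode_audio requires torchcodec: `pip install torchcodec`")
    return AudioDecoder(path).get_all_samples()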

View File

@ -21,7 +21,7 @@ from datasets import load_dataset
from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
from transformers import Data2VecAudioConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init
@ -656,7 +656,7 @@ class Data2VecAudioUtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class Data2VecAudioModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):
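For context, a hedged sketch of what a `_load_datasamples` helper typically looks like after this migration, assuming the dataset's `audio` column now yields torchcodec-style decoder objects (the dataset name follows the dummy-LibriSpeech convention these integration tests use):

from datasets import load_dataset


def _load_datasamples(self, num_samples):
    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    # deterministic ordering, then take the first num_samples clips
    speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
    # assumption: each item exposes get_all_samples() like torchcodec's AudioDecoder
    return [x.get_all_samples().data.squeeze(0).numpy() for x in speech_samples]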

View File

@ -22,7 +22,7 @@ import unittest
import pytest
from transformers import HubertConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -750,7 +750,7 @@ class HubertUtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class HubertModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):

View File

@ -33,13 +33,13 @@ from transformers import (
from transformers.testing_utils import (
Expectations,
cleanup,
require_soundfile,
require_torch,
require_torch_large_accelerator,
require_torchcodec,
slow,
torch_device,
)
from transformers.utils import is_soundfile_available
from transformers.utils import is_torchcodec_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
@ -54,8 +54,8 @@ if is_vision_available():
from PIL import Image
if is_soundfile_available():
import soundfile
if is_torchcodec_available():
import torchcodec
class Phi4MultimodalModelTester:
@ -300,7 +300,8 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
tmp.write(requests.get(self.audio_url, stream=True).raw.data)
tmp.flush()
tmp.seek(0)
self.audio, self.sampling_rate = soundfile.read(tmp.name)
samples = torchcodec.decoders.AudioDecoder(tmp.name).get_all_samples()
self.audio, self.sampling_rate = samples.data, samples.sample_rate
cleanup(torch_device, gc_collect=True)
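Compared with the removed `soundfile.read` call, the decoded `AudioSamples` object bundles the tensor with its sample rate, so both values come from one decode; a small sketch (the file name and 16 kHz target are illustrative, and the `sample_rate` resampling argument is assumed available):

from torchcodec.decoders import AudioDecoder

samples = AudioDecoder("clip.flac", sample_rate=16000).get_all_samples()
audio, sampling_rate = samples.data, samples.sample_rate  # (num_channels, num_samples) tensor, int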
@ -378,7 +379,7 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
self.assertEqual(response, EXPECTED_RESPONSE)
@require_soundfile
@require_torchcodec
def test_audio_text_generation(self):
model = AutoModelForCausalLM.from_pretrained(
self.checkpoint_path, revision=self.revision, torch_dtype=torch.float16, device_map=torch_device

View File

@ -19,7 +19,7 @@ import unittest
import pytest
from transformers import SEWConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -453,7 +453,7 @@ class SEWUtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class SEWModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):

View File

@ -19,7 +19,7 @@ import unittest
import pytest
from transformers import SEWDConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -464,7 +464,7 @@ class SEWDUtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class SEWDModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):

View File

@ -21,7 +21,7 @@ import pytest
from datasets import load_dataset
from transformers import UniSpeechConfig, is_torch_available
from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import is_flaky, require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -553,7 +553,7 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T
@require_torch
@require_soundfile
@require_torchcodec
@slow
class UniSpeechModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):

View File

@ -21,7 +21,7 @@ import pytest
from datasets import load_dataset
from transformers import UniSpeechSatConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -807,7 +807,7 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class UniSpeechSatModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):

View File

@ -34,10 +34,10 @@ from transformers.testing_utils import (
is_torchaudio_available,
require_flash_attn,
require_pyctcdecode,
require_soundfile,
require_torch,
require_torch_gpu,
require_torchaudio,
require_torchcodec,
run_test_in_subprocess,
slow,
torch_device,
@ -1444,7 +1444,7 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
def tearDown(self):