torchcodec in docstrings and testing utils

Quentin Lhoest 2025-07-01 18:28:34 +02:00 committed by ydshieh
parent d20ab64120
commit 986698a6a7
38 changed files with 216 additions and 175 deletions

@@ -64,15 +64,15 @@ predicted token ids.
 >>> import torch
 >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
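For orientation, the pattern these doctest edits converge on looks like this end to end. This is a minimal sketch, not code from the diff: the placeholder audio path, the mono averaging, and the final generation step are illustrative assumptions.

```python
from torchcodec.decoders import AudioDecoder
from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel

model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")

# AudioDecoder replaces soundfile.read: get_all_samples() decodes the whole file,
# and .data holds the waveform as a torch.Tensor of shape (num_channels, num_samples).
samples = AudioDecoder("speech.wav").get_all_samples()  # "speech.wav" is a placeholder path
waveform = samples.data.mean(dim=0).numpy()  # average to mono; the model expects a 1-D 16 kHz waveform

inputs = processor(waveform, sampling_rate=samples.sample_rate, return_tensors="pt")
generated_ids = model.generate(inputs.input_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))
```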

@@ -436,9 +436,10 @@ class ASTModel(ASTPreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
 Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
 loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
+the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`~ASTFeatureExtractor.__call__`]
 """
 output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 output_hidden_states = (
@@ -526,9 +527,10 @@ class ASTForAudioClassification(ASTPreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
 Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
 loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
+the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`~ASTFeatureExtractor.__call__`]
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the audio classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
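Where these docstrings say the raw waveform should be prepared with [`AutoFeatureExtractor`], the intended flow is roughly the following. A hedged sketch: the checkpoint is a public AST checkpoint chosen for illustration, the file path is a placeholder, and the clip is assumed to already be at the extractor's 16 kHz rate.

```python
import torch
from torchcodec.decoders import AudioDecoder
from transformers import ASTForAudioClassification, AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

# Decode the raw waveform with torchcodec (the soundfile path works the same way).
samples = AudioDecoder("clip.wav").get_all_samples()
waveform = samples.data.mean(dim=0).numpy()  # 1-D mono array, as the docstring describes

# The feature extractor handles mel extraction, padding, and tensor conversion.
inputs = feature_extractor(waveform, sampling_rate=samples.sample_rate, return_tensors="pt")
with torch.no_grad():
    logits = model(input_values=inputs.input_values).logits
print(model.config.id2label[logits.argmax(-1).item()])
```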

@@ -1053,9 +1053,10 @@ class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`Data2VecAudioProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1167,9 +1168,10 @@ class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`Data2VecAudioProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1349,9 +1351,10 @@ class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`Data2VecAudioProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
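A practical note on these `input_values` docstrings: `soundfile.read` returns a 1-D array for mono files, while `AudioDecoder(...).get_all_samples().data` is always 2-D, shaped `(num_channels, num_samples)`. A small hedged helper (the name and the channel-averaging policy are illustrative) that normalizes either source to the 1-D mono waveform the processors expect:

```python
import torch
from torchcodec.decoders import AudioDecoder

def load_mono_waveform(path: str) -> torch.Tensor:
    """Decode an audio file and return a 1-D mono waveform tensor.

    Assumption: downstream processors (e.g. Data2VecAudioProcessor) expect a
    single channel, so multi-channel audio is averaged across channels here.
    """
    waveform = AudioDecoder(path).get_all_samples().data  # (num_channels, num_samples)
    return waveform.mean(dim=0) if waveform.shape[0] > 1 else waveform.squeeze(0)
```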

@@ -985,15 +985,15 @@ class HubertModel(HubertPreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, HubertModel
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
@@ -1261,9 +1261,10 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`HubertProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -1459,15 +1459,15 @@ class TFHubertModel(TFHubertPreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, TFHubertModel
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
@@ -1571,15 +1571,15 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
 >>> import tensorflow as tf
 >>> from transformers import AutoProcessor, TFHubertForCTC
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
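Note that `decoder.get_all_samples().data` is a torch tensor even in these TensorFlow doctests, so the decoded audio is converted downstream. A hedged sketch of the full TF path (the placeholder path, mono averaging, and numpy conversion are assumptions, not part of the diff):

```python
import tensorflow as tf
from torchcodec.decoders import AudioDecoder
from transformers import AutoProcessor, TFHubertForCTC

processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")

samples = AudioDecoder("speech.wav").get_all_samples()  # placeholder path
waveform = samples.data.mean(dim=0).numpy()  # torch.Tensor -> numpy for the TF pipeline

# The checkpoint expects 16 kHz input; decode or resample accordingly.
inputs = processor(waveform, sampling_rate=16_000, return_tensors="tf")
logits = model(inputs.input_values).logits
predicted_ids = tf.argmax(logits, axis=-1)
print(processor.batch_decode(predicted_ids))
```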

@@ -239,15 +239,15 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, HubertModel
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

@@ -563,7 +563,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
 and conversion into a tensor of type `torch.FloatTensor`.
 attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1028,7 +1029,8 @@ class MoonshineModel(MoonshinePreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
 and conversion into a tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1204,7 +1206,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
 and conversion into a tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
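A hedged transcription sketch for the Moonshine docstrings above (the checkpoint is a public one chosen for illustration; the path is a placeholder):

```python
from torchcodec.decoders import AudioDecoder
from transformers import AutoProcessor, MoonshineForConditionalGeneration

processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

# Decode to a 1-D mono waveform; Moonshine consumes the raw 16 kHz waveform directly.
waveform = AudioDecoder("speech.wav").get_all_samples().data.mean(dim=0).numpy()

inputs = processor(waveform, sampling_rate=16_000, return_tensors="pt")
generated_ids = model.generate(**inputs)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```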

@@ -587,7 +587,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
 and conversion into a tensor of type `torch.FloatTensor`.
 attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -844,7 +845,8 @@ class MoonshineModel(WhisperModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
 and conversion into a tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1004,7 +1006,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding
 and conversion into a tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

@@ -797,7 +797,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
 Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
 and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
 feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -1789,9 +1790,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
 Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
 loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`~WhisperFeatureExtractor.__call__`]
 pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`, *optional*):
 The tensors corresponding to the input videos. Pixel values can be obtained using
 [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses

@@ -1782,7 +1782,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
 Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
 and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
 feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -2236,9 +2237,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
 Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
 loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`~WhisperFeatureExtractor.__call__`]
 pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`, *optional*):
 The tensors corresponding to the input videos. Pixel values can be obtained using
 [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses

@@ -360,7 +360,8 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
 Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
 and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
 attention_mask (`torch.Tensor`, *optional*):
@@ -740,9 +741,10 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
 Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
 loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`~WhisperFeatureExtractor.__call__`]
 feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
 Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
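For these multimodal models, `input_features` and `feature_attention_mask` both come out of the processor rather than being built by hand. A hedged sketch of that preparation (the checkpoint, prompt format, and `audios=` keyword reflect the public Qwen2-Audio examples but are assumptions here, not part of the diff):

```python
from torchcodec.decoders import AudioDecoder
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")

samples = AudioDecoder("question.wav").get_all_samples()  # placeholder path
audio = samples.data.mean(dim=0).numpy()

# The processor extracts Whisper-style mel features and builds the padding mask,
# returning both `input_features` and `feature_attention_mask`.
inputs = processor(
    text="<|audio_bos|><|AUDIO|><|audio_eos|>Transcribe this.",
    audios=audio,
    sampling_rate=samples.sample_rate,
    return_tensors="pt",
)
generated_ids = model.generate(**inputs, max_new_tokens=64)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))
```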

@@ -1077,9 +1077,10 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`SEWProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -1597,9 +1597,10 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`SEWDProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`SEWDProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -86,8 +86,9 @@ SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
 Args:
 inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
 Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
-or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
-library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
+or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip
+install torchcodec`) or the soundfile library (`pip install soundfile`).
+To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
 [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
 `torch.FloatTensor`.
 attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):

@@ -339,8 +339,9 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
 r"""
 inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
 Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
-or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
-library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
+or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip
+install torchcodec`) or the soundfile library (`pip install soundfile`).
+To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
 [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
 `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):

@@ -620,7 +620,8 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, sequence_length, feature_size)`):
 Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
 padding and conversion into a tensor of type `torch.FloatTensor`. See
 [`~Speech2TextFeatureExtractor.__call__`]

@@ -848,7 +848,8 @@ class TFSpeech2TextEncoder(keras.layers.Layer):
 input_features (`tf.Tensor` of shape `(batch_size, sequence_length, feature_size)`):
 Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+the soundfile library (`pip install soundfile`). To prepare the array into
 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
 padding and conversion into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`]
 attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1469,7 +1470,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
 >>> import tensorflow as tf
 >>> from transformers import Speech2TextProcessor, TFSpeech2TextForConditionalGeneration
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> model = TFSpeech2TextForConditionalGeneration.from_pretrained(
 ...     "facebook/s2t-small-librispeech-asr", from_pt=True
@@ -1478,8 +1479,8 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

@@ -1486,9 +1486,10 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`UniSpeechProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -1481,9 +1481,10 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`UniSpeechSatProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1595,9 +1596,10 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`UniSpeechSatProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1777,9 +1779,10 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`UniSpeechSatProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -1064,15 +1064,15 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
 ```python
 >>> from transformers import AutoProcessor, FlaxWav2Vec2Model
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-lv60")
 >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
@@ -1183,15 +1183,15 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
 >>> import jax.numpy as jnp
 >>> from transformers import AutoProcessor, FlaxWav2Vec2ForCTC
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
 >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
@@ -1384,15 +1384,15 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
 >>> from transformers import AutoFeatureExtractor, FlaxWav2Vec2ForPreTraining
 >>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60")
 >>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
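These Flax doctests also end up with a torch tensor in `batch["speech"]`, while Flax models consume numpy arrays. A hedged sketch of the full Flax path (the placeholder path, mono averaging, and `.numpy()` conversion are assumptions about how the doctest is meant to be completed):

```python
from torchcodec.decoders import AudioDecoder
from transformers import AutoProcessor, FlaxWav2Vec2Model

processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-lv60")
model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")

samples = AudioDecoder("utterance.flac").get_all_samples()  # placeholder path
speech = samples.data.mean(dim=0).numpy()  # torch.Tensor -> np.ndarray for Flax

inputs = processor(speech, sampling_rate=16_000, return_tensors="np")
last_hidden_state = model(**inputs).last_hidden_state
```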

@@ -1530,15 +1530,15 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, TFWav2Vec2Model
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
 >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
@@ -1642,15 +1642,15 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
 >>> import tensorflow as tf
 >>> from transformers import AutoProcessor, TFWav2Vec2ForCTC
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
 >>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

@@ -2012,9 +2012,10 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
 Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
+into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+See [`Wav2Vec2Processor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -2126,9 +2127,10 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel):
r""" r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2Processor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
@ -2308,9 +2310,10 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel):
r""" r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2Processor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if


@ -1579,9 +1579,10 @@ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedMode
r""" r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2ConformerProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
@ -1681,9 +1682,10 @@ class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedMo
r""" r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2ConformerProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
@ -1851,9 +1853,10 @@ class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
r""" r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2ConformerProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if


@ -1328,9 +1328,10 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel):
r""" r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
@ -1442,9 +1443,10 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
r""" r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
@ -1624,9 +1626,10 @@ class WavLMForXVector(WavLMPreTrainedModel):
r""" r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
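As a usage note for the x-vector head whose docstring changes above, a hedged sketch; the checkpoint name is an assumption and the input is a dummy signal:

    # Hedged sketch: speaker embeddings from WavLMForXVector.
    import torch
    from transformers import AutoFeatureExtractor, WavLMForXVector

    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sv")  # assumed checkpoint
    model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-plus-sv")
    waveform = torch.randn(16000)  # dummy 1-second mono waveform at 16 kHz
    inputs = feature_extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        embeddings = model(**inputs).embeddings  # one embedding vector per input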


@ -102,9 +102,10 @@ WHISPER_INPUTS_DOCSTRING = r"""
input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`): input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
[`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting
tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`] the features, padding and conversion into a tensor of type `numpy.ndarray`.
See [`~WhisperFeatureExtractor.__call__`]
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Whisper does not support masking of the `input_features`; this argument is preserved for compatibility, but Whisper does not support masking of the `input_features`; this argument is preserved for compatibility, but
is not used. By default the silence in the input log mel spectrogram is ignored. is not used. By default the silence in the input log mel spectrogram is ignored.
@ -139,9 +140,10 @@ WHISPER_ENCODE_INPUTS_DOCSTRING = r"""
input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`): input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
[`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting
tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]. the mel features, padding and conversion into a tensor of type `numpy.ndarray`.
See [`~WhisperFeatureExtractor.__call__`].
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Whisper does not support masking of the `input_features`; this argument is preserved for compatibility, but Whisper does not support masking of the `input_features`; this argument is preserved for compatibility, but
is not used. By default the silence in the input log mel spectrogram is ignored. is not used. By default the silence in the input log mel spectrogram is ignored.
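The Whisper docstrings above all point to the feature extractor for building `input_features`; a minimal sketch of that step (example checkpoint, dummy input):

    # Minimal sketch of the documented feature-extraction step for Whisper.
    import numpy as np
    from transformers import WhisperFeatureExtractor

    feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")  # example checkpoint
    waveform = np.zeros(16000, dtype=np.float32)  # dummy 1-second signal at 16 kHz
    inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="np")
    input_features = inputs.input_features  # shape (batch_size, feature_size, sequence_length)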


@ -601,9 +601,10 @@ WHISPER_INPUTS_DOCSTRING = r"""
input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`): input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.*
via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the via the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
[`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`] fbank features, padding and conversion into a tensor of type `tf.Tensor`.
See [`~WhisperFeatureExtractor.__call__`]
decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary. Indices of decoder input sequence tokens in the vocabulary.
@ -729,7 +730,8 @@ class TFWhisperEncoder(keras.layers.Layer):
input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`): input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features, `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
padding and conversion into a tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`] padding and conversion into a tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):


@ -651,7 +651,8 @@ class WhisperEncoder(WhisperPreTrainedModel):
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
attention_mask (`torch.Tensor`, *optional*): attention_mask (`torch.Tensor`, *optional*):
@ -1096,9 +1097,10 @@ class WhisperModel(WhisperPreTrainedModel):
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary. Indices of decoder input sequence tokens in the vocabulary.
@ -1266,9 +1268,10 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedM
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary. Indices of decoder input sequence tokens in the vocabulary.
@ -1600,9 +1603,10 @@ class WhisperForAudioClassification(WhisperPreTrainedModel):
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if


@ -130,7 +130,6 @@ from .utils import (
is_scipy_available, is_scipy_available,
is_sentencepiece_available, is_sentencepiece_available,
is_seqio_available, is_seqio_available,
is_soundfile_available,
is_spacy_available, is_spacy_available,
is_speech_available, is_speech_available,
is_spqr_available, is_spqr_available,
@ -656,7 +655,7 @@ def require_torchcodec(test_case):
These tests are skipped when Torchcodec isn't installed. These tests are skipped when Torchcodec isn't installed.
""" """
return unittest.skipUnless(is_torchcodec_available(), "test requires Torchvision")(test_case) return unittest.skipUnless(is_torchcodec_available(), "test requires Torchcodec")(test_case)
def require_torch_or_tf(test_case): def require_torch_or_tf(test_case):
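For reference, a hypothetical test class showing how the corrected decorator is meant to be applied; the class and test names are illustrative:

    # Hypothetical usage of require_torchcodec; the test is skipped when
    # torchcodec is not installed, so the import inside the test body is safe.
    import unittest
    from transformers.testing_utils import require_torchcodec, slow

    @require_torchcodec
    @slow
    class ExampleAudioIntegrationTest(unittest.TestCase):
        def test_decode(self):
            from torchcodec.decoders import AudioDecoder
            ...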
@ -1268,16 +1267,6 @@ def require_clearml(test_case):
return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case) return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case)
def require_soundfile(test_case):
"""
Decorator marking a test that requires soundfile
These tests are skipped when soundfile isn't installed.
"""
return unittest.skipUnless(is_soundfile_available(), "test requires soundfile")(test_case)
def require_deepspeed(test_case): def require_deepspeed(test_case):
""" """
Decorator marking a test that requires deepspeed Decorator marking a test that requires deepspeed


@ -248,9 +248,10 @@ class ModelArgs:
input_values = { input_values = {
"description": """ "description": """
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
conversion into a tensor of type `torch.FloatTensor`. See [`{processor_class}.__call__`] for details. the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`{processor_class}.__call__`] for details.
""", """,
"shape": "of shape `(batch_size, sequence_length)`", "shape": "of shape `(batch_size, sequence_length)`",
} }
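Since the description above is a template, a small sketch of how it might be specialized for one model; the processor name is an example value:

    # Illustrative: filling the {processor_class} placeholder for one model.
    description = "See [`{processor_class}.__call__`] for details."
    print(description.format(processor_class="Wav2Vec2Processor"))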


@ -216,6 +216,7 @@ _torchaudio_available = _is_package_available("torchaudio")
_torchao_available, _torchao_version = _is_package_available("torchao", return_version=True) _torchao_available, _torchao_version = _is_package_available("torchao", return_version=True)
_torchdistx_available = _is_package_available("torchdistx") _torchdistx_available = _is_package_available("torchdistx")
_torchvision_available, _torchvision_version = _is_package_available("torchvision", return_version=True) _torchvision_available, _torchvision_version = _is_package_available("torchvision", return_version=True)
_torchcodec_available, _torchcodec_version = _is_package_available("torchcodec", return_version=True)
_mlx_available = _is_package_available("mlx") _mlx_available = _is_package_available("mlx")
_num2words_available = _is_package_available("num2words") _num2words_available = _is_package_available("num2words")
_hqq_available, _hqq_version = _is_package_available("hqq", return_version=True) _hqq_available, _hqq_version = _is_package_available("hqq", return_version=True)
@ -457,6 +458,10 @@ def is_torchvision_available():
return _torchvision_available return _torchvision_available
def is_torchcodec_available():
return _torchcodec_available
def is_torchvision_v2_available(): def is_torchvision_v2_available():
if not is_torchvision_available(): if not is_torchvision_available():
return False return False
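The new availability check follows the existing `_is_package_available` pattern; below is a minimal sketch of that pattern (not the exact implementation), assuming the package is probed via importlib:

    # Minimal sketch (not the exact transformers implementation) of the
    # availability-check pattern: probe importability once at import time,
    # optionally reading the installed version from distribution metadata.
    import importlib.metadata
    import importlib.util

    def _is_package_available_sketch(pkg_name, return_version=False):
        found = importlib.util.find_spec(pkg_name) is not None
        version = "N/A"
        if found and return_version:
            try:
                version = importlib.metadata.version(pkg_name)
            except importlib.metadata.PackageNotFoundError:
                found = False
        return (found, version) if return_version else found

    _torchcodec_available, _torchcodec_version = _is_package_available_sketch("torchcodec", return_version=True)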


@ -21,7 +21,7 @@ from datasets import load_dataset
from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
from transformers import Data2VecAudioConfig, is_torch_available from transformers import Data2VecAudioConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init from ...test_modeling_common import ModelTesterMixin, _config_zero_init
@ -656,7 +656,7 @@ class Data2VecAudioUtilsTest(unittest.TestCase):
@require_torch @require_torch
@require_soundfile @require_torchcodec
@slow @slow
class Data2VecAudioModelIntegrationTest(unittest.TestCase): class Data2VecAudioModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):


@ -22,7 +22,7 @@ import unittest
import pytest import pytest
from transformers import HubertConfig, is_torch_available from transformers import HubertConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ( from ...test_modeling_common import (
@ -750,7 +750,7 @@ class HubertUtilsTest(unittest.TestCase):
@require_torch @require_torch
@require_soundfile @require_torchcodec
@slow @slow
class HubertModelIntegrationTest(unittest.TestCase): class HubertModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):


@ -33,13 +33,13 @@ from transformers import (
from transformers.testing_utils import ( from transformers.testing_utils import (
Expectations, Expectations,
cleanup, cleanup,
require_soundfile,
require_torch, require_torch,
require_torch_large_accelerator, require_torch_large_accelerator,
require_torchcodec,
slow, slow,
torch_device, torch_device,
) )
from transformers.utils import is_soundfile_available from transformers.utils import is_torchcodec_available
from ...generation.test_utils import GenerationTesterMixin from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
@ -54,8 +54,8 @@ if is_vision_available():
from PIL import Image from PIL import Image
if is_soundfile_available(): if is_torchcodec_available():
import soundfile import torchcodec
class Phi4MultimodalModelTester: class Phi4MultimodalModelTester:
@ -300,7 +300,8 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
tmp.write(requests.get(self.audio_url, stream=True).raw.data) tmp.write(requests.get(self.audio_url, stream=True).raw.data)
tmp.flush() tmp.flush()
tmp.seek(0) tmp.seek(0)
self.audio, self.sampling_rate = soundfile.read(tmp.name) samples = torchcodec.decoders.AudioDecoder(tmp.name).get_all_samples()
self.audio, self.sampling_rate = samples.data, samples.sample_rate
cleanup(torch_device, gc_collect=True) cleanup(torch_device, gc_collect=True)
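The temp-file indirection above could plausibly be avoided if `AudioDecoder` also accepts raw bytes as a source; that overload is an assumption here, and the URL is a placeholder:

    # Assumption: AudioDecoder can take raw bytes directly as its source.
    import requests
    from torchcodec.decoders import AudioDecoder

    raw = requests.get("https://example.com/sample.flac").content  # placeholder URL
    samples = AudioDecoder(raw).get_all_samples()
    audio, sampling_rate = samples.data, samples.sample_rate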
@ -378,7 +379,7 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
self.assertEqual(response, EXPECTED_RESPONSE) self.assertEqual(response, EXPECTED_RESPONSE)
@require_soundfile @require_torchcodec
def test_audio_text_generation(self): def test_audio_text_generation(self):
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
self.checkpoint_path, revision=self.revision, torch_dtype=torch.float16, device_map=torch_device self.checkpoint_path, revision=self.revision, torch_dtype=torch.float16, device_map=torch_device


@ -19,7 +19,7 @@ import unittest
import pytest import pytest
from transformers import SEWConfig, is_torch_available from transformers import SEWConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ( from ...test_modeling_common import (
@ -453,7 +453,7 @@ class SEWUtilsTest(unittest.TestCase):
@require_torch @require_torch
@require_soundfile @require_torchcodec
@slow @slow
class SEWModelIntegrationTest(unittest.TestCase): class SEWModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):


@ -19,7 +19,7 @@ import unittest
import pytest import pytest
from transformers import SEWDConfig, is_torch_available from transformers import SEWDConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ( from ...test_modeling_common import (
@ -464,7 +464,7 @@ class SEWDUtilsTest(unittest.TestCase):
@require_torch @require_torch
@require_soundfile @require_torchcodec
@slow @slow
class SEWDModelIntegrationTest(unittest.TestCase): class SEWDModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):


@ -21,7 +21,7 @@ import pytest
from datasets import load_dataset from datasets import load_dataset
from transformers import UniSpeechConfig, is_torch_available from transformers import UniSpeechConfig, is_torch_available
from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device from transformers.testing_utils import is_flaky, require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ( from ...test_modeling_common import (
@ -553,7 +553,7 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T
@require_torch @require_torch
@require_soundfile @require_torchcodec
@slow @slow
class UniSpeechModelIntegrationTest(unittest.TestCase): class UniSpeechModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):


@ -21,7 +21,7 @@ import pytest
from datasets import load_dataset from datasets import load_dataset
from transformers import UniSpeechSatConfig, is_torch_available from transformers import UniSpeechSatConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ( from ...test_modeling_common import (
@ -807,7 +807,7 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase):
@require_torch @require_torch
@require_soundfile @require_torchcodec
@slow @slow
class UniSpeechSatModelIntegrationTest(unittest.TestCase): class UniSpeechSatModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples): def _load_datasamples(self, num_samples):


@ -34,10 +34,10 @@ from transformers.testing_utils import (
is_torchaudio_available, is_torchaudio_available,
require_flash_attn, require_flash_attn,
require_pyctcdecode, require_pyctcdecode,
require_soundfile,
require_torch, require_torch,
require_torch_gpu, require_torch_gpu,
require_torchaudio, require_torchaudio,
require_torchcodec,
run_test_in_subprocess, run_test_in_subprocess,
slow, slow,
torch_device, torch_device,
@ -1444,7 +1444,7 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
@require_torch @require_torch
@require_soundfile @require_torchcodec
@slow @slow
class Wav2Vec2ModelIntegrationTest(unittest.TestCase): class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
def tearDown(self): def tearDown(self):