diff --git a/docs/source/en/model_doc/speech_to_text_2.md b/docs/source/en/model_doc/speech_to_text_2.md index 8caf774e733..38d3826a6b6 100644 --- a/docs/source/en/model_doc/speech_to_text_2.md +++ b/docs/source/en/model_doc/speech_to_text_2.md @@ -64,15 +64,15 @@ predicted token ids. >>> import torch >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel >>> from datasets import load_dataset ->>> import soundfile as sf +>>> from torchcodec.decoders import AudioDecoder >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de") >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de") >>> def map_to_array(batch): -... speech, _ = sf.read(batch["file"]) -... batch["speech"] = speech +... decoder = AudioDecoder(batch["file"]) +... batch["speech"] = decoder.get_all_samples().data ... return batch diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 602de3ff72b..608a31eec08 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -436,9 +436,10 @@ class ASTModel(ASTPreTrainedModel): input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`): Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`] + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~ASTFeatureExtractor.__call__`] """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -526,9 +527,10 @@ class ASTForAudioClassification(ASTPreTrainedModel): input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`): Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`] + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. 
+ See [`~ASTFeatureExtractor.__call__`] labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the audio classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index e4ddac37541..2d86b5410d3 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -1053,9 +1053,10 @@ class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`Data2VecAudioProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1167,9 +1168,10 @@ class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`Data2VecAudioProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1349,9 +1351,10 @@ class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). 
To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`Data2VecAudioProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 0fab4184bfe..9d6d0bec5f1 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -985,15 +985,15 @@ class HubertModel(HubertPreTrainedModel): ```python >>> from transformers import AutoProcessor, HubertModel >>> from datasets import load_dataset - >>> import soundfile as sf + >>> from torchcodec.decoders import AudioDecoder >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft") >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft") >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech + ... decoder = AudioDecoder(batch["file"]) + ... batch["speech"] = decoder.get_all_samples().data ... return batch @@ -1261,9 +1261,10 @@ class HubertForSequenceClassification(HubertPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`HubertProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index a701252f63e..4913245abce 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -1459,15 +1459,15 @@ class TFHubertModel(TFHubertPreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, TFHubertModel
 >>> from datasets import load_dataset
- >>> import soundfile as sf
+ >>> from torchcodec.decoders import AudioDecoder

 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")

 >>> def map_to_array(batch):
- ...     speech, _ = sf.read(batch["file"])
- ...     batch["speech"] = speech
+ ...     decoder = AudioDecoder(batch["file"])
+ ...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

@@ -1571,15 +1571,15 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
 >>> import tensorflow as tf
 >>> from transformers import AutoProcessor, TFHubertForCTC
 >>> from datasets import load_dataset
- >>> import soundfile as sf
+ >>> from torchcodec.decoders import AudioDecoder

 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")

 >>> def map_to_array(batch):
- ...     speech, _ = sf.read(batch["file"])
- ...     batch["speech"] = speech
+ ...     decoder = AudioDecoder(batch["file"])
+ ...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

diff --git a/src/transformers/models/hubert/modular_hubert.py b/src/transformers/models/hubert/modular_hubert.py
index ec13ad60900..c553f01220e 100644
--- a/src/transformers/models/hubert/modular_hubert.py
+++ b/src/transformers/models/hubert/modular_hubert.py
@@ -239,15 +239,15 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, HubertModel
 >>> from datasets import load_dataset
- >>> import soundfile as sf
+ >>> from torchcodec.decoders import AudioDecoder

 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")

 >>> def map_to_array(batch):
- ...     speech, _ = sf.read(batch["file"])
- ...     batch["speech"] = speech
+ ...     decoder = AudioDecoder(batch["file"])
+ ...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py
index 2909fb386fb..28247413dd3 100644
--- a/src/transformers/models/moonshine/modeling_moonshine.py
+++ b/src/transformers/models/moonshine/modeling_moonshine.py
@@ -563,7 +563,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a
 tensor of type `torch.FloatTensor`.
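[Editor's note] The hunks above all apply the same substitution in the doctest examples: `soundfile.read` is replaced by torchcodec's `AudioDecoder`. A minimal sketch of what the new calls return, not part of the diff: `sample.wav` is a placeholder path, and the channel-averaging at the end is an assumption rather than something the examples do.

```python
# Sketch: what the torchcodec pattern in the hunks above returns.
# Assumes torchcodec is installed and "sample.wav" exists locally.
from torchcodec.decoders import AudioDecoder

decoder = AudioDecoder("sample.wav")
samples = decoder.get_all_samples()

# `samples.data` is a channels-first torch.Tensor of shape
# (num_channels, num_samples), whereas soundfile.read returned a
# 1-D numpy array (for mono files) together with the sample rate.
print(samples.data.shape, samples.sample_rate)

# If a downstream processor expects a 1-D mono waveform, one option
# (an assumption, not shown in the diff) is to average the channels:
mono = samples.data.mean(dim=0)
```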
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1028,7 +1029,8 @@ class MoonshineModel(MoonshinePreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a
 tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1204,7 +1206,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a
 tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py
index 500231f3b48..0a0de82d753 100644
--- a/src/transformers/models/moonshine/modular_moonshine.py
+++ b/src/transformers/models/moonshine/modular_moonshine.py
@@ -587,7 +587,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a
 tensor of type `torch.FloatTensor`.
 attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -844,7 +845,8 @@ class MoonshineModel(WhisperModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a
 tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1004,7 +1006,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
 Float values of the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a
 tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
index c63beb73fac..ac433418193 100644
--- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
@@ -797,7 +797,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
 Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
 and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
 feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -1789,9 +1790,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
 Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained
 by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
- the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
- [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
- tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+ the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+ To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+ mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+ See [`~WhisperFeatureExtractor.__call__`]
 pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*):
 The tensors corresponding to the input videos. Pixel values can be obtained using
 [`AutoImageProcessor`].
See [`SiglipImageProcessor.__call__`] for details ([]`NewTaskModelProcessor`] uses
diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
index 9acc76c9afa..d58412cef37 100644
--- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -1782,7 +1782,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
 Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
 and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
 feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -2236,9 +2237,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
 Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained
 by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
- the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
- [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
- tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+ the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+ To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+ mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+ See [`~WhisperFeatureExtractor.__call__`]
 pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*):
 The tensors corresponding to the input videos. Pixel values can be obtained using
 [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([]`NewTaskModelProcessor`] uses
diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py
index 45fcbe80495..deffdfbee8a 100644
--- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py
+++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py
@@ -360,7 +360,8 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
 Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] attention_mask (`torch.Tensor`)`, *optional*): @@ -740,9 +741,10 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`): Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~WhisperFeatureExtractor.__call__`] feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`): Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index da4a54b39fc..f066fe08f8e 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -1077,9 +1077,10 @@ class SEWForSequenceClassification(SEWPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`SEWProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 5e00ddcd1f3..8fc81c5ea69 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1597,9 +1597,10 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. 
Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`SEWDProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`SEWDProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py index d2d23d29007..7dbabb61710 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py @@ -86,8 +86,9 @@ SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r""" Args: inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*): Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac` - or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile - library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or + or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip + install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index b4582949fe6..93f3810e787 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -339,8 +339,9 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin): r""" inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*): Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac` - or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile - library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or + or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip + install torchcodec`) or the soundfile library (`pip install soundfile`). 
+ To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
 [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
 `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
index aaff8d90fec..c6279da905f 100755
--- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -620,7 +620,8 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, sequence_length, feature_size)`):
 Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
 padding and conversion into a tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`]
diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
index fad6a026b15..a57d223f3c2 100755
--- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -848,7 +848,8 @@ class TFSpeech2TextEncoder(keras.layers.Layer):
 input_features (`tf.Tensor` of shape `(batch_size, sequence_length, feature_size)`):
 Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+ `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+ the soundfile library (`pip install soundfile`). To prepare the array into
 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
 padding and conversion into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`]
 attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1469,7 +1470,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
 >>> import tensorflow as tf
 >>> from transformers import Speech2TextProcessor, TFSpeech2TextForConditionalGeneration
 >>> from datasets import load_dataset
- >>> import soundfile as sf
+ >>> from torchcodec.decoders import AudioDecoder

 >>> model = TFSpeech2TextForConditionalGeneration.from_pretrained(
 ...     "facebook/s2t-small-librispeech-asr", from_pt=True
@@ -1478,8 +1479,8 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus

 >>> def map_to_array(batch):
- ...     speech, _ = sf.read(batch["file"])
- ...     batch["speech"] = speech
+ ...     decoder = AudioDecoder(batch["file"])
+ ...     batch["speech"] = decoder.get_all_samples().data
 ...
return batch diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 373b25b4e1a..55a4f064680 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1486,9 +1486,10 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`UniSpeechProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 0ce8a7c8154..faa9acf071c 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1481,9 +1481,10 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`UniSpeechSatProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1595,9 +1596,10 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). 
To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`UniSpeechSatProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1777,9 +1779,10 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`UniSpeechSatProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py index ca5eb700ee8..e80cd014c7b 100644 --- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py @@ -1064,15 +1064,15 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """ ```python >>> from transformers import AutoProcessor, FlaxWav2Vec2Model >>> from datasets import load_dataset - >>> import soundfile as sf + >>> from torchcodec.decoders import AudioDecoder >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-lv60") >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60") >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech + ... decoder = AudioDecoder(batch["file"]) + ... batch["speech"] = decoder.get_all_samples().data ... return batch @@ -1183,15 +1183,15 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """ >>> import jax.numpy as jnp >>> from transformers import AutoProcessor, FlaxWav2Vec2ForCTC >>> from datasets import load_dataset - >>> import soundfile as sf + >>> from torchcodec.decoders import AudioDecoder >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-960h-lv60") >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60") >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... 
batch["speech"] = speech + ... decoder = AudioDecoder(batch["file"]) + ... batch["speech"] = decoder.get_all_samples().data ... return batch @@ -1384,15 +1384,15 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """ >>> from transformers import AutoFeatureExtractor, FlaxWav2Vec2ForPreTraining >>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices >>> from datasets import load_dataset - >>> import soundfile as sf + >>> from torchcodec.decoders import AudioDecoder >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60") >>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60") >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech + ... decoder = AudioDecoder(batch["file"]) + ... batch["speech"] = decoder.get_all_samples().data ... return batch diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index 657674c3ff9..cadbccd3d8e 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -1530,15 +1530,15 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel): ```python >>> from transformers import AutoProcessor, TFWav2Vec2Model >>> from datasets import load_dataset - >>> import soundfile as sf + >>> from torchcodec.decoders import AudioDecoder >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech + ... decoder = AudioDecoder(batch["file"]) + ... batch["speech"] = decoder.get_all_samples().data ... return batch @@ -1642,15 +1642,15 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel): >>> import tensorflow as tf >>> from transformers import AutoProcessor, TFWav2Vec2ForCTC >>> from datasets import load_dataset - >>> import soundfile as sf + >>> from torchcodec.decoders import AudioDecoder >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") >>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech + ... decoder = AudioDecoder(batch["file"]) + ... batch["speech"] = decoder.get_all_samples().data ... return batch diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index c7d04dab28f..27daf31200b 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -2012,9 +2012,10 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). 
To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`Wav2Vec2Processor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -2126,9 +2127,10 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`Wav2Vec2Processor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -2308,9 +2310,10 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`Wav2Vec2Processor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index ec88628296c..a31c6542ddf 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1579,9 +1579,10 @@ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedMode r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. 
Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1681,9 +1682,10 @@ class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedMo r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1851,9 +1853,10 @@ class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 5904f05dcbf..06e6461b42d 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -1328,9 +1328,10 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`WavLMProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1442,9 +1443,10 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. + into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`WavLMProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1624,9 +1626,10 @@ class WavLMForXVector(WavLMPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. 
+ into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install + torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, + the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. + See [`WavLMProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index 63b7f718536..a2f6c184ab2 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -102,9 +102,10 @@ WHISPER_INPUTS_DOCSTRING = r""" input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`): Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a - tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`] + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting + the features, padding and conversion into a tensor of type `numpy.ndarray`. + See [`~WhisperFeatureExtractor.__call__`] attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but is not used. By default the silence in the input log mel spectrogram are ignored. @@ -139,9 +140,10 @@ WHISPER_ENCODE_INPUTS_DOCSTRING = r""" input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`): Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]. + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting + the mel features, padding and conversion into a tensor of type `numpy.ndarray`. + See [`~WhisperFeatureExtractor.__call__`]. attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but is not used. By default the silence in the input log mel spectrogram are ignored. 
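[Editor's note] Before the TF and PyTorch Whisper hunks below, a hedged sketch of the loading pattern these docstrings describe: decode with torchcodec, then let a feature extractor produce the mel `input_features`. Not part of the diff; `sample.flac` is a placeholder path, the checkpoint name is illustrative, and the file is assumed to already be mono 16 kHz audio (what Whisper expects).

```python
# Sketch only: torchcodec decoding + Whisper-style feature extraction.
from torchcodec.decoders import AudioDecoder
from transformers import AutoFeatureExtractor

decoder = AudioDecoder("sample.flac")  # placeholder path
samples = decoder.get_all_samples()

feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")

# The extractor handles mel extraction, padding, and tensor conversion,
# exactly as the docstrings above describe.
inputs = feature_extractor(
    samples.data.squeeze(0).numpy(),  # (1, n) -> (n,), assuming mono audio
    sampling_rate=samples.sample_rate,
    return_tensors="pt",
)
print(inputs.input_features.shape)  # (batch_size, feature_size, sequence_length)
```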
diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py
index f5ca846b81e..6f13d57f10f 100644
--- a/src/transformers/models/whisper/modeling_tf_whisper.py
+++ b/src/transformers/models/whisper/modeling_tf_whisper.py
@@ -601,9 +601,10 @@ WHISPER_INPUTS_DOCSTRING = r"""
     input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
         Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
         by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.*
-        via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-        [`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a
-        tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
+        via the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+        To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+        fbank features, padding and conversion into a tensor of type `tf.Tensor`.
+        See [`~WhisperFeatureExtractor.__call__`]

     decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
         Indices of decoder input sequence tokens in the vocabulary.
@@ -729,7 +730,8 @@ class TFWhisperEncoder(keras.layers.Layer):
             input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
                 Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
                 obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+                `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+                the soundfile library (`pip install soundfile`). To prepare the array into
                 `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
                 padding and conversion into a tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
             head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index d3e9c8e03a2..94f802086d4 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -651,7 +651,8 @@ class WhisperEncoder(WhisperPreTrainedModel):
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+                `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features,
                padding and conversion into a tensor of type `torch.FloatTensor`.
                See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
@@ -1096,9 +1097,10 @@ class WhisperModel(WhisperPreTrainedModel):
        input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
            Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
            loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-            the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-            [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-            tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+            the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+            mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+            See [`~WhisperFeatureExtractor.__call__`]
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.
@@ -1266,9 +1268,10 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedM
        input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
            Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
            loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-            the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-            [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-            tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+            the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+            mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+            See [`~WhisperFeatureExtractor.__call__`]
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.
@@ -1600,9 +1603,10 @@ class WhisperForAudioClassification(WhisperPreTrainedModel):
        input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
            Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
            loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-            the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-            [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-            tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+            the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+            mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+            See [`~WhisperFeatureExtractor.__call__`]
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 3b9cb2c5201..5f3c799e197 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -130,7 +130,6 @@ from .utils import (
     is_scipy_available,
     is_sentencepiece_available,
     is_seqio_available,
-    is_soundfile_available,
     is_spacy_available,
     is_speech_available,
     is_spqr_available,
@@ -656,7 +655,7 @@ def require_torchcodec(test_case):

    These tests are skipped when Torchcodec isn't installed.

    """
-    return unittest.skipUnless(is_torchcodec_available(), "test requires Torchvision")(test_case)
+    return unittest.skipUnless(is_torchcodec_available(), "test requires Torchcodec")(test_case)


 def require_torch_or_tf(test_case):
@@ -1268,16 +1267,6 @@ def require_clearml(test_case):
     return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case)


-def require_soundfile(test_case):
-    """
-    Decorator marking a test that requires soundfile
-
-    These tests are skipped when soundfile isn't installed.
-
-    """
-    return unittest.skipUnless(is_soundfile_available(), "test requires soundfile")(test_case)
-
-
 def require_deepspeed(test_case):
     """
     Decorator marking a test that requires deepspeed
diff --git a/src/transformers/utils/args_doc.py b/src/transformers/utils/args_doc.py
index 61f947516ff..e96d7790f7f 100644
--- a/src/transformers/utils/args_doc.py
+++ b/src/transformers/utils/args_doc.py
@@ -248,9 +248,10 @@ class ModelArgs:
     input_values = {
         "description": """
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-            into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-            conversion into a tensor of type `torch.FloatTensor`. See [`{processor_class}.__call__`] for details.
+            into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+            torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+            the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+            See [`{processor_class}.__call__`] for details.
""", "shape": "of shape `(batch_size, sequence_length)`", } diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 9f90f9e17b1..b0a894c9534 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -216,6 +216,7 @@ _torchaudio_available = _is_package_available("torchaudio") _torchao_available, _torchao_version = _is_package_available("torchao", return_version=True) _torchdistx_available = _is_package_available("torchdistx") _torchvision_available, _torchvision_version = _is_package_available("torchvision", return_version=True) +_torchcodec_available, _torchcodec_version = _is_package_available("torchcodec", return_version=True) _mlx_available = _is_package_available("mlx") _num2words_available = _is_package_available("num2words") _hqq_available, _hqq_version = _is_package_available("hqq", return_version=True) @@ -457,6 +458,10 @@ def is_torchvision_available(): return _torchvision_available +def is_torchcodec_available(): + return _torchcodec_available + + def is_torchvision_v2_available(): if not is_torchvision_available(): return False diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index e275b8d681b..630f6238e76 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -21,7 +21,7 @@ from datasets import load_dataset from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask from transformers import Data2VecAudioConfig, is_torch_available -from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, _config_zero_init @@ -656,7 +656,7 @@ class Data2VecAudioUtilsTest(unittest.TestCase): @require_torch -@require_soundfile +@require_torchcodec @slow class Data2VecAudioModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index 905b435bb59..904a04e1f9a 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -22,7 +22,7 @@ import unittest import pytest from transformers import HubertConfig, is_torch_available -from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -750,7 +750,7 @@ class HubertUtilsTest(unittest.TestCase): @require_torch -@require_soundfile +@require_torchcodec @slow class HubertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py index 6ef9aef9626..b154a022036 100644 --- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py +++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py @@ -33,13 +33,13 @@ from transformers import ( from transformers.testing_utils import ( Expectations, cleanup, - require_soundfile, require_torch, require_torch_large_accelerator, + require_torchcodec, slow, torch_device, ) 
-from transformers.utils import is_soundfile_available
+from transformers.utils import is_torchcodec_available

 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -54,8 +54,8 @@ if is_vision_available():
     from PIL import Image


-if is_soundfile_available():
-    import soundfile
+if is_torchcodec_available():
+    import torchcodec


 class Phi4MultimodalModelTester:
@@ -300,7 +300,8 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
         tmp.write(requests.get(self.audio_url, stream=True).raw.data)
         tmp.flush()
         tmp.seek(0)
-        self.audio, self.sampling_rate = soundfile.read(tmp.name)
+        samples = torchcodec.decoders.AudioDecoder(tmp.name).get_all_samples()
+        self.audio, self.sampling_rate = samples.data, samples.sample_rate

         cleanup(torch_device, gc_collect=True)
@@ -378,7 +379,7 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):

         self.assertEqual(response, EXPECTED_RESPONSE)

-    @require_soundfile
+    @require_torchcodec
     def test_audio_text_generation(self):
         model = AutoModelForCausalLM.from_pretrained(
             self.checkpoint_path, revision=self.revision, torch_dtype=torch.float16, device_map=torch_device
diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py
index 6e049b4faba..270f91bdf62 100644
--- a/tests/models/sew/test_modeling_sew.py
+++ b/tests/models/sew/test_modeling_sew.py
@@ -19,7 +19,7 @@ import unittest
 import pytest

 from transformers import SEWConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
@@ -453,7 +453,7 @@ class SEWUtilsTest(unittest.TestCase):

 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class SEWModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py
index 4df373e8391..86064250b8f 100644
--- a/tests/models/sew_d/test_modeling_sew_d.py
+++ b/tests/models/sew_d/test_modeling_sew_d.py
@@ -19,7 +19,7 @@ import unittest
 import pytest

 from transformers import SEWDConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
@@ -464,7 +464,7 @@ class SEWDUtilsTest(unittest.TestCase):

 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class SEWDModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py
index 37da494a965..00614bca7c8 100644
--- a/tests/models/unispeech/test_modeling_unispeech.py
+++ b/tests/models/unispeech/test_modeling_unispeech.py
@@ -21,7 +21,7 @@ import pytest
 from datasets import load_dataset

 from transformers import UniSpeechConfig, is_torch_available
-from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import is_flaky, require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
@@ -553,7 +553,7 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T

 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class UniSpeechModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
index 1b6a1cb8042..2c5001fbbc5 100644
--- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
+++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py
@@ -21,7 +21,7 @@ import pytest
 from datasets import load_dataset

 from transformers import UniSpeechSatConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
@@ -807,7 +807,7 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase):

 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class UniSpeechSatModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py
index cea2801f095..560a8af6d9c 100644
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -34,10 +34,10 @@ from transformers.testing_utils import (
     is_torchaudio_available,
     require_flash_attn,
     require_pyctcdecode,
-    require_soundfile,
     require_torch,
     require_torch_gpu,
     require_torchaudio,
+    require_torchcodec,
     run_test_in_subprocess,
     slow,
     torch_device,
@@ -1444,7 +1444,7 @@ class Wav2Vec2UtilsTest(unittest.TestCase):

 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
     def tearDown(self):
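Taken together with the `testing_utils` and `import_utils` hunks above, the gating pattern the migrated tests rely on looks roughly like this; the test class below is an illustrative sketch of how `require_torchcodec` composes with the other decorators, not a test added by this diff:

```python
# Hedged sketch of the availability gating wired up above; this test class
# is hypothetical and does not exist in the diff.
import unittest

from transformers.testing_utils import require_torch, require_torchcodec, slow


@require_torch
@require_torchcodec  # unittest.skipUnless(is_torchcodec_available(), "test requires Torchcodec")
@slow
class ExampleAudioModelIntegrationTest(unittest.TestCase):
    def test_inference(self):
        # Runs only when both torch and torchcodec are installed; otherwise
        # the whole class is skipped instead of failing at import time.
        ...
```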