diff --git a/docs/source/en/model_doc/speech_to_text_2.md b/docs/source/en/model_doc/speech_to_text_2.md
index 38d3826a6b6..708f13a72b6 100644
--- a/docs/source/en/model_doc/speech_to_text_2.md
+++ b/docs/source/en/model_doc/speech_to_text_2.md
@@ -72,7 +72,7 @@ predicted token ids.

 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch

diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py
index 9eb8140103b..a40c21932f8 100644
--- a/src/transformers/models/clvp/modeling_clvp.py
+++ b/src/transformers/models/clvp/modeling_clvp.py
@@ -1653,14 +1653,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin):
         >>> text = "This is an example text."

         >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        >>> audio = ds.sort("id")["audio"][0]
+        >>> audio_sample, sr = audio["array"], audio["sampling_rate"]

         >>> # Define processor and model
         >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
         >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")

         >>> # Generate processor output and model output
-        >>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
+        >>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt")
         >>> speech_embeds = model.get_speech_features(
         ...     input_ids=processor_output["input_ids"], input_features=processor_output["input_features"]
         ... )
@@ -1732,14 +1733,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin):

         >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        >>> audio = ds.sort("id")["audio"][0]
+        >>> audio_sample, sr = audio["array"], audio["sampling_rate"]

         >>> # Define processor and model
         >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
         >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")

         >>> # processor outputs and model outputs
-        >>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
+        >>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt")
         >>> outputs = model(
         ...     input_ids=processor_output["input_ids"],
         ...     input_features=processor_output["input_features"],
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index 9d6d0bec5f1..f2f03ef6ab3 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -993,7 +993,7 @@ class HubertModel(HubertPreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch
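Note on the recurring `map_to_array` change above and in the hunks that follow: torchcodec's `AudioDecoder` returns samples as a `(num_channels, num_samples)` tensor, so taking the mean over axis 0 downmixes any channel layout to the 1-D mono waveform the feature extractors expect. A minimal sketch of the pattern, assuming a local `sample.wav` (the filename is illustrative):

```python
# Downmix a decoded file to mono, as the map_to_array hunks do.
import torch
from torchcodec.decoders import AudioDecoder

decoder = AudioDecoder("sample.wav")
samples = decoder.get_all_samples().data  # shape: (num_channels, num_samples)
mono = torch.mean(samples, axis=0)        # shape: (num_samples,)
print(samples.shape, mono.shape)
```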
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index 4913245abce..0b2aae8d80c 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -1467,7 +1467,7 @@ class TFHubertModel(TFHubertPreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

@@ -1579,7 +1579,7 @@ class TFHubertForCTC(TFHubertPreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

diff --git a/src/transformers/models/hubert/modular_hubert.py b/src/transformers/models/hubert/modular_hubert.py
index c553f01220e..12f022deacc 100644
--- a/src/transformers/models/hubert/modular_hubert.py
+++ b/src/transformers/models/hubert/modular_hubert.py
@@ -237,6 +237,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
         Example:

         ```python
+        >>> import torch
         >>> from transformers import AutoProcessor, HubertModel
         >>> from datasets import load_dataset
         >>> from torchcodec.decoders import AudioDecoder
@@ -247,7 +248,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
index a57d223f3c2..a1830fb0862 100755
--- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -1480,7 +1480,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCausalLanguageModelingLoss):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
index e80cd014c7b..0e74c7e7c96 100644
--- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -1072,7 +1072,7 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """

     >>> def map_to_array(batch):
     ...     decoder = AudioDecoder(batch["file"])
-    ...     batch["speech"] = decoder.get_all_samples().data
+    ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
     ...     return batch

@@ -1191,7 +1191,7 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """

     >>> def map_to_array(batch):
     ...     decoder = AudioDecoder(batch["file"])
-    ...     batch["speech"] = decoder.get_all_samples().data
+    ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
     ...     return batch

@@ -1392,7 +1392,7 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """

     >>> def map_to_array(batch):
     ...     decoder = AudioDecoder(batch["file"])
-    ...     batch["speech"] = decoder.get_all_samples().data
+    ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
     ...     return batch
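The `modular_hubert.py` hunk also adds `>>> import torch` to the doctest, since the snippet now calls `torch.mean`. For reference, a hedged end-to-end version of the fixed doctest pattern (it assumes the dummy dataset exposes a `"file"` column of local audio paths, as the doctests do):

```python
import torch
from datasets import load_dataset
from torchcodec.decoders import AudioDecoder

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

def map_to_array(batch):
    # Decode the file and downmix to mono before feeding the processor.
    decoder = AudioDecoder(batch["file"])
    batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
    return batch

ds = ds.map(map_to_array)
```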
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index cadbccd3d8e..4ac1943ce80 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -1538,7 +1538,7 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

@@ -1650,7 +1650,7 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
index 77c2ec6cd3f..b0ee0066fb0 100644
--- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
+++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
@@ -154,7 +154,7 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py
index b2ccb501713..e349e081199 100644
--- a/tests/models/clap/test_feature_extraction_clap.py
+++ b/tests/models/clap/test_feature_extraction_clap.py
@@ -165,7 +165,7 @@ class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/clvp/test_feature_extraction_clvp.py b/tests/models/clvp/test_feature_extraction_clvp.py
index 78bf38e2df5..409e5100058 100644
--- a/tests/models/clvp/test_feature_extraction_clvp.py
+++ b/tests/models/clvp/test_feature_extraction_clvp.py
@@ -215,7 +215,7 @@ class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", Audio(sampling_rate=22050))
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
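The test-suite change that repeats from here on is a pure simplification: slicing a sorted `Dataset` with `[:num_samples]` already restricts the result to the first `num_samples` rows, so the preceding `.select(range(num_samples))` was a redundant round-trip. A quick sketch:

```python
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
num_samples = 2

old_style = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]  # before
new_style = ds.sort("id")[:num_samples]["audio"]                             # after
assert len(old_style) == len(new_style) == num_samples
```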
diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py
index 60c165fbbe8..a33d787dc7c 100644
--- a/tests/models/clvp/test_modeling_clvp.py
+++ b/tests/models/clvp/test_modeling_clvp.py
@@ -373,10 +373,12 @@ class ClvpModelForConditionalGenerationTester:
         ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        audio = ds.sort("id")[0]["audio"]
+        audio_sample = audio["array"]
+        sr = audio["sampling_rate"]

         feature_extractor = ClvpFeatureExtractor()
-        input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="pt")[
+        input_features = feature_extractor(raw_speech=audio_sample, sampling_rate=sr, return_tensors="pt")[
             "input_features"
         ].to(torch_device)

@@ -562,7 +564,8 @@ class ClvpIntegrationTest(unittest.TestCase):
         self.text = "This is an example text."
         ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        audio = ds.sort("id")["audio"][0]
+        self.speech_samples, self.sr = audio["array"], audio["sampling_rate"]

         self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device)
         self.model.eval()
diff --git a/tests/models/dac/test_feature_extraction_dac.py b/tests/models/dac/test_feature_extraction_dac.py
index 13d72326078..c995485d331 100644
--- a/tests/models/dac/test_feature_extraction_dac.py
+++ b/tests/models/dac/test_feature_extraction_dac.py
@@ -143,7 +143,7 @@ class DacFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        audio_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in audio_samples]

diff --git a/tests/models/dia/test_feature_extraction_dia.py b/tests/models/dia/test_feature_extraction_dia.py
index 6243dc47919..9a6f797d534 100644
--- a/tests/models/dia/test_feature_extraction_dia.py
+++ b/tests/models/dia/test_feature_extraction_dia.py
@@ -145,7 +145,7 @@ class DiaFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        audio_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in audio_samples]
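The CLVP hunks above replace positional unpacking of `dict.values()` with explicit key access. A decoded `datasets` Audio column is a dict keyed by `"path"`, `"array"`, and `"sampling_rate"`, and `_, audio, sr = ...values()` silently breaks if the key order ever changes; indexing by key does not. A short sketch of the new pattern:

```python
from datasets import Audio, load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=22050))

audio = ds.sort("id")[0]["audio"]                          # one decoded sample
audio_sample, sr = audio["array"], audio["sampling_rate"]  # order-independent access
print(audio_sample.shape, sr)
```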
diff --git a/tests/models/dia/test_modeling_dia.py b/tests/models/dia/test_modeling_dia.py
index f9427160c25..447491f9010 100644
--- a/tests/models/dia/test_modeling_dia.py
+++ b/tests/models/dia/test_modeling_dia.py
@@ -665,8 +665,12 @@ class DiaForConditionalGenerationIntegrationTest(unittest.TestCase):
     @require_torch_accelerator
     def test_dia_model_integration_generate_audio_context(self):
         text = ["[S1] Dia is an open weights text to dialogue model.", "This is a test"]
-        audio_sample_1 = torchaudio.load(self.audio_prompt_1_path, channels_first=True)[0].squeeze().numpy()
-        audio_sample_2 = torchaudio.load(self.audio_prompt_2_path, channels_first=True)[0].squeeze().numpy()
+        audio_sample_1 = (
+            torchaudio.load(self.audio_prompt_1_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
+        )
+        audio_sample_2 = (
+            torchaudio.load(self.audio_prompt_2_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
+        )
         audio = [audio_sample_1, audio_sample_2]

         processor = DiaProcessor.from_pretrained(self.model_checkpoint)
diff --git a/tests/models/encodec/test_feature_extraction_encodec.py b/tests/models/encodec/test_feature_extraction_encodec.py
index 3dc4c5fbb7c..2823b009937 100644
--- a/tests/models/encodec/test_feature_extraction_encodec.py
+++ b/tests/models/encodec/test_feature_extraction_encodec.py
@@ -139,7 +139,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        audio_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in audio_samples]

diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index 67ef91db785..44a86240015 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -340,7 +340,7 @@ class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
index 780658c77af..ad516904ef3 100644
--- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
+++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
@@ -713,7 +713,7 @@ class KyutaiSpeechToTextForConditionalGenerationIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         self._load_dataset()
         ds = self._dataset
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

     @slow
diff --git a/tests/models/moonshine/test_modeling_moonshine.py b/tests/models/moonshine/test_modeling_moonshine.py
index 99573cff096..da19f605f25 100644
--- a/tests/models/moonshine/test_modeling_moonshine.py
+++ b/tests/models/moonshine/test_modeling_moonshine.py
@@ -443,7 +443,7 @@ class MoonshineModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]
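On the Dia integration test above: `torchaudio.load` dispatches to a pluggable I/O backend, and pinning `backend="soundfile"` decouples the test from whatever FFmpeg build torchaudio would otherwise pick up. A hedged sketch with an illustrative path:

```python
import torchaudio

# Force the soundfile backend instead of the default (FFmpeg-based) dispatch.
waveform, sample_rate = torchaudio.load("prompt.wav", channels_first=True, backend="soundfile")
audio_sample = waveform.squeeze().numpy()  # 1-D for a mono file
print(audio_sample.shape, sample_rate)
```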
diff --git a/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py b/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py
index 0163deec33e..8d235b51990 100644
--- a/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py
+++ b/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py
@@ -207,7 +207,7 @@ class Phi4MultimodalFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
index b154a022036..07fd24577bf 100644
--- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
+++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import tempfile
 import unittest

 import requests
@@ -296,12 +295,9 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
         self.assistant_token = "<|assistant|>"
         self.end_token = "<|end|>"
         self.image = Image.open(requests.get(self.image_url, stream=True).raw)
-        with tempfile.NamedTemporaryFile(mode="w+b", suffix=".wav") as tmp:
-            tmp.write(requests.get(self.audio_url, stream=True).raw.data)
-            tmp.flush()
-            tmp.seek(0)
-            samples = torchcodec.decoders.AudioDecoder(tmp.name).get_all_samples()
-            self.audio, self.sampling_rate = samples.data, samples.sample_rate
+        audio_bytes = requests.get(self.audio_url, stream=True).raw.data
+        samples = torchcodec.decoders.AudioDecoder(audio_bytes).get_all_samples()
+        self.audio, self.sampling_rate = samples.data, samples.sample_rate

         cleanup(torch_device, gc_collect=True)

diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
index 49433e11561..15f0d89a3be 100644
--- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
+++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
@@ -294,7 +294,7 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py
index c886704a04d..9c1a3b524d9 100644
--- a/tests/models/speecht5/test_feature_extraction_speecht5.py
+++ b/tests/models/speecht5/test_feature_extraction_speecht5.py
@@ -381,7 +381,7 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]
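The Phi4Multimodal setup above drops the temporary `.wav` round-trip because torchcodec's `AudioDecoder` also accepts encoded bytes directly, not just file paths. A minimal sketch, with an illustrative URL:

```python
import requests
from torchcodec.decoders import AudioDecoder

audio_bytes = requests.get("https://example.com/sample.flac").content
samples = AudioDecoder(audio_bytes).get_all_samples()  # decode from memory
audio, sampling_rate = samples.data, samples.sample_rate
print(audio.shape, sampling_rate)
```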
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index 2255e895ce7..9c0fa0fa394 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -764,7 +764,7 @@ class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

@@ -1792,7 +1792,7 @@ class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/univnet/test_feature_extraction_univnet.py b/tests/models/univnet/test_feature_extraction_univnet.py
index 51a5fb7724d..e57c40396e2 100644
--- a/tests/models/univnet/test_feature_extraction_univnet.py
+++ b/tests/models/univnet/test_feature_extraction_univnet.py
@@ -330,7 +330,7 @@ class UnivNetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate))
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]

diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py
index 003c63a3e64..00066d89bdb 100644
--- a/tests/models/univnet/test_modeling_univnet.py
+++ b/tests/models/univnet/test_modeling_univnet.py
@@ -216,7 +216,7 @@ class UnivNetModelIntegrationTests(unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py
index ec8748b32e2..0834edb4e2a 100644
--- a/tests/models/whisper/test_feature_extraction_whisper.py
+++ b/tests/models/whisper/test_feature_extraction_whisper.py
@@ -254,7 +254,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index dbb241f5ad4..a07e2660178 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -1460,7 +1460,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         self._load_dataset()
         ds = self._dataset
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

     @slow
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index a9977d912c5..0e3f2246cc5 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1190,7 +1190,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
             num_beams=1,
         )

-        transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
+        transcription_non_ass = pipe(sample, generate_kwargs={"assistant_model": assistant_model})["text"]
         transcription_ass = pipe(sample)["text"]

         self.assertEqual(transcription_ass, transcription_non_ass)
diff --git a/tests/utils/test_audio_utils.py b/tests/utils/test_audio_utils.py
index 7147a9c893e..4d0459d9a8b 100644
--- a/tests/utils/test_audio_utils.py
+++ b/tests/utils/test_audio_utils.py
@@ -278,7 +278,7 @@ class AudioUtilsFunctionTester(unittest.TestCase):
         if self._dataset is None:
             self._dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

-        speech_samples = self._dataset.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = self._dataset.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

     def test_spectrogram_impulse(self):
diff --git a/utils/print_env.py b/utils/print_env.py
index e6d54fff2c7..ea2b8773554 100644
--- a/utils/print_env.py
+++ b/utils/print_env.py
@@ -72,3 +72,14 @@ try:
     print("Number of TF GPUs available:", len(tf.config.list_physical_devices("GPU")))
 except ImportError:
     print("TensorFlow version:", None)
+
+
+try:
+    import torchcodec
+
+    versions = torchcodec._core.get_ffmpeg_library_versions()
+    print("FFmpeg version:", versions["ffmpeg_version"])
+except ImportError:
+    print("FFmpeg version:", None)
+except (AttributeError, KeyError):
+    print("Failed to get FFmpeg version")
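The `print_env.py` addition reports the FFmpeg build through `torchcodec._core.get_ffmpeg_library_versions()`. Since `_core` is a private module, the script guards with `AttributeError`/`KeyError` in addition to `ImportError`; the same probe can be run standalone for a quick local check:

```python
try:
    import torchcodec

    # Private API: the helper's name and return shape may change across torchcodec releases.
    versions = torchcodec._core.get_ffmpeg_library_versions()
    print("FFmpeg version:", versions["ffmpeg_version"])
except ImportError:
    print("FFmpeg version:", None)
except (AttributeError, KeyError):
    print("Failed to get FFmpeg version")
```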