diff --git a/docs/source/en/model_doc/speech_to_text_2.md b/docs/source/en/model_doc/speech_to_text_2.md
index 38d3826a6b6..708f13a72b6 100644
--- a/docs/source/en/model_doc/speech_to_text_2.md
+++ b/docs/source/en/model_doc/speech_to_text_2.md
@@ -72,7 +72,7 @@ predicted token ids.

 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch

diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py
index 9eb8140103b..a40c21932f8 100644
--- a/src/transformers/models/clvp/modeling_clvp.py
+++ b/src/transformers/models/clvp/modeling_clvp.py
@@ -1653,14 +1653,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin):
         >>> text = "This is an example text."

         >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        >>> audio = ds.sort("id")["audio"][0]
+        >>> audio_sample, sr = audio["array"], audio["sampling_rate"]

         >>> # Define processor and model
         >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
         >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")

         >>> # Generate processor output and model output
-        >>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
+        >>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt")
         >>> speech_embeds = model.get_speech_features(
         ...     input_ids=processor_output["input_ids"], input_features=processor_output["input_features"]
         ... )
@@ -1732,14 +1733,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin):

         >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        >>> audio = ds.sort("id")["audio"][0]
+        >>> audio_sample, sr = audio["array"], audio["sampling_rate"]

         >>> # Define processor and model
         >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
         >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")

         >>> # processor outputs and model outputs
-        >>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
+        >>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt")
         >>> outputs = model(
         ...     input_ids=processor_output["input_ids"],
         ...     input_features=processor_output["input_features"],
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index 9d6d0bec5f1..f2f03ef6ab3 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -993,7 +993,7 @@ class HubertModel(HubertPreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch
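Note on the recurring `map_to_array` change above and in the hunks that follow: torchcodec's `AudioDecoder` returns samples as a `(num_channels, num_samples)` tensor, so taking the mean over axis 0 downmixes any channel layout to the 1-D mono waveform the feature extractors expect. A minimal sketch of the pattern, assuming a local `sample.wav` (the filename is illustrative):

```python
# Downmix a decoded file to mono, as the map_to_array hunks do.
import torch
from torchcodec.decoders import AudioDecoder

decoder = AudioDecoder("sample.wav")
samples = decoder.get_all_samples().data  # shape: (num_channels, num_samples)
mono = torch.mean(samples, axis=0)        # shape: (num_samples,)
print(samples.shape, mono.shape)
```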
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index 4913245abce..0b2aae8d80c 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -1467,7 +1467,7 @@ class TFHubertModel(TFHubertPreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

@@ -1579,7 +1579,7 @@ class TFHubertForCTC(TFHubertPreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

diff --git a/src/transformers/models/hubert/modular_hubert.py b/src/transformers/models/hubert/modular_hubert.py
index c553f01220e..12f022deacc 100644
--- a/src/transformers/models/hubert/modular_hubert.py
+++ b/src/transformers/models/hubert/modular_hubert.py
@@ -237,6 +237,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
         Example:

         ```python
+        >>> import torch
         >>> from transformers import AutoProcessor, HubertModel
         >>> from datasets import load_dataset
         >>> from torchcodec.decoders import AudioDecoder
@@ -247,7 +248,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
index a57d223f3c2..a1830fb0862 100755
--- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -1480,7 +1480,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCausalLanguageModelingLoss):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
index e80cd014c7b..0e74c7e7c96 100644
--- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -1072,7 +1072,7 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """

     >>> def map_to_array(batch):
     ...     decoder = AudioDecoder(batch["file"])
-    ...     batch["speech"] = decoder.get_all_samples().data
+    ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
     ...     return batch

@@ -1191,7 +1191,7 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """

     >>> def map_to_array(batch):
     ...     decoder = AudioDecoder(batch["file"])
-    ...     batch["speech"] = decoder.get_all_samples().data
+    ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
     ...     return batch

@@ -1392,7 +1392,7 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """

     >>> def map_to_array(batch):
     ...     decoder = AudioDecoder(batch["file"])
-    ...     batch["speech"] = decoder.get_all_samples().data
+    ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
     ...     return batch
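The `modular_hubert.py` hunk also adds `>>> import torch` to the doctest, since the snippet now calls `torch.mean`. For reference, a hedged end-to-end version of the fixed doctest pattern (it assumes the dummy dataset exposes a `"file"` column of local audio paths, as the doctests do):

```python
import torch
from datasets import load_dataset
from torchcodec.decoders import AudioDecoder

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

def map_to_array(batch):
    # Decode the file and downmix to mono before feeding the processor.
    decoder = AudioDecoder(batch["file"])
    batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
    return batch

ds = ds.map(map_to_array)
```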
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index cadbccd3d8e..4ac1943ce80 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -1538,7 +1538,7 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

@@ -1650,7 +1650,7 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):

         >>> def map_to_array(batch):
         ...     decoder = AudioDecoder(batch["file"])
-        ...     batch["speech"] = decoder.get_all_samples().data
+        ...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
         ...     return batch

diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
index 77c2ec6cd3f..b0ee0066fb0 100644
--- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
+++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py
@@ -154,7 +154,7 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py
index b2ccb501713..e349e081199 100644
--- a/tests/models/clap/test_feature_extraction_clap.py
+++ b/tests/models/clap/test_feature_extraction_clap.py
@@ -165,7 +165,7 @@ class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/clvp/test_feature_extraction_clvp.py b/tests/models/clvp/test_feature_extraction_clvp.py
index 78bf38e2df5..409e5100058 100644
--- a/tests/models/clvp/test_feature_extraction_clvp.py
+++ b/tests/models/clvp/test_feature_extraction_clvp.py
@@ -215,7 +215,7 @@ class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", Audio(sampling_rate=22050))
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
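The test-suite change that repeats from here on is a pure simplification: slicing a sorted `Dataset` with `[:num_samples]` already restricts the result to the first `num_samples` rows, so the preceding `.select(range(num_samples))` was a redundant round-trip. A quick sketch:

```python
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
num_samples = 2

old_style = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]  # before
new_style = ds.sort("id")[:num_samples]["audio"]                             # after
assert len(old_style) == len(new_style) == num_samples
```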
diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py
index 60c165fbbe8..a33d787dc7c 100644
--- a/tests/models/clvp/test_modeling_clvp.py
+++ b/tests/models/clvp/test_modeling_clvp.py
@@ -373,10 +373,12 @@ class ClvpModelForConditionalGenerationTester:
         ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        audio = ds.sort("id")[0]["audio"]
+        audio_sample = audio["array"]
+        sr = audio["sampling_rate"]

         feature_extractor = ClvpFeatureExtractor()
-        input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="pt")[
+        input_features = feature_extractor(raw_speech=audio_sample, sampling_rate=sr, return_tensors="pt")[
             "input_features"
         ].to(torch_device)

@@ -562,7 +564,8 @@ class ClvpIntegrationTest(unittest.TestCase):
         self.text = "This is an example text."
         ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        audio = ds.sort("id")["audio"][0]
+        self.speech_samples, self.sr = audio["array"], audio["sampling_rate"]

         self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device)
         self.model.eval()
diff --git a/tests/models/dac/test_feature_extraction_dac.py b/tests/models/dac/test_feature_extraction_dac.py
index 13d72326078..c995485d331 100644
--- a/tests/models/dac/test_feature_extraction_dac.py
+++ b/tests/models/dac/test_feature_extraction_dac.py
@@ -143,7 +143,7 @@ class DacFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        audio_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in audio_samples]

diff --git a/tests/models/dia/test_feature_extraction_dia.py b/tests/models/dia/test_feature_extraction_dia.py
index 6243dc47919..9a6f797d534 100644
--- a/tests/models/dia/test_feature_extraction_dia.py
+++ b/tests/models/dia/test_feature_extraction_dia.py
@@ -145,7 +145,7 @@ class DiaFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        audio_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in audio_samples]
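The CLVP hunks above replace positional unpacking of `dict.values()` with explicit key access. A decoded `datasets` Audio column is a dict keyed by `"path"`, `"array"`, and `"sampling_rate"`, and `_, audio, sr = ...values()` silently breaks if the key order ever changes; indexing by key does not. A short sketch of the new pattern:

```python
from datasets import Audio, load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=22050))

audio = ds.sort("id")[0]["audio"]                          # one decoded sample
audio_sample, sr = audio["array"], audio["sampling_rate"]  # order-independent access
print(audio_sample.shape, sr)
```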
diff --git a/tests/models/dia/test_modeling_dia.py b/tests/models/dia/test_modeling_dia.py
index f9427160c25..447491f9010 100644
--- a/tests/models/dia/test_modeling_dia.py
+++ b/tests/models/dia/test_modeling_dia.py
@@ -665,8 +665,12 @@ class DiaForConditionalGenerationIntegrationTest(unittest.TestCase):
     @require_torch_accelerator
     def test_dia_model_integration_generate_audio_context(self):
         text = ["[S1] Dia is an open weights text to dialogue model.", "This is a test"]
-        audio_sample_1 = torchaudio.load(self.audio_prompt_1_path, channels_first=True)[0].squeeze().numpy()
-        audio_sample_2 = torchaudio.load(self.audio_prompt_2_path, channels_first=True)[0].squeeze().numpy()
+        audio_sample_1 = (
+            torchaudio.load(self.audio_prompt_1_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
+        )
+        audio_sample_2 = (
+            torchaudio.load(self.audio_prompt_2_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
+        )
         audio = [audio_sample_1, audio_sample_2]

         processor = DiaProcessor.from_pretrained(self.model_checkpoint)
diff --git a/tests/models/encodec/test_feature_extraction_encodec.py b/tests/models/encodec/test_feature_extraction_encodec.py
index 3dc4c5fbb7c..2823b009937 100644
--- a/tests/models/encodec/test_feature_extraction_encodec.py
+++ b/tests/models/encodec/test_feature_extraction_encodec.py
@@ -139,7 +139,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        audio_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in audio_samples]

diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py
index 67ef91db785..44a86240015 100644
--- a/tests/models/granite_speech/test_modeling_granite_speech.py
+++ b/tests/models/granite_speech/test_modeling_granite_speech.py
@@ -340,7 +340,7 @@ class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
index 780658c77af..ad516904ef3 100644
--- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
+++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
@@ -713,7 +713,7 @@ class KyutaiSpeechToTextForConditionalGenerationIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         self._load_dataset()
         ds = self._dataset
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

     @slow
diff --git a/tests/models/moonshine/test_modeling_moonshine.py b/tests/models/moonshine/test_modeling_moonshine.py
index 99573cff096..da19f605f25 100644
--- a/tests/models/moonshine/test_modeling_moonshine.py
+++ b/tests/models/moonshine/test_modeling_moonshine.py
@@ -443,7 +443,7 @@ class MoonshineModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]
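On the Dia integration test above: `torchaudio.load` dispatches to a pluggable I/O backend, and pinning `backend="soundfile"` decouples the test from whatever FFmpeg build torchaudio would otherwise pick up. A hedged sketch with an illustrative path:

```python
import torchaudio

# Force the soundfile backend instead of the default (FFmpeg-based) dispatch.
waveform, sample_rate = torchaudio.load("prompt.wav", channels_first=True, backend="soundfile")
audio_sample = waveform.squeeze().numpy()  # 1-D for a mono file
print(audio_sample.shape, sample_rate)
```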
diff --git a/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py b/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py
index 0163deec33e..8d235b51990 100644
--- a/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py
+++ b/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py
@@ -207,7 +207,7 @@ class Phi4MultimodalFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
index b154a022036..07fd24577bf 100644
--- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
+++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import tempfile
 import unittest

 import requests
@@ -296,12 +295,9 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
         self.assistant_token = "<|assistant|>"
         self.end_token = "<|end|>"
         self.image = Image.open(requests.get(self.image_url, stream=True).raw)
-        with tempfile.NamedTemporaryFile(mode="w+b", suffix=".wav") as tmp:
-            tmp.write(requests.get(self.audio_url, stream=True).raw.data)
-            tmp.flush()
-            tmp.seek(0)
-            samples = torchcodec.decoders.AudioDecoder(tmp.name).get_all_samples()
-            self.audio, self.sampling_rate = samples.data, samples.sample_rate
+        audio_bytes = requests.get(self.audio_url, stream=True).raw.data
+        samples = torchcodec.decoders.AudioDecoder(audio_bytes).get_all_samples()
+        self.audio, self.sampling_rate = samples.data, samples.sample_rate

         cleanup(torch_device, gc_collect=True)

diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
index 49433e11561..15f0d89a3be 100644
--- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
+++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py
@@ -294,7 +294,7 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py
index c886704a04d..9c1a3b524d9 100644
--- a/tests/models/speecht5/test_feature_extraction_speecht5.py
+++ b/tests/models/speecht5/test_feature_extraction_speecht5.py
@@ -381,7 +381,7 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]
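The Phi4Multimodal setup above drops the temporary `.wav` round-trip because torchcodec's `AudioDecoder` also accepts encoded bytes directly, not just file paths. A minimal sketch, with an illustrative URL:

```python
import requests
from torchcodec.decoders import AudioDecoder

audio_bytes = requests.get("https://example.com/sample.flac").content
samples = AudioDecoder(audio_bytes).get_all_samples()  # decode from memory
audio, sampling_rate = samples.data, samples.sample_rate
print(audio.shape, sampling_rate)
```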
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index 2255e895ce7..9c0fa0fa394 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -764,7 +764,7 @@ class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

@@ -1792,7 +1792,7 @@ class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/univnet/test_feature_extraction_univnet.py b/tests/models/univnet/test_feature_extraction_univnet.py
index 51a5fb7724d..e57c40396e2 100644
--- a/tests/models/univnet/test_feature_extraction_univnet.py
+++ b/tests/models/univnet/test_feature_extraction_univnet.py
@@ -330,7 +330,7 @@ class UnivNetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate))
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]

diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py
index 003c63a3e64..00066d89bdb 100644
--- a/tests/models/univnet/test_modeling_univnet.py
+++ b/tests/models/univnet/test_modeling_univnet.py
@@ -216,7 +216,7 @@ class UnivNetModelIntegrationTests(unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py
index ec8748b32e2..0834edb4e2a 100644
--- a/tests/models/whisper/test_feature_extraction_whisper.py
+++ b/tests/models/whisper/test_feature_extraction_whisper.py
@@ -254,7 +254,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]

         return [x["array"] for x in speech_samples]

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index dbb241f5ad4..a07e2660178 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -1460,7 +1460,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         self._load_dataset()
         ds = self._dataset
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

     @slow
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index a9977d912c5..0e3f2246cc5 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -1190,7 +1190,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
             num_beams=1,
         )

-        transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
+        transcription_non_ass = pipe(sample, generate_kwargs={"assistant_model": assistant_model})["text"]
         transcription_ass = pipe(sample)["text"]

         self.assertEqual(transcription_ass, transcription_non_ass)
diff --git a/tests/utils/test_audio_utils.py b/tests/utils/test_audio_utils.py
index 7147a9c893e..4d0459d9a8b 100644
--- a/tests/utils/test_audio_utils.py
+++ b/tests/utils/test_audio_utils.py
@@ -278,7 +278,7 @@ class AudioUtilsFunctionTester(unittest.TestCase):
         if self._dataset is None:
             self._dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

-        speech_samples = self._dataset.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = self._dataset.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

     def test_spectrogram_impulse(self):
diff --git a/utils/print_env.py b/utils/print_env.py
index e6d54fff2c7..ea2b8773554 100644
--- a/utils/print_env.py
+++ b/utils/print_env.py
@@ -72,3 +72,14 @@ try:
     print("Number of TF GPUs available:", len(tf.config.list_physical_devices("GPU")))
 except ImportError:
     print("TensorFlow version:", None)
+
+
+try:
+    import torchcodec
+
+    versions = torchcodec._core.get_ffmpeg_library_versions()
+    print("FFmpeg version:", versions["ffmpeg_version"])
+except ImportError:
+    print("FFmpeg version:", None)
+except (AttributeError, KeyError):
+    print("Failed to get FFmpeg version")
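The `print_env.py` addition reports the FFmpeg build through `torchcodec._core.get_ffmpeg_library_versions()`. Since `_core` is a private module, the script guards with `AttributeError`/`KeyError` in addition to `ImportError`; the same probe can be run standalone for a quick local check:

```python
try:
    import torchcodec

    # Private API: the helper's name and return shape may change across torchcodec releases.
    versions = torchcodec._core.get_ffmpeg_library_versions()
    print("FFmpeg version:", versions["ffmpeg_version"])
except ImportError:
    print("FFmpeg version:", None)
except (AttributeError, KeyError):
    print("Failed to get FFmpeg version")
```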