fix tests

Quentin Lhoest 2025-07-02 22:31:38 +02:00
parent b05f248e79
commit 4339bd71ac
31 changed files with 64 additions and 47 deletions

View File

@@ -72,7 +72,7 @@ predicted token ids.
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch
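
Note: `AudioDecoder.get_all_samples().data` is shaped `(num_channels, num_samples)`, so the doc examples above now downmix to mono instead of passing multi-channel audio to the model. A minimal sketch of the difference, assuming a local stereo file `sample.wav` (placeholder path):

```python
import torch
from torchcodec.decoders import AudioDecoder

# "sample.wav" is a placeholder; any stereo file illustrates the shapes.
samples = AudioDecoder("sample.wav").get_all_samples()
print(samples.data.shape)                # e.g. torch.Size([2, 93680]) for stereo
mono = torch.mean(samples.data, axis=0)  # average the channel axis -> 1-D tensor
print(mono.shape)                        # torch.Size([93680])
```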

View File

@@ -1653,14 +1653,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin):
 >>> text = "This is an example text."
 >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
->>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+>>> audio = ds.sort("id")["audio"][0]
+>>> audio_sample, sr = audio["array"], audio["sampling_rate"]

 >>> # Define processor and model
 >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
 >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")

 >>> # Generate processor output and model output
->>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
+>>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt")
 >>> speech_embeds = model.get_speech_features(
 ...     input_ids=processor_output["input_ids"], input_features=processor_output["input_features"]
 ... )
@@ -1732,14 +1733,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin):
 >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
->>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+>>> audio = ds.sort("id")["audio"][0]
+>>> audio_sample, sr = audio["array"], audio["sampling_rate"]

 >>> # Define processor and model
 >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
 >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")

 >>> # processor outputs and model outputs
->>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
+>>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt")
 >>> outputs = model(
 ...     input_ids=processor_output["input_ids"],
 ...     input_features=processor_output["input_features"],
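
Note: the old `_, audio, sr = ...[0].values()` unpacking relied on the key order of the decoded audio dict; the replacement indexes the `audio` column and reads the keys explicitly. A minimal sketch, assuming the dummy LibriSpeech dataset is reachable:

```python
import datasets

ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))

audio = ds.sort("id")["audio"][0]  # decoded audio for the first example
audio_sample, sr = audio["array"], audio["sampling_rate"]
print(audio_sample.shape, sr)      # 1-D array of samples, and 22050
```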

View File

@@ -993,7 +993,7 @@ class HubertModel(HubertPreTrainedModel):
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch

View File

@@ -1467,7 +1467,7 @@ class TFHubertModel(TFHubertPreTrainedModel):
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch
@@ -1579,7 +1579,7 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch

View File

@@ -237,6 +237,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
 Example:

 ```python
+>>> import torch
 >>> from transformers import AutoProcessor, HubertModel
 >>> from datasets import load_dataset
 >>> from torchcodec.decoders import AudioDecoder
@@ -247,7 +248,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch

View File

@@ -1480,7 +1480,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch

View File

@@ -1072,7 +1072,7 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch
@@ -1191,7 +1191,7 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch
@@ -1392,7 +1392,7 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch

View File

@@ -1538,7 +1538,7 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch
@@ -1650,7 +1650,7 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
 >>> def map_to_array(batch):
 ...     decoder = AudioDecoder(batch["file"])
-...     batch["speech"] = decoder.get_all_samples().data
+...     batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
 ...     return batch

View File

@@ -154,7 +154,7 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]
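
Note: `datasets.Dataset` supports slice indexing directly, so `ds.sort("id").select(range(n))[:n]` collapses to `ds.sort("id")[:n]`; sorting by `id` keeps the selection deterministic. The same simplification is applied across the test files below. A minimal sketch, assuming the dummy LibriSpeech dataset is reachable:

```python
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
speech_samples = ds.sort("id")[:2]["audio"]    # slicing returns a dict of columns
arrays = [x["array"] for x in speech_samples]  # one 1-D sample array per example
print(len(arrays))                             # 2
```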

View File

@@ -165,7 +165,7 @@ class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

View File

@@ -215,7 +215,7 @@ class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", Audio(sampling_rate=22050))
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]

View File

@@ -373,10 +373,12 @@ class ClvpModelForConditionalGenerationTester:
         ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        audio = ds.sort("id")[0]["audio"]
+        audio_sample = audio["array"]
+        sr = audio["sampling_rate"]

         feature_extractor = ClvpFeatureExtractor()
-        input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="pt")[
+        input_features = feature_extractor(raw_speech=audio_sample, sampling_rate=sr, return_tensors="pt")[
             "input_features"
         ].to(torch_device)
@@ -562,7 +564,8 @@ class ClvpIntegrationTest(unittest.TestCase):
         self.text = "This is an example text."
         ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
-        _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
+        audio = ds.sort("id")["audio"][0]
+        self.speech_samples, self.sr = audio["array"], audio["sampling_rate"]

         self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device)
         self.model.eval()

View File

@@ -143,7 +143,7 @@ class DacFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        audio_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in audio_samples]

View File

@@ -145,7 +145,7 @@ class DiaFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        audio_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in audio_samples]

View File

@@ -665,8 +665,12 @@ class DiaForConditionalGenerationIntegrationTest(unittest.TestCase):
     @require_torch_accelerator
     def test_dia_model_integration_generate_audio_context(self):
         text = ["[S1] Dia is an open weights text to dialogue model.", "This is a test"]
-        audio_sample_1 = torchaudio.load(self.audio_prompt_1_path, channels_first=True)[0].squeeze().numpy()
-        audio_sample_2 = torchaudio.load(self.audio_prompt_2_path, channels_first=True)[0].squeeze().numpy()
+        audio_sample_1 = (
+            torchaudio.load(self.audio_prompt_1_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
+        )
+        audio_sample_2 = (
+            torchaudio.load(self.audio_prompt_2_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
+        )
         audio = [audio_sample_1, audio_sample_2]

         processor = DiaProcessor.from_pretrained(self.model_checkpoint)
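
Note: `torchaudio.load` dispatches between the `ffmpeg`, `sox`, and `soundfile` backends; pinning `backend="soundfile"` keeps decoding of the audio prompts consistent across environments. A minimal sketch with a placeholder path:

```python
import torchaudio

# "prompt.wav" is a placeholder path; the soundfile backend requires the
# soundfile package to be installed.
waveform, sr = torchaudio.load("prompt.wav", channels_first=True, backend="soundfile")
audio_sample = waveform.squeeze().numpy()  # drop the channel dim for mono input
```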

View File

@@ -139,7 +139,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        audio_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in audio_samples]

View File

@@ -340,7 +340,7 @@ class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

View File

@@ -713,7 +713,7 @@ class KyutaiSpeechToTextForConditionalGenerationIntegrationTests(unittest.TestCa
     def _load_datasamples(self, num_samples):
         self._load_dataset()
         ds = self._dataset
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

     @slow

View File

@@ -443,7 +443,7 @@ class MoonshineModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

View File

@@ -207,7 +207,7 @@ class Phi4MultimodalFeatureExtractionTest(SequenceFeatureExtractionTestMixin, un
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

View File

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import tempfile
 import unittest

 import requests
@@ -296,12 +295,9 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
         self.assistant_token = "<|assistant|>"
         self.end_token = "<|end|>"
         self.image = Image.open(requests.get(self.image_url, stream=True).raw)
-        with tempfile.NamedTemporaryFile(mode="w+b", suffix=".wav") as tmp:
-            tmp.write(requests.get(self.audio_url, stream=True).raw.data)
-            tmp.flush()
-            tmp.seek(0)
-            samples = torchcodec.decoders.AudioDecoder(tmp.name).get_all_samples()
-            self.audio, self.sampling_rate = samples.data, samples.sample_rate
+        audio_bytes = requests.get(self.audio_url, stream=True).raw.data
+        samples = torchcodec.decoders.AudioDecoder(audio_bytes).get_all_samples()
+        self.audio, self.sampling_rate = samples.data, samples.sample_rate

         cleanup(torch_device, gc_collect=True)
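
Note: `torchcodec.decoders.AudioDecoder` accepts raw bytes as its source, which is what lets the test drop the temporary `.wav` file. A minimal sketch with a placeholder URL:

```python
import requests
from torchcodec.decoders import AudioDecoder

# https://example.com/sample.wav is a placeholder URL.
audio_bytes = requests.get("https://example.com/sample.wav", stream=True).raw.data
samples = AudioDecoder(audio_bytes).get_all_samples()  # decode straight from bytes
audio, sampling_rate = samples.data, samples.sample_rate
```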

View File

@@ -294,7 +294,7 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

View File

@@ -381,7 +381,7 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

View File

@@ -764,7 +764,7 @@ class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]
@@ -1792,7 +1792,7 @@ class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

View File

@@ -330,7 +330,7 @@ class UnivNetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate))
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]

View File

@@ -216,7 +216,7 @@ class UnivNetModelIntegrationTests(unittest.TestCase):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]

View File

@@ -254,7 +254,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
     def _load_datasamples(self, num_samples):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

View File

@@ -1460,7 +1460,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
     def _load_datasamples(self, num_samples):
         self._load_dataset()
         ds = self._dataset
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

     @slow

View File

@@ -1190,7 +1190,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
             num_beams=1,
         )

-        transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
+        transcription_non_ass = pipe(sample, generate_kwargs={"assistant_model": assistant_model})["text"]
         transcription_ass = pipe(sample)["text"]
         self.assertEqual(transcription_ass, transcription_non_ass)

View File

@@ -278,7 +278,7 @@ class AudioUtilsFunctionTester(unittest.TestCase):
         if self._dataset is None:
             self._dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        speech_samples = self._dataset.sort("id").select(range(num_samples))[:num_samples]["audio"]
+        speech_samples = self._dataset.sort("id")[:num_samples]["audio"]
         return [x["array"] for x in speech_samples]

     def test_spectrogram_impulse(self):

View File

@@ -72,3 +72,14 @@ try:
     print("Number of TF GPUs available:", len(tf.config.list_physical_devices("GPU")))
 except ImportError:
     print("TensorFlow version:", None)
+
+try:
+    import torchcodec
+
+    versions = torchcodec._core.get_ffmpeg_library_versions()
+    print("FFmpeg version:", versions["ffmpeg_version"])
+except ImportError:
+    print("FFmpeg version:", None)
+except (AttributeError, KeyError):
+    print("Failed to get FFmpeg version")