mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-03 12:50:06 +06:00
fix tests
This commit is contained in:
parent
b05f248e79
commit
4339bd71ac
@ -72,7 +72,7 @@ predicted token ids.
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
|
@ -1653,14 +1653,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin):
|
|||||||
>>> text = "This is an example text."
|
>>> text = "This is an example text."
|
||||||
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
>>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
|
>>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
|
||||||
>>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
|
>>> audio = ds.sort("id")["audio"][0]
|
||||||
|
>>> audio_sample, sr = audio["array"], audio["sampling_rate"]
|
||||||
|
|
||||||
>>> # Define processor and model
|
>>> # Define processor and model
|
||||||
>>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
|
>>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
|
||||||
>>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")
|
>>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")
|
||||||
|
|
||||||
>>> # Generate processor output and model output
|
>>> # Generate processor output and model output
|
||||||
>>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
|
>>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt")
|
||||||
>>> speech_embeds = model.get_speech_features(
|
>>> speech_embeds = model.get_speech_features(
|
||||||
... input_ids=processor_output["input_ids"], input_features=processor_output["input_features"]
|
... input_ids=processor_output["input_ids"], input_features=processor_output["input_features"]
|
||||||
... )
|
... )
|
||||||
@ -1732,14 +1733,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin):
|
|||||||
|
|
||||||
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
>>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
|
>>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
|
||||||
>>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
|
>>> audio = ds.sort("id")["audio"][0]
|
||||||
|
>>> audio_sample, sr = audio["array"], audio["sampling_rate"]
|
||||||
|
|
||||||
>>> # Define processor and model
|
>>> # Define processor and model
|
||||||
>>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
|
>>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
|
||||||
>>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")
|
>>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")
|
||||||
|
|
||||||
>>> # processor outputs and model outputs
|
>>> # processor outputs and model outputs
|
||||||
>>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
|
>>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt")
|
||||||
>>> outputs = model(
|
>>> outputs = model(
|
||||||
... input_ids=processor_output["input_ids"],
|
... input_ids=processor_output["input_ids"],
|
||||||
... input_features=processor_output["input_features"],
|
... input_features=processor_output["input_features"],
|
||||||
|
@ -993,7 +993,7 @@ class HubertModel(HubertPreTrainedModel):
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
|
@ -1467,7 +1467,7 @@ class TFHubertModel(TFHubertPreTrainedModel):
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
@ -1579,7 +1579,7 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
|
@ -237,6 +237,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
|
|||||||
Example:
|
Example:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
>>> import torch
|
||||||
>>> from transformers import AutoProcessor, HubertModel
|
>>> from transformers import AutoProcessor, HubertModel
|
||||||
>>> from datasets import load_dataset
|
>>> from datasets import load_dataset
|
||||||
>>> from torchcodec.decoders import AudioDecoder
|
>>> from torchcodec.decoders import AudioDecoder
|
||||||
@ -247,7 +248,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
|
@ -1480,7 +1480,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
|
@ -1072,7 +1072,7 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
@ -1191,7 +1191,7 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
@ -1392,7 +1392,7 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
|
@ -1538,7 +1538,7 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
@ -1650,7 +1650,7 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
... decoder = AudioDecoder(batch["file"])
|
... decoder = AudioDecoder(batch["file"])
|
||||||
... batch["speech"] = decoder.get_all_samples().data
|
... batch["speech"] = torch.mean(decoder.get_all_samples().data, axis=0)
|
||||||
... return batch
|
... return batch
|
||||||
|
|
||||||
|
|
||||||
|
@ -154,7 +154,7 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
|
|||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -165,7 +165,7 @@ class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
|
|||||||
def _load_datasamples(self, num_samples):
|
def _load_datasamples(self, num_samples):
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -215,7 +215,7 @@ class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
|
|||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
ds = ds.cast_column("audio", Audio(sampling_rate=22050))
|
ds = ds.cast_column("audio", Audio(sampling_rate=22050))
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
|
return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -373,10 +373,12 @@ class ClvpModelForConditionalGenerationTester:
|
|||||||
|
|
||||||
ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
|
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
|
||||||
_, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
|
audio = ds.sort("id")[0]["audio"]
|
||||||
|
audio_sample = audio["array"]
|
||||||
|
sr = audio["sampling_rate"]
|
||||||
|
|
||||||
feature_extractor = ClvpFeatureExtractor()
|
feature_extractor = ClvpFeatureExtractor()
|
||||||
input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="pt")[
|
input_features = feature_extractor(raw_speech=audio_sample, sampling_rate=sr, return_tensors="pt")[
|
||||||
"input_features"
|
"input_features"
|
||||||
].to(torch_device)
|
].to(torch_device)
|
||||||
|
|
||||||
@ -562,7 +564,8 @@ class ClvpIntegrationTest(unittest.TestCase):
|
|||||||
self.text = "This is an example text."
|
self.text = "This is an example text."
|
||||||
ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
|
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
|
||||||
_, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
|
audio = ds.sort("id")["audio"][0]
|
||||||
|
self.speech_samples, self.sr = audio["array"], audio["sampling_rate"]
|
||||||
|
|
||||||
self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device)
|
self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device)
|
||||||
self.model.eval()
|
self.model.eval()
|
||||||
|
@ -143,7 +143,7 @@ class DacFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
|
|||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
audio_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in audio_samples]
|
return [x["array"] for x in audio_samples]
|
||||||
|
|
||||||
|
@ -145,7 +145,7 @@ class DiaFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
|
|||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
audio_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in audio_samples]
|
return [x["array"] for x in audio_samples]
|
||||||
|
|
||||||
|
@ -665,8 +665,12 @@ class DiaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@require_torch_accelerator
|
@require_torch_accelerator
|
||||||
def test_dia_model_integration_generate_audio_context(self):
|
def test_dia_model_integration_generate_audio_context(self):
|
||||||
text = ["[S1] Dia is an open weights text to dialogue model.", "This is a test"]
|
text = ["[S1] Dia is an open weights text to dialogue model.", "This is a test"]
|
||||||
audio_sample_1 = torchaudio.load(self.audio_prompt_1_path, channels_first=True)[0].squeeze().numpy()
|
audio_sample_1 = (
|
||||||
audio_sample_2 = torchaudio.load(self.audio_prompt_2_path, channels_first=True)[0].squeeze().numpy()
|
torchaudio.load(self.audio_prompt_1_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
|
||||||
|
)
|
||||||
|
audio_sample_2 = (
|
||||||
|
torchaudio.load(self.audio_prompt_2_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
|
||||||
|
)
|
||||||
audio = [audio_sample_1, audio_sample_2]
|
audio = [audio_sample_1, audio_sample_2]
|
||||||
|
|
||||||
processor = DiaProcessor.from_pretrained(self.model_checkpoint)
|
processor = DiaProcessor.from_pretrained(self.model_checkpoint)
|
||||||
|
@ -139,7 +139,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
|
|||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
audio_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in audio_samples]
|
return [x["array"] for x in audio_samples]
|
||||||
|
|
||||||
|
@ -340,7 +340,7 @@ class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
def _load_datasamples(self, num_samples):
|
def _load_datasamples(self, num_samples):
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -713,7 +713,7 @@ class KyutaiSpeechToTextForConditionalGenerationIntegrationTests(unittest.TestCa
|
|||||||
def _load_datasamples(self, num_samples):
|
def _load_datasamples(self, num_samples):
|
||||||
self._load_dataset()
|
self._load_dataset()
|
||||||
ds = self._dataset
|
ds = self._dataset
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
@ -443,7 +443,7 @@ class MoonshineModelIntegrationTests(unittest.TestCase):
|
|||||||
def _load_datasamples(self, num_samples):
|
def _load_datasamples(self, num_samples):
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -207,7 +207,7 @@ class Phi4MultimodalFeatureExtractionTest(SequenceFeatureExtractionTestMixin, un
|
|||||||
def _load_datasamples(self, num_samples):
|
def _load_datasamples(self, num_samples):
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -12,7 +12,6 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import tempfile
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -296,12 +295,9 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
|
|||||||
self.assistant_token = "<|assistant|>"
|
self.assistant_token = "<|assistant|>"
|
||||||
self.end_token = "<|end|>"
|
self.end_token = "<|end|>"
|
||||||
self.image = Image.open(requests.get(self.image_url, stream=True).raw)
|
self.image = Image.open(requests.get(self.image_url, stream=True).raw)
|
||||||
with tempfile.NamedTemporaryFile(mode="w+b", suffix=".wav") as tmp:
|
audio_bytes = requests.get(self.audio_url, stream=True).raw.data
|
||||||
tmp.write(requests.get(self.audio_url, stream=True).raw.data)
|
samples = torchcodec.decoders.AudioDecoder(audio_bytes).get_all_samples()
|
||||||
tmp.flush()
|
self.audio, self.sampling_rate = samples.data, samples.sample_rate
|
||||||
tmp.seek(0)
|
|
||||||
samples = torchcodec.decoders.AudioDecoder(tmp.name).get_all_samples()
|
|
||||||
self.audio, self.sampling_rate = samples.data, samples.sample_rate
|
|
||||||
|
|
||||||
cleanup(torch_device, gc_collect=True)
|
cleanup(torch_device, gc_collect=True)
|
||||||
|
|
||||||
|
@ -294,7 +294,7 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
|
|||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -381,7 +381,7 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest
|
|||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -764,7 +764,7 @@ class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
@ -1792,7 +1792,7 @@ class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase):
|
|||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -330,7 +330,7 @@ class UnivNetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
|
|||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate))
|
ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate))
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
|
return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -216,7 +216,7 @@ class UnivNetModelIntegrationTests(unittest.TestCase):
|
|||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
|
ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
|
return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -254,7 +254,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
|
|||||||
def _load_datasamples(self, num_samples):
|
def _load_datasamples(self, num_samples):
|
||||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
# automatic decoding with librispeech
|
# automatic decoding with librispeech
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
|
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
|
@ -1460,7 +1460,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|||||||
def _load_datasamples(self, num_samples):
|
def _load_datasamples(self, num_samples):
|
||||||
self._load_dataset()
|
self._load_dataset()
|
||||||
ds = self._dataset
|
ds = self._dataset
|
||||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = ds.sort("id")[:num_samples]["audio"]
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
@ -1190,7 +1190,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
|
|||||||
num_beams=1,
|
num_beams=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"]
|
transcription_non_ass = pipe(sample, generate_kwargs={"assistant_model": assistant_model})["text"]
|
||||||
transcription_ass = pipe(sample)["text"]
|
transcription_ass = pipe(sample)["text"]
|
||||||
|
|
||||||
self.assertEqual(transcription_ass, transcription_non_ass)
|
self.assertEqual(transcription_ass, transcription_non_ass)
|
||||||
|
@ -278,7 +278,7 @@ class AudioUtilsFunctionTester(unittest.TestCase):
|
|||||||
|
|
||||||
if self._dataset is None:
|
if self._dataset is None:
|
||||||
self._dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
self._dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
speech_samples = self._dataset.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
speech_samples = self._dataset.sort("id")[:num_samples]["audio"]
|
||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
def test_spectrogram_impulse(self):
|
def test_spectrogram_impulse(self):
|
||||||
|
@ -72,3 +72,14 @@ try:
|
|||||||
print("Number of TF GPUs available:", len(tf.config.list_physical_devices("GPU")))
|
print("Number of TF GPUs available:", len(tf.config.list_physical_devices("GPU")))
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("TensorFlow version:", None)
|
print("TensorFlow version:", None)
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
import torchcodec
|
||||||
|
|
||||||
|
versions = torchcodec._core.get_ffmpeg_library_versions()
|
||||||
|
print("FFmpeg version:", versions["ffmpeg_version"])
|
||||||
|
except ImportError:
|
||||||
|
print("FFmpeg version:", None)
|
||||||
|
except (AttributeError, KeyError):
|
||||||
|
print("Failed to get FFmpeg version")
|
||||||
|
Loading…
Reference in New Issue
Block a user