mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-04 05:10:06 +06:00

* make SpeechT5 model by copying Wav2Vec2 * add paper to docs * whoops added docs in wrong file * remove SpeechT5Tokenizer + put CTC back in the name * remove deprecated class * remove unused docstring * delete SpeechT5FeatureExtractor, use Wav2Vec2FeatureExtractor instead * remove classes we don't need right now * initial stab at speech encoder prenet * add more speech encoder prenet stuff * improve SpeechEncoderPrenet * add encoder (not finished yet) * add relative position bias to self-attention * add encoder CTC layers * fix formatting * add decoder from BART, doesn't work yet * make it work with generate loop * wrap the encoder into a speech encoder class * wrap the decoder in a text decoder class * changed my mind * changed my mind again ;-) * load decoder weights, make it work * add weights for text decoder postnet * add SpeechT5ForCTC model that uses only the encoder * clean up EncoderLayer and DecoderLayer * implement _init_weights in SpeechT5PreTrainedModel * cleanup config + Encoder and Decoder * add head + cross attention masks * improve doc comments * fixup * more cleanup * more fixup * TextDecoderPrenet works now, thanks Kendall * add CTC loss * add placeholders for other pre/postnets * add type annotation * fix freeze_feature_encoder * set padding tokens to 0 in decoder attention mask * encoder attention mask downsampling * remove features_pen calculation * disable the padding tokens thing again * fixup * more fixup * code review fixes * rename encoder/decoder wrapper classes * allow checkpoints to be loaded into SpeechT5Model * put encoder into wrapper for CTC model * clean up conversion script * add encoder for TTS model * add speech decoder prenet * add speech decoder post-net * attempt to reconstruct the generation loop * add speech generation loop * clean up generate_speech * small tweaks * fix forward pass * enable always dropout on speech decoder prenet * sort declaration * rename models * fixup * fix copies * more fixup * make consistency 
checker happy * add Seq2SeqSpectrogramOutput class * doc comments * quick note about loss and labels * add HiFi-GAN implementation (from Speech2Speech PR) * rename file * add vocoder to TTS model * improve vocoder * working on tokenizer * more better tokenizer * add CTC tokenizer * fix decode and batch_code in CTC tokenizer * fix processor * two processors and feature extractors * use SpeechT5WaveformFeatureExtractor instead of Wav2Vec2 * cleanup * more cleanup * even more fixup * notebooks * fix log-mel spectrograms * support reduction factor * fixup * shift spectrograms to right to create decoder inputs * return correct labels * add labels for stop token prediction * fix doc comments * fixup * remove SpeechT5ForPreTraining * more fixup * update copyright headers * add usage examples * add SpeechT5ProcessorForCTC * fixup * push unofficial checkpoints to hub * initial version of tokenizer unit tests * add slow test * fix failing tests * tests for CTC tokenizer * finish CTC tokenizer tests * processor tests * initial test for feature extractors * tests for spectrogram feature extractor * fixup * more fixup * add decorators * require speech for tests * modeling tests * more tests for ASR model * fix imports * add fake tests for the other models * fixup * remove jupyter notebooks * add missing SpeechT5Model tests * add missing tests for SpeechT5ForCTC * add missing tests for SpeechT5ForTextToSpeech * sort tests by name * fix Hi-Fi GAN tests * fixup * add speech-to-speech model * refactor duplicate speech generation code * add processor for SpeechToSpeech model * add usage example * add tests for speech-to-speech model * fixup * enable gradient checkpointing for SpeechT5FeatureEncoder * code review * push_to_hub now takes repo_id * improve doc comments for HiFi-GAN config * add missing test * add integration tests * make number of layers in speech decoder prenet configurable * rename variable * rename variables * add auto classes for TTS and S2S * REMOVE CTC!!! 
* S2S processor does not support save/load_pretrained * fixup * these models are now in an auto mapping * fix doc links * rename HiFiGAN to HifiGan, remove separate config file * REMOVE auto classes * there can be only one * fixup * replace assert * reformat * feature extractor can process input and target at same time * update checkpoint names * fix commit hash
422 lines
18 KiB
Python
422 lines
18 KiB
Python
# coding=utf-8
|
|
# Copyright 2021-2023 HuggingFace Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""Tests for the SpeechT5 feature extractors."""
|
|
|
|
import itertools
|
|
import random
|
|
import unittest
|
|
|
|
import numpy as np
|
|
|
|
from transformers import BatchFeature, is_speech_available
|
|
from transformers.testing_utils import require_torch, require_torchaudio
|
|
from transformers.utils.import_utils import is_torch_available
|
|
|
|
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
|
|
|
|
|
|
if is_speech_available():
|
|
from transformers import SpeechT5FeatureExtractor
|
|
|
|
if is_torch_available():
|
|
import torch
|
|
|
|
|
|
# Shared generator used by `floats_list` whenever the caller does not supply one.
global_rng = random.Random()


def floats_list(shape, scale=1.0, rng=None, name=None):
    """Create a nested list of random Python floats.

    Args:
        shape: Sequence whose first two entries give the number of rows and the
            number of values per row; any further entries are ignored.
        scale: Factor every sample drawn from `rng.random()` is multiplied by.
        rng: Object exposing a `random()` method. Falls back to the module-level
            `global_rng` when `None`.
        name: Unused; kept so that callers passing it keep working.

    Returns:
        A list with `shape[0]` rows, each a list of `shape[1]` floats. Values are
        drawn row by row, so a seeded `rng` yields a reproducible result.
    """
    if rng is None:
        rng = global_rng

    return [[rng.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]
|
|
|
|
|
|
@require_torch
class SpeechT5FeatureExtractionTester(unittest.TestCase):
    """Stores feature-extractor settings and builds random inputs for the tests.

    The constructor keeps every argument as an attribute of the same name.
    `prepare_feat_extract_dict` turns the extractor-related ones into keyword
    arguments, and the `prepare_inputs_for_*` helpers generate batches of
    random floats.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=1,
        padding_value=0.0,
        sampling_rate=16000,
        do_normalize=True,
        num_mel_bins=80,
        hop_length=16,
        win_length=64,
        win_function="hann_window",
        frame_signal_scale=1.0,
        fmin=80,
        fmax=7600,
        mel_floor=1e-10,
        reduction_factor=2,
        return_attention_mask=True,
    ):
        # Batch geometry.
        self.parent = parent
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # Step between consecutive sample lengths in the variable-length batches.
        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)

        # Settings forwarded to the feature extractor.
        self.feature_size = feature_size
        self.padding_value = padding_value
        self.sampling_rate = sampling_rate
        self.do_normalize = do_normalize
        self.num_mel_bins = num_mel_bins
        self.hop_length = hop_length
        self.win_length = win_length
        self.win_function = win_function
        self.frame_signal_scale = frame_signal_scale
        self.fmin = fmin
        self.fmax = fmax
        self.mel_floor = mel_floor
        self.reduction_factor = reduction_factor
        self.return_attention_mask = return_attention_mask

    def prepare_feat_extract_dict(self):
        """Return the stored extractor settings as a fresh keyword-argument dict."""
        keys = (
            "feature_size",
            "padding_value",
            "sampling_rate",
            "do_normalize",
            "num_mel_bins",
            "hop_length",
            "win_length",
            "win_function",
            "frame_signal_scale",
            "fmin",
            "fmax",
            "mel_floor",
            "reduction_factor",
            "return_attention_mask",
        )
        return {key: getattr(self, key) for key in keys}

    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        """Build a batch of random one-dimensional inputs.

        With `equal_length` every sample has `max_seq_length` values. Otherwise
        there is one sample per length in
        `range(min_seq_length, max_seq_length, seq_length_diff)`, each generated
        as a `(length, feature_size)` grid and flattened into a single list.
        With `numpify` each sample is converted with `np.asarray`.
        """
        if equal_length:
            batch = floats_list((self.batch_size, self.max_seq_length))
        else:
            lengths = range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            batch = [
                list(itertools.chain.from_iterable(floats_list((length, self.feature_size))))
                for length in lengths
            ]

        if numpify:
            batch = [np.asarray(sample) for sample in batch]

        return batch

    def prepare_inputs_for_target(self, equal_length=False, numpify=False):
        """Build a batch of random two-dimensional inputs with `num_mel_bins` columns.

        With `equal_length` there are `batch_size` samples of `max_seq_length`
        rows each. Otherwise there is one sample per row count in
        `range(min_seq_length, max_seq_length, seq_length_diff)`. With `numpify`
        each sample is converted with `np.asarray`.
        """
        if equal_length:
            row_counts = [self.max_seq_length] * self.batch_size
        else:
            row_counts = range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)

        batch = [floats_list((rows, self.num_mel_bins)) for rows in row_counts]

        if numpify:
            batch = [np.asarray(sample) for sample in batch]

        return batch
|
|
|
|
|
|
@require_torch
@require_torchaudio
class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests for `SpeechT5FeatureExtractor`.

    Covers plain inputs (`input_values` produced from positional audio) and
    inputs passed through the `audio_target` keyword. Methods whose name ends in
    `_target` build their batches with `prepare_inputs_for_target`.

    NOTE(review): `self.feat_extract_dict` is not defined in this file; it is
    presumably supplied by `SequenceFeatureExtractionTestMixin` - confirm.
    """

    feature_extraction_class = SpeechT5FeatureExtractor if is_speech_available() else None

    def setUp(self):
        self.feat_extract_tester = SpeechT5FeatureExtractionTester(self)

    def _check_zero_mean_unit_variance(self, input_vector):
        """Assert that `input_vector` has mean ~0 and variance ~1 along axis 0."""
        # Compare the magnitude of the mean: a signed comparison would let any
        # negative mean, however large, pass the check.
        self.assertTrue(np.all(np.abs(np.mean(input_vector, axis=0)) < 1e-3))
        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))

    def test_call(self):
        """List inputs and NumPy inputs must be encoded to the same values."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test not batched input
        encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values
        encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values
        encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_zero_mean_unit_variance_normalization_np(self):
        """Normalization must hold on the unpadded part for every padding mode (NumPy output)."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 1600, None]
        for max_length, padding in zip(max_lengths, paddings):
            processed = feat_extract(speech_inputs, padding=padding, max_length=max_length, return_tensors="np")
            input_values = processed.input_values

            self._check_zero_mean_unit_variance(input_values[0][:800])
            # Everything past the original length is padding and must be (close to) zero.
            self.assertTrue(np.abs(input_values[0][800:]).sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_values[1][:1000])
            # Check the padding of the second sample (index 1), whose real length is 1000.
            self.assertTrue(np.abs(input_values[1][1000:]).sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_values[2][:1200])

    def test_zero_mean_unit_variance_normalization(self):
        """Same as the `_np` variant but without requesting a tensor type."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        lengths = range(800, 1400, 200)
        speech_inputs = [floats_list((1, x))[0] for x in lengths]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 1600, None]

        for max_length, padding in zip(max_lengths, paddings):
            processed = feat_extract(speech_inputs, max_length=max_length, padding=padding)
            input_values = processed.input_values

            self._check_zero_mean_unit_variance(input_values[0][:800])
            self._check_zero_mean_unit_variance(input_values[1][:1000])
            self._check_zero_mean_unit_variance(input_values[2][:1200])

    def test_zero_mean_unit_variance_normalization_trunc_np_max_length(self):
        """Normalization must hold when truncating and padding to `max_length=1000`."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        processed = feat_extract(
            speech_inputs, truncation=True, max_length=1000, padding="max_length", return_tensors="np"
        )
        input_values = processed.input_values

        # Only the first sample (800 values) is shorter than max_length.
        self._check_zero_mean_unit_variance(input_values[0, :800])
        self._check_zero_mean_unit_variance(input_values[1])
        self._check_zero_mean_unit_variance(input_values[2])

    def test_zero_mean_unit_variance_normalization_trunc_np_longest(self):
        """Check `padding="longest"` combined with truncation for both small and large `max_length`."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        processed = feat_extract(
            speech_inputs, truncation=True, max_length=1000, padding="longest", return_tensors="np"
        )
        input_values = processed.input_values

        self._check_zero_mean_unit_variance(input_values[0, :800])
        self._check_zero_mean_unit_variance(input_values[1, :1000])
        self._check_zero_mean_unit_variance(input_values[2])

        # make sure that if max_length < longest -> then pad to max_length
        self.assertEqual(input_values.shape, (3, 1000))

        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        processed = feat_extract(
            speech_inputs, truncation=True, max_length=2000, padding="longest", return_tensors="np"
        )
        input_values = processed.input_values

        self._check_zero_mean_unit_variance(input_values[0, :800])
        self._check_zero_mean_unit_variance(input_values[1, :1000])
        self._check_zero_mean_unit_variance(input_values[2])

        # make sure that if max_length > longest -> then pad to longest
        self.assertEqual(input_values.shape, (3, 1200))

    def test_double_precision_pad(self):
        """`pad` must return float32 for float64 input, for both list and NumPy inputs."""
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        np_speech_inputs = np.random.rand(100).astype(np.float64)
        py_speech_inputs = np_speech_inputs.tolist()

        for inputs in [py_speech_inputs, np_speech_inputs]:
            np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np")
            self.assertEqual(np_processed.input_values.dtype, np.float32)
            pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt")
            self.assertEqual(pt_processed.input_values.dtype, torch.float32)

    def test_call_target(self):
        """`audio_target` inputs must yield 3-D features with `num_mel_bins` as last dimension."""
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 8000, 10000, and 12000
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test feature size
        input_values = feature_extractor(audio_target=np_speech_inputs, padding=True, return_tensors="np").input_values
        self.assertEqual(input_values.ndim, 3)
        self.assertEqual(input_values.shape[-1], feature_extractor.num_mel_bins)

        # NOTE(review): the two comparisons below pass the audio positionally, not via
        # `audio_target` - confirm whether the target path was meant to be exercised here.

        # Test not batched input
        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_values
        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_values
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_values
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_batch_feature_target(self):
        """`BatchFeature` must keep per-sample lengths and produce the expected NumPy shape."""
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name])))

        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True)
        processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np")

        batch_features_input = processed_features[input_name]

        # Add a trailing feature axis when the converted batch is only 2-D.
        if len(batch_features_input.shape) < 3:
            batch_features_input = batch_features_input[:, :, None]

        self.assertEqual(
            tuple(batch_features_input.shape),
            (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins),
        )

    @require_torch
    def test_batch_feature_target_pt(self):
        """`BatchFeature` with `tensor_type="pt"` must produce the expected shape."""
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True)
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="pt")

        batch_features_input = processed_features[input_name]

        # Add a trailing feature axis when the converted batch is only 2-D.
        if len(batch_features_input.shape) < 3:
            batch_features_input = batch_features_input[:, :, None]

        self.assertEqual(
            tuple(batch_features_input.shape),
            (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins),
        )

    @require_torch
    def test_padding_accepts_tensors_target_pt(self):
        """Padding to NumPy and to PyTorch must give (numerically) the same values."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        # presumably needed so `pad` treats each frame as `num_mel_bins` wide - TODO confirm
        feat_extract.feature_size = feat_extract.num_mel_bins  # hack!

        input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
        input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="pt")[input_name]

        self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().astype(np.float32).sum()) < 1e-2)

    def test_attention_mask_target(self):
        """The attention mask returned by `pad` must match the batch shape and the sample lengths."""
        feat_dict = self.feat_extract_dict
        feat_dict["return_attention_mask"] = True
        feat_extract = self.feature_extraction_class(**feat_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        input_lengths = [len(x) for x in speech_inputs]
        input_name = feat_extract.model_input_names[0]

        processed = BatchFeature({input_name: speech_inputs})

        # presumably needed so `pad` treats each frame as `num_mel_bins` wide - TODO confirm
        feat_extract.feature_size = feat_extract.num_mel_bins  # hack!

        processed = feat_extract.pad(processed, padding="longest", return_tensors="np")
        self.assertIn("attention_mask", processed)
        self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2]))
        # Each row of the mask must sum to the length of the corresponding sample.
        self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lengths)

    def test_attention_mask_with_truncation_target(self):
        """With truncation to the shortest sample, every mask row must be fully set."""
        feat_dict = self.feat_extract_dict
        feat_dict["return_attention_mask"] = True
        feat_extract = self.feature_extraction_class(**feat_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        input_lengths = [len(x) for x in speech_inputs]
        input_name = feat_extract.model_input_names[0]

        processed = BatchFeature({input_name: speech_inputs})
        max_length = min(input_lengths)

        # presumably needed so `pad` treats each frame as `num_mel_bins` wide - TODO confirm
        feat_extract.feature_size = feat_extract.num_mel_bins  # hack!

        processed_pad = feat_extract.pad(
            processed, padding="max_length", max_length=max_length, truncation=True, return_tensors="np"
        )
        self.assertIn("attention_mask", processed_pad)
        self.assertListEqual(
            list(processed_pad.attention_mask.shape), [processed_pad[input_name].shape[0], max_length]
        )
        self.assertListEqual(
            processed_pad.attention_mask[:, :max_length].sum(-1).tolist(), [max_length] * len(speech_inputs)
        )

    def _load_datasamples(self, num_samples):
        """Return the audio arrays of the first `num_samples` entries (sorted by id) of the dummy LibriSpeech set."""
        # Imported lazily so that `datasets` is only required by the integration tests.
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # automatic decoding with librispeech
        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

        return [x["array"] for x in speech_samples]

    def test_integration(self):
        """The first 30 values of a real sample must match the recorded reference values."""
        # fmt: off
        EXPECTED_INPUT_VALUES = torch.tensor(
            [2.3804e-03, 2.0752e-03, 1.9836e-03, 2.1057e-03, 1.6174e-03,
             3.0518e-04, 9.1553e-05, 3.3569e-04, 9.7656e-04, 1.8311e-03,
             2.0142e-03, 2.1057e-03, 1.7395e-03, 4.5776e-04, -3.9673e-04,
             4.5776e-04, 1.0071e-03, 9.1553e-05, 4.8828e-04, 1.1597e-03,
             7.3242e-04, 9.4604e-04, 1.8005e-03, 1.8311e-03, 8.8501e-04,
             4.2725e-04, 4.8828e-04, 7.3242e-04, 1.0986e-03, 2.1057e-03]
        )
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = SpeechT5FeatureExtractor()
        input_values = feature_extractor(input_speech, return_tensors="pt").input_values
        self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))

    def test_integration_target(self):
        """The first 30 values of the first `audio_target` frame must match the recorded reference values."""
        # fmt: off
        EXPECTED_INPUT_VALUES = torch.tensor(
            [-2.7713, -2.8896, -3.2619, -3.0843, -2.9919, -3.0084, -3.2796, -3.3169,
             -3.2397, -3.2053, -2.9151, -2.7921, -2.9403, -2.7411, -3.0654, -2.8314,
             -3.0026, -2.9797, -3.1314, -2.9939, -2.6748, -2.7725, -2.8563, -2.9462,
             -3.2623, -3.3044, -3.1318, -3.2672, -3.4030, -3.1988]
        )
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = SpeechT5FeatureExtractor()
        input_values = feature_extractor(audio_target=input_speech, return_tensors="pt").input_values
        self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))
|