transformers/tests/test_sequence_feature_extraction_common.py
Suraj Patil d26b37e744
Speech2TextTransformer (#10175)
* s2t

* fix config

* conversion script

* fix import

* add tokenizer

* fix tok init

* fix tokenizer

* first version working

* fix embeds

* fix lm head

* remove extra heads

* fix convert script

* handle encoder attn mask

* style

* better enc attn mask

* override _prepare_attention_mask_for_generation

* handle attn_maks in encoder and decoder

* input_ids => input_features

* enable use_cache

* remove old code

* expand embeddings if needed

* remove logits bias

* masked_lm_loss => loss

* hack tokenizer to support feature processing

* fix model_input_names

* style

* fix error message

* doc

* remove inputs_embeds

* remove input_embeds

* remove unnecessary docstring

* quality

* SpeechToText => Speech2Text

* style

* remove shared_embeds

* subsample => conv

* remove Speech2TextTransformerDecoderWrapper

* update output_lengths formula

* fix table

* remove max_position_embeddings

* update conversion scripts

* add possibility to do upper case for now

* add FeatureExtractor and Processor

* add tests for extractor

* require_torch_audio => require_torchaudio

* add processor test

* update import

* remove classification head

* attention mask is now 1D

* update docstrings

* attention mask should be of type long

* handle attention mask from generate

* alwyas return attention_mask

* fix test

* style

* doc

* Speech2TextTransformer => Speech2Text

* Speech2TextTransformerConfig => Speech2TextConfig

* remove dummy_inputs

* nit

* style

* multilinguial tok

* fix tokenizer

* add tgt_lang setter

* save lang_codes

* fix tokenizer

* add forced_bos_token_id to tokenizer

* apply review suggestions

* add torchaudio to extra deps

* add speech deps to CI

* fix dep

* add libsndfile to ci

* libsndfile1

* add speech to extras all

* libsndfile1 -> libsndfile1

* libsndfile

* libsndfile1-dev

* apt update

* add sudo to install

* update deps table

* install libsndfile1-dev on CI

* tuple to list

* init conv layer

* add model tests

* quality

* add integration tests

* skip_special_tokens

* add speech_to_text_transformer in toctree

* fix tokenizer

* fix fp16 tests

* add tokenizer tests

* fix copyright

* input_values => input_features

* doc

* add model in readme

* doc

* change checkpoint names

* fix copyright

* fix code example

* add max_model_input_sizes in tokenizer

* fix integration tests

* add do_lower_case to tokenizer

* remove clamp trick

* fix "Add modeling imports here"

* fix copyrights

* fix tests

* SpeechToTextTransformer => SpeechToText

* fix naming

* fix table formatting

* fix typo

* style

* fix typos

* remove speech dep from extras[testing]

* fix copies

* rename doc file,

* put imports under is_torch_available

* run feat extract tests when torch is available

* dummy objects for processor and extractor

* fix imports in tests

* fix import in modeling test

* fxi imports

* fix torch import

* fix imports again

* fix positional embeddings

* fix typo in import

* adapt new extractor refactor

* style

* fix torchscript test

* doc

* doc

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* fix docs, copied from, style

* fix docstring

* handle imports

* remove speech from all extra deps

* remove s2t from seq2seq lm mapping

* better names

* skip training tests

* add install instructions

* List => Tuple

* doc

* fix conversion script

* fix urls

* add instruction for libsndfile

* fix fp16 test

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-03-10 21:42:04 +05:30

254 lines
11 KiB
Python

# coding=utf-8
# Copyright 2021 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from transformers import BatchFeature
from transformers.testing_utils import require_tf, require_torch
from .test_feature_extraction_common import FeatureExtractionSavingTestMixin
class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin):
# to overwrite at feature extractactor specific tests
feat_extract_tester = None
feature_extraction_class = None
@property
def feat_extract_dict(self):
return self.feat_extract_tester.prepare_feat_extract_dict()
def test_feat_extract_common_properties(self):
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
self.assertTrue(hasattr(feat_extract, "feature_size"))
self.assertTrue(hasattr(feat_extract, "sampling_rate"))
self.assertTrue(hasattr(feat_extract, "padding_value"))
def test_batch_feature(self):
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs})
self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name])))
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True)
processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np")
batch_features_input = processed_features[input_name]
if len(batch_features_input.shape) < 3:
batch_features_input = batch_features_input[:, :, None]
self.assertTrue(
batch_features_input.shape
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
)
@require_torch
def test_batch_feature_pt(self):
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True)
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="pt")
batch_features_input = processed_features[input_name]
if len(batch_features_input.shape) < 3:
batch_features_input = batch_features_input[:, :, None]
self.assertTrue(
batch_features_input.shape
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
)
@require_tf
def test_batch_feature_tf(self):
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True)
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="tf")
batch_features_input = processed_features[input_name]
if len(batch_features_input.shape) < 3:
batch_features_input = batch_features_input[:, :, None]
self.assertTrue(
batch_features_input.shape
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
)
def _check_padding(self, numpify=False):
def _inputs_have_equal_length(input):
length = len(input[0])
for input_slice in input[1:]:
if len(input_slice) != length:
return False
return True
def _inputs_are_equal(input_1, input_2):
if len(input_1) != len(input_2):
return False
for input_slice_1, input_slice_2 in zip(input_1, input_2):
if not np.allclose(np.asarray(input_slice_1), np.asarray(input_slice_2), atol=1e-3):
return False
return True
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(numpify=numpify)
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs})
pad_diff = self.feat_extract_tester.seq_length_diff
pad_max_length = self.feat_extract_tester.max_seq_length + pad_diff
pad_min_length = self.feat_extract_tester.min_seq_length
batch_size = self.feat_extract_tester.batch_size
feature_size = self.feat_extract_tester.feature_size
# test padding for List[int] + numpy
input_1 = feat_extract.pad(processed_features, padding=False)[input_name]
input_2 = feat_extract.pad(processed_features, padding="longest")[input_name]
input_3 = feat_extract.pad(processed_features, padding="max_length", max_length=len(speech_inputs[-1]))[
input_name
]
input_4 = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
# max_length parameter has to be provided when setting `padding="max_length"`
with self.assertRaises(ValueError):
feat_extract.pad(processed_features, padding="max_length")[input_name]
input_5 = feat_extract.pad(
processed_features, padding="max_length", max_length=pad_max_length, return_tensors="np"
)[input_name]
self.assertFalse(_inputs_have_equal_length(input_1))
self.assertTrue(_inputs_have_equal_length(input_2))
self.assertTrue(_inputs_have_equal_length(input_3))
self.assertTrue(_inputs_are_equal(input_2, input_3))
self.assertTrue(len(input_1[0]) == pad_min_length)
self.assertTrue(len(input_1[1]) == pad_min_length + pad_diff)
self.assertTrue(input_4.shape[:2] == (batch_size, len(input_3[0])))
self.assertTrue(input_5.shape[:2] == (batch_size, pad_max_length))
if feature_size > 1:
self.assertTrue(input_4.shape[2] == input_5.shape[2] == feature_size)
# test padding for `pad_to_multiple_of` for List[int] + numpy
input_6 = feat_extract.pad(processed_features, pad_to_multiple_of=10)[input_name]
input_7 = feat_extract.pad(processed_features, padding="longest", pad_to_multiple_of=10)[input_name]
input_8 = feat_extract.pad(
processed_features, padding="max_length", pad_to_multiple_of=10, max_length=pad_max_length
)[input_name]
input_9 = feat_extract.pad(
processed_features,
padding="max_length",
pad_to_multiple_of=10,
max_length=pad_max_length,
return_tensors="np",
)[input_name]
self.assertTrue(all(len(x) % 10 == 0 for x in input_6))
self.assertTrue(_inputs_are_equal(input_6, input_7))
expected_mult_pad_length = pad_max_length if pad_max_length % 10 == 0 else (pad_max_length // 10 + 1) * 10
self.assertTrue(all(len(x) == expected_mult_pad_length for x in input_8))
self.assertTrue(input_9.shape[:2], (batch_size, expected_mult_pad_length))
if feature_size > 1:
self.assertTrue(input_9.shape[2] == feature_size)
# Check padding value is correct
padding_vector_sum = (np.ones(self.feat_extract_tester.feature_size) * feat_extract.padding_value).sum()
self.assertTrue(
abs(np.asarray(input_2[0])[pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length))
< 1e-3
)
self.assertTrue(
abs(
np.asarray(input_2[1])[pad_min_length + pad_diff :].sum()
- padding_vector_sum * (pad_max_length - pad_min_length - pad_diff)
)
< 1e-3
)
self.assertTrue(
abs(
np.asarray(input_2[2])[pad_min_length + 2 * pad_diff :].sum()
- padding_vector_sum * (pad_max_length - pad_min_length - 2 * pad_diff)
)
< 1e-3
)
self.assertTrue(
abs(input_5[0, pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length)) < 1e-3
)
self.assertTrue(
abs(input_9[0, pad_min_length:].sum() - padding_vector_sum * (expected_mult_pad_length - pad_min_length))
< 1e-3
)
def test_padding_from_list(self):
self._check_padding(numpify=False)
def test_padding_from_array(self):
self._check_padding(numpify=True)
@require_torch
def test_padding_accepts_tensors_pt(self):
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs})
input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="pt")[input_name]
self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().sum()) < 1e-2)
@require_tf
def test_padding_accepts_tensors_tf(self):
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
input_name = feat_extract.model_input_names[0]
processed_features = BatchFeature({input_name: speech_inputs})
input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
input_tf = feat_extract.pad(processed_features, padding="longest", return_tensors="tf")[input_name]
self.assertTrue(abs(input_np.astype(np.float32).sum() - input_tf.numpy().sum()) < 1e-2)
def test_attention_mask(self):
feat_dict = self.feat_extract_dict
feat_dict["return_attention_mask"] = True
feat_extract = self.feature_extraction_class(**feat_dict)
speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
input_lenghts = [len(x) for x in speech_inputs]
input_name = feat_extract.model_input_names[0]
processed = BatchFeature({input_name: speech_inputs})
processed = feat_extract.pad(processed, padding="longest", return_tensors="np")
self.assertIn("attention_mask", processed)
self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2]))
self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lenghts)