transformers/tests/models/mctct/test_processor_mctct.py
Chan Woo Kim 119e3c0fc8
M-CTC-T Model (#16402)
* added cbs to notebooks, made copy-paste error fix in generation_utils

* initial push for mctc model

* mctc feature extractor done

* added processor, tokenizer and their tests for MCTC. Have added an MCTC modeling test, adjusting model code accordingly.

* added processor, tokenizer and their tests for MCTC. Have added an MCTC modeling test, adjusting model code accordingly.

* passing attention, now struggling to figure out how attention masks make sense here

* works when excluding attention masks. ask later how one would integrate attention maskshere

* bizarre configuration error (model prefix comes first in config dict json and messes up the order)

* all passing but bizzarre config dict ordering issue when to_dict

* passing all major tests

* feature extraction, processor, tokenizer added & tests passing

* style & consistency & other logistical fixes

* copy paste fix

* model after feature extraction working

* commiting final feature extraction results; need to fix normalization

* feature extraction passing tests; probably should add tests on the specific flashlight-copied functions?

* delete print ; format code a bit

* fixing tests

* passing major tests

* fixing styles

* completed tokenization test with real example; not sure if these values are entirely correct.

* last test fixes from local

* reverting accidentally included custom setup configs

* remove load tf weights; fix config error

* testing couldnt import featureextractor

* fix docs

* fix docs

* resolving comments

* style fixes

* style fixes

* Update to MCTCConv1dSubSampler

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* relposemb fixes

* conv1d name issue; expecting config fail with paraentheses

* fix config issue

* fix config issue

* fix config issue

* change everything to MCTCT

* fixing naming change errors

* archive list

* copyrights and docs

* copyrights and docs

* copyrights and docs

* merge resolution

* move tests, fix to changed optionaldependency structure

* test directories changed

* fixing tests

* how to avoid tf tests?

* how to avoid tf tests?

* tests passing locally

* allow mctctprocessor imported any env

* allow mctctprocessor imported any env

* fixed second round of feedback, need to fix docs

* doc changes not being applied

* all fixed

* style fix

* feedback fixes

* fix copies and feature extraction style fix

* Update tests/models/visual_bert/test_modeling_visual_bert.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* copy paste huggingface:main visual bert

* added eof newline to visual bert; all tests are passing otherwise

* fix slow tests by adding attention mask

* change model id to speechbrain

* make fix-copies

* fix readme unwanted deletes

* fixing readmes, make fix-copies

* consistent M-CTC-T naming

* Update src/transformers/models/mctct/__init__.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* all fixed but variable naming

* adjust double quotes

* fixed variable names

* copyright and mr quilter

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* correct slow tests

* make fix-copies

* Update src/transformers/models/mctct/configuration_mctct.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/mctct/configuration_mctct.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* m-ctc-t not mctct

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-06-08 00:33:07 +02:00

148 lines
5.8 KiB
Python

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
from transformers import MCTCTProcessor, is_speech_available, is_torch_available
from transformers.file_utils import FEATURE_EXTRACTOR_NAME
from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizer
from transformers.testing_utils import require_torch, require_torchaudio
if is_speech_available() and is_torch_available():
from transformers import MCTCTFeatureExtractor
from .test_feature_extraction_mctct import floats_list
@require_torch
@require_torchaudio
class MCTCTProcessorTest(unittest.TestCase):
def setUp(self):
vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.add_kwargs_tokens_map = {
"pad_token": "<pad>",
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
}
feature_extractor_map = {
"feature_size": 1,
"padding_value": 0.0,
"sampling_rate": 16000,
"return_attention_mask": False,
"do_normalize": True,
}
self.tmpdirname = tempfile.mkdtemp()
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(feature_extractor_map) + "\n")
def get_tokenizer(self, **kwargs_init):
kwargs = self.add_kwargs_tokens_map.copy()
kwargs.update(kwargs_init)
return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_feature_extractor(self, **kwargs):
return MCTCTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_default(self):
tokenizer = self.get_tokenizer()
feature_extractor = self.get_feature_extractor()
processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor.save_pretrained(self.tmpdirname)
processor = MCTCTProcessor.from_pretrained(self.tmpdirname)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor.feature_extractor, MCTCTFeatureExtractor)
def test_save_load_pretrained_additional_features(self):
processor = MCTCTProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
processor = MCTCTProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, MCTCTFeatureExtractor)
def test_feature_extractor(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
raw_speech = floats_list((3, 1000))
input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
input_processor = processor(raw_speech, return_tensors="np")
for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
input_str = "This is a test string"
with processor.as_target_processor():
encoded_processor = processor(input_str)
encoded_tok = tokenizer(input_str)
for key in encoded_tok.keys():
self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
decoded_processor = processor.batch_decode(predicted_ids)
decoded_tok = tokenizer.batch_decode(predicted_ids)
self.assertListEqual(decoded_tok, decoded_processor)