mirror of
https://github.com/huggingface/transformers.git
synced 2025-08-03 03:31:05 +06:00
add a test checking the format of convert_tokens_to_string's output (#16540)
* add new tests * add comment to overridden tests
This commit is contained in:
parent
24a85cca61
commit
be9474bd35
@ -321,3 +321,14 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# tests all ids in vocab => vocab doesn't exist so unnecessary to test
def test_conversion_reversible(self):
    # Deliberately overridden as a no-op: the common mixin's round-trip check
    # iterates the vocab, which this byte-level tokenizer does not have.
    pass
|
||||
|
||||
def test_convert_tokens_to_string_format(self):
    """Check that ``convert_tokens_to_string`` returns a plain ``str`` for ByT5."""
    # Overridden from the common tests: ByT5 can only accept one-character
    # strings and special added tokens, so build the sample token-by-token.
    byte_level_tokens = list("this is a text") + ["</s>"]
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            joined = tokenizer.convert_tokens_to_string(byte_level_tokens)
            self.assertIsInstance(joined, str)
|
||||
|
@ -286,3 +286,14 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# tests all ids in vocab => vocab doesn't exist so unnecessary to test
def test_conversion_reversible(self):
    # Deliberately overridden as a no-op: the common mixin's round-trip check
    # iterates the vocab, which this byte-level tokenizer does not have.
    pass
|
||||
|
||||
def test_convert_tokens_to_string_format(self):
    """Check that ``convert_tokens_to_string`` returns a plain ``str`` for Perceiver."""
    # Overridden from the common tests: Perceiver can only accept one-character
    # strings and special added tokens, so build the sample token-by-token.
    char_tokens = ["[CLS]", *"this is a test", "[SEP]"]
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            joined = tokenizer.convert_tokens_to_string(char_tokens)
            self.assertIsInstance(joined, str)
|
||||
|
@ -3713,6 +3713,15 @@ class TokenizerTesterMixin:
|
||||
trainer.save_model(os.path.join(tmp_dir, "checkpoint"))
|
||||
self.assertIn("tokenizer.json", os.listdir(os.path.join(tmp_dir, "checkpoint")))
|
||||
|
||||
def test_convert_tokens_to_string_format(self):
    """Common check: ``convert_tokens_to_string`` must return a plain ``str``."""
    # The sample is loop-invariant, so it is built once up front.
    sample_tokens = ["this", "is", "a", "test"]
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            decoded = tokenizer.convert_tokens_to_string(sample_tokens)
            self.assertIsInstance(decoded, str)
|
||||
|
||||
def test_save_slow_from_fast_and_reload_fast(self):
|
||||
if not self.test_slow_tokenizer or not self.test_rust_tokenizer:
|
||||
# we need both slow and fast versions
|
||||
|
@ -753,3 +753,14 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
@unittest.skip("The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.")
def test_torch_encode_plus_sent_to_model(self):
    # Overridden from the common mixin solely to attach the skip above.
    pass
|
||||
|
||||
def test_convert_tokens_to_string_format(self):
    """Check the ``"text"`` entry of ``convert_tokens_to_string``'s output is a ``str``."""
    # Overridden: the common test assumes a plain-str return value, which is
    # not the case for Wav2Vec2 — its output is indexed with "text" below.
    ctc_tokens = list("THIS|IS|A|TEXT")
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            decoded = tokenizer.convert_tokens_to_string(ctc_tokens)
            self.assertIsInstance(decoded["text"], str)
|
||||
|
@ -398,3 +398,14 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
@unittest.skip("The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.")
def test_torch_encode_plus_sent_to_model(self):
    # Overridden from the common mixin solely to attach the skip above.
    pass
|
||||
|
||||
def test_convert_tokens_to_string_format(self):
    """Check the ``"text"`` entry of ``convert_tokens_to_string``'s output is a ``str``."""
    # Overridden: the common test assumes a plain-str return value, which is
    # not the case for Wav2Vec2PhonemeCTCTokenizer — its output is indexed
    # with "text" below.
    phoneme_tokens = ["ð", "ɪ", "s", "ɪ", "z", "ɐ", "t", "ɛ", "k", "s", "t"]
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            decoded = tokenizer.convert_tokens_to_string(phoneme_tokens)
            self.assertIsInstance(decoded["text"], str)
|
||||
|
Loading…
Reference in New Issue
Block a user